[v2,09/18] x86/resctrl: Allow resctrl_arch_rmid_read() to sleep

Message ID 20230113175459.14825-10-james.morse@arm.com
State New
Series x86/resctrl: monitored closid+rmid together, separate arch/fs locking

Commit Message

James Morse Jan. 13, 2023, 5:54 p.m. UTC
  MPAM's cache occupancy counters can take a little while to settle once
the monitor has been configured. The maximum settling time is described
to the driver via a firmware table. The value could be large enough
that it makes sense to sleep.

To avoid exposing this to resctrl, it should be hidden behind MPAM's
resctrl_arch_rmid_read(). But add_rmid_to_limbo() calls
resctrl_arch_rmid_read() from a non-preemptible context.

add_rmid_to_limbo() is opportunistically reading the L3 occupancy counter
on this domain to avoid adding the RMID to limbo if this domain's value
has drifted below resctrl_rmid_realloc_threshold since the limbo handler
last ran. Determining 'this domain' involves disabling preemption to
prevent the thread being migrated to CPUs in a different domain between
the check and resctrl_arch_rmid_read() call. The check is skipped
for all remote domains.

Instead, call resctrl_arch_rmid_read() for each domain, and get it to
read the arch-specific counter via IPI if it's called on a CPU outside
the target domain. By covering remote domains, this change stops the
limbo handler from being started unnecessarily.

This also allows resctrl_arch_rmid_read() to sleep.

Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
The alternative is to remove the counter read from this path altogether,
and assume user-space would never try to re-allocate the last RMID before
the limbo handler runs next.
---
 arch/x86/kernel/cpu/resctrl/monitor.c | 58 ++++++++++++++++++---------
 1 file changed, 38 insertions(+), 20 deletions(-)
  

Comments

Peter Newman Jan. 23, 2023, 1:54 p.m. UTC | #1
Hi James,

On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index d309b830aeb2..d6ae4b713801 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -206,17 +206,19 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
>         return chunks >> shift;
>  }
>
> -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
> -                          u32 closid, u32 rmid, enum resctrl_event_id eventid,
> -                          u64 *val)
> +struct __rmid_read_arg
>  {
> -       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> -       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
> -       struct arch_mbm_state *am;
> -       u64 msr_val, chunks;
> +       u32 rmid;
> +       enum resctrl_event_id eventid;
>
> -       if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
> -               return -EINVAL;
> +       u64 msr_val;
> +};
> +
> +static void __rmid_read(void *arg)
> +{
> +       enum resctrl_event_id eventid = ((struct __rmid_read_arg *)arg)->eventid;
> +       u32 rmid = ((struct __rmid_read_arg *)arg)->rmid;
> +       u64 msr_val;
>
>         /*
>          * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
> @@ -229,6 +231,28 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>         wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>         rdmsrl(MSR_IA32_QM_CTR, msr_val);
>
> +       ((struct __rmid_read_arg *)arg)->msr_val = msr_val;
> +}
> +
> +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
> +                          u32 closid, u32 rmid, enum resctrl_event_id eventid,
> +                          u64 *val)
> +{
> +       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
> +       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
> +       struct __rmid_read_arg arg;
> +       struct arch_mbm_state *am;
> +       u64 msr_val, chunks;
> +       int err;
> +
> +       arg.rmid = rmid;
> +       arg.eventid = eventid;
> +
> +       err = smp_call_function_any(&d->cpu_mask, __rmid_read, &arg, true);
> +       if (err)
> +               return err;
> +
> +       msr_val = arg.msr_val;

These changes now conflict with the tree after v6.2-rc4 due to my recent
changes in resctrl_arch_rmid_read(), which include my own
reintroduction of __rmid_read():

https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=2a81160d29d65b5876ab3f824fda99ae0219f05e

Fortunately it looks like our respective versions of __rmid_read()
aren't too much different from the original, but __rmid_read() does
have a new call site in resctrl_arch_reset_rmid() to record initial
event counts.
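
For reference, that new call site looks roughly like this (paraphrased from
memory of the tip commit above rather than copied, so details may differ):

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
			     u32 rmid, enum resctrl_event_id eventid)
{
	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
	struct arch_mbm_state *am;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		/* Record any initial, non-zero count value. */
		__rmid_read(rmid, eventid, &am->prev_msr);
	}
}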

-Peter
  
Peter Newman Jan. 23, 2023, 3:33 p.m. UTC | #2
On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
> MPAM's cache occupancy counters can take a little while to settle once
> the monitor has been configured. The maximum settling time is described
> to the driver via a firmware table. The value could be large enough
> that it makes sense to sleep.

Would it be easier to return an error when reading the occupancy count
too soon after configuration? On Intel it is already normal for counter
reads to fail on newly-allocated RMIDs.

-Peter
  
James Morse March 6, 2023, 11:33 a.m. UTC | #3
Hi Peter,

On 23/01/2023 13:54, Peter Newman wrote:
> On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
>> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
>> index d309b830aeb2..d6ae4b713801 100644
>> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
>> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
>> @@ -206,17 +206,19 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
>>         return chunks >> shift;
>>  }
>>
>> -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>> -                          u32 closid, u32 rmid, enum resctrl_event_id eventid,
>> -                          u64 *val)
>> +struct __rmid_read_arg
>>  {
>> -       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>> -       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>> -       struct arch_mbm_state *am;
>> -       u64 msr_val, chunks;
>> +       u32 rmid;
>> +       enum resctrl_event_id eventid;
>>
>> -       if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
>> -               return -EINVAL;
>> +       u64 msr_val;
>> +};
>> +
>> +static void __rmid_read(void *arg)
>> +{
>> +       enum resctrl_event_id eventid = ((struct __rmid_read_arg *)arg)->eventid;
>> +       u32 rmid = ((struct __rmid_read_arg *)arg)->rmid;
>> +       u64 msr_val;
>>
>>         /*
>>          * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
>> @@ -229,6 +231,28 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>>         wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
>>         rdmsrl(MSR_IA32_QM_CTR, msr_val);
>>
>> +       ((struct __rmid_read_arg *)arg)->msr_val = msr_val;
>> +}
>> +
>> +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>> +                          u32 closid, u32 rmid, enum resctrl_event_id eventid,
>> +                          u64 *val)
>> +{
>> +       struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
>> +       struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
>> +       struct __rmid_read_arg arg;
>> +       struct arch_mbm_state *am;
>> +       u64 msr_val, chunks;
>> +       int err;
>> +
>> +       arg.rmid = rmid;
>> +       arg.eventid = eventid;
>> +
>> +       err = smp_call_function_any(&d->cpu_mask, __rmid_read, &arg, true);
>> +       if (err)
>> +               return err;
>> +
>> +       msr_val = arg.msr_val;
> 
> These changes are conflicting now after v6.2-rc4 due to my recent
> changes in resctrl_arch_rmid_read(), which include my own
> reintroduction of __rmid_read():
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=2a81160d29d65b5876ab3f824fda99ae0219f05e
> 
> Fortunately it looks like our respective versions of __rmid_read()
> aren't too much different from the original, but __rmid_read() does
> have a new call site in resctrl_arch_reset_rmid() to record initial
> event counts.

Yup, this is the normal headache when rebasing over other changes.
Thanks for fixing that thing - I thought the 'first' behaviour in the filesystem code
covered it, but clearly it doesn't.


Thanks,

James
  
James Morse March 6, 2023, 11:33 a.m. UTC | #4
Hi Peter,

On 23/01/2023 15:33, Peter Newman wrote:
> On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
>> MPAM's cache occupancy counters can take a little while to settle once
>> the monitor has been configured. The maximum settling time is described
>> to the driver via a firmware table. The value could be large enough
>> that it makes sense to sleep.
> 
> Would it be easier to return an error when reading the occupancy count
> too soon after configuration? On Intel it is already normal for counter
> reads to fail on newly-allocated RMIDs.

For x86, you have as many counters as there are RMIDs, so there is no issue just accessing
the counter.

With MPAM there may be as few as 1 monitor for the CSU (cache storage utilisation)
counter, which needs to be multiplexed between different PARTID to find the cache
occupancy. (This works for CSU because it's a stable count; it doesn't work for the
bandwidth monitors.)
On such a platform the monitor needs to be allocated and programmed before it reads a
value for a particular PARTID/CLOSID. If you had two threads trying to read the same
counter, they could interleave perfectly to prevent either thread managing to read a value.
The 'not ready' time is advertised in a firmware table, and the driver will wait at most
that long before giving up and returning an error.

Clearly 1 monitor is a corner case, and I hope no-one ever builds that. But if there are
fewer monitors than there are PARTID*PMG you get the same problem (you just need more
threads reading the counters).
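
To illustrate, a multiplexed CSU read ends up looking something like the sketch
below, which is why resctrl_arch_rmid_read() wants to be able to sleep. This is
only a sketch: csu_mon_alloc()/csu_mon_program()/csu_mon_read()/csu_mon_free()
are hypothetical helpers standing in for the real MPAM driver code.

static int csu_read_occupancy(u16 partid, u8 pmg, u32 max_nrdy_ms, u64 *val)
{
	unsigned long timeout;
	struct csu_mon *mon;

	mon = csu_mon_alloc();			/* may be the only monitor */
	if (!mon)
		return -EBUSY;

	csu_mon_program(mon, partid, pmg);	/* retarget it at this PARTID+PMG */

	/* Poll until NRDY clears, up to the firmware-advertised settling time */
	timeout = jiffies + msecs_to_jiffies(max_nrdy_ms);
	while (!csu_mon_read(mon, val)) {	/* returns false while NRDY is set */
		if (time_after(jiffies, timeout)) {
			csu_mon_free(mon);
			return -ETIMEDOUT;	/* give up, as described above */
		}
		msleep(1);			/* hence the need to sleep */
	}

	csu_mon_free(mon);
	return 0;
}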


Thanks,

James
  
Peter Newman March 6, 2023, 1:14 p.m. UTC | #5
Hi James,

On Mon, Mar 6, 2023 at 12:34 PM James Morse <james.morse@arm.com> wrote:
> On 23/01/2023 15:33, Peter Newman wrote:
> > On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
> >> MPAM's cache occupancy counters can take a little while to settle once
> >> the monitor has been configured. The maximum settling time is described
> >> to the driver via a firmware table. The value could be large enough
> >> that it makes sense to sleep.
> >
> > Would it be easier to return an error when reading the occupancy count
> > too soon after configuration? On Intel it is already normal for counter
> > reads to fail on newly-allocated RMIDs.
>
> For x86, you have as many counters as there are RMIDs, so there is no issue just accessing
> the counter.

I should have said AMD instead of Intel, because their implementations
have far fewer counters than RMIDs.

>
> With MPAM there may be as few as 1 monitor for the CSU (cache storage utilisation)
> counter, which needs to be multiplexed between different PARTID to find the cache
> occupancy (This works for CSU because its a stable count, it doesn't work for the
> bandwidth monitors)
> On such a platform the monitor needs to be allocated and programmed before it reads a
> value for a particular PARTID/CLOSID. If you had two threads trying to read the same
> counter, they could interleave perfectly to prevent either thread managing to read a value.
> The 'not ready' time is advertised in a firmware table, and the driver will wait at most
> that long before giving up and returning an error.

Likewise, on AMD, a repeating sequence of tasks which are LRU in terms
of counter -> RMID allocation could prevent RMID event reads from ever
returning a value.

The main difference I see with MPAM is that software allocates the
counters instead of hardware, but the overall behavior sounds the same.

The part I object to is introducing the wait to the counter read because
existing software already expects an immediate error when reading a
counter too soon. To produce accurate data, these readings are usually
read at intervals of multiple seconds.

Instead, when configuring a counter, could you use the firmware table
value to compute the time when the counter will next be valid and return
errors on read requests received before that?
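
Something like the following, just a sketch of the idea with made-up names,
not referring to any existing code:

struct mon_state {
	unsigned long valid_after;	/* jiffies when NRDY is guaranteed clear */
};

static void mon_configure(struct mon_state *s, u32 settle_ms)
{
	/* ... program the monitor for the new PARTID+PMG ... */
	s->valid_after = jiffies + msecs_to_jiffies(settle_ms);
}

static int mon_try_read(struct mon_state *s, u64 *val)
{
	if (time_before(jiffies, s->valid_after))
		return -EAGAIN;		/* report 'Unavailable' instead of sleeping */

	/* ... read the counter ... */
	return 0;
}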

-Peter
  
James Morse March 8, 2023, 5:45 p.m. UTC | #6
Hi Peter,

On 06/03/2023 13:14, Peter Newman wrote:
> On Mon, Mar 6, 2023 at 12:34 PM James Morse <james.morse@arm.com> wrote:
>> On 23/01/2023 15:33, Peter Newman wrote:
>>> On Fri, Jan 13, 2023 at 6:56 PM James Morse <james.morse@arm.com> wrote:
>>>> MPAM's cache occupancy counters can take a little while to settle once
>>>> the monitor has been configured. The maximum settling time is described
>>>> to the driver via a firmware table. The value could be large enough
>>>> that it makes sense to sleep.
>>>
>>> Would it be easier to return an error when reading the occupancy count
>>> too soon after configuration? On Intel it is already normal for counter
>>> reads to fail on newly-allocated RMIDs.
>>
>> For x86, you have as many counters as there are RMIDs, so there is no issue just accessing
>> the counter.
> 
> I should have said AMD instead of Intel, because their implementations
> have far fewer counters than RMIDs.

Right, I'd assumed Intel and AMD behaved in the same way here.


>> With MPAM there may be as few as 1 monitor for the CSU (cache storage utilisation)
>> counter, which needs to be multiplexed between different PARTID to find the cache
>> occupancy (This works for CSU because its a stable count, it doesn't work for the
>> bandwidth monitors)
>> On such a platform the monitor needs to be allocated and programmed before it reads a
>> value for a particular PARTID/CLOSID. If you had two threads trying to read the same
>> counter, they could interleave perfectly to prevent either thread managing to read a value.
>> The 'not ready' time is advertised in a firmware table, and the driver will wait at most
>> that long before giving up and returning an error.

> Likewise, on AMD, a repeating sequence of tasks which are LRU in terms
> of counter -> RMID allocation could prevent RMID event reads from ever
> returning a value.

Isn't that a terrible user-space interface? "If someone else is reading a similar file,
neither of you makes progress".


> The main difference I see with MPAM is that software allocates the
> counters instead of hardware, but the overall behavior sounds the same.
> 
> The part I object to is introducing the wait to the counter read because
> existing software already expects an immediate error when reading a
> counter too soon. To produce accurate data, these readings are usually
> read at intervals of multiple seconds.


> Instead, when configuring a counter, could you use the firmware table
> value to compute the time when the counter will next be valid and return
> errors on read requests received before that?

The monitor might get re-allocated, re-programmed and become valid for a different
PARTID+PMG in the meantime. I don't think these things should remain allocated over a
return to user-space. Without doing that I don't see how we can return-early and make
progress.

How long should a CSU monitor remain allocated to a PARTID+PMG? Currently it's only for the
duration of the read() syscall on the file.


The problem with MPAM is too much of it is optional. This particular behaviour is only
valid for CSU monitors (llc_occupancy), and then only if your hardware designers didn't
have a value to hand when the monitor is programmed, and the hardware needs to scan the cache to
come up with a result. The retry is only triggered if the hardware sets NRDY.
This is also only necessary if there aren't enough monitors for every RMID/(PARTID*PMG) to
have its own. If there were enough, the monitors could be allocated and programmed at
startup, and the whole thing becomes cheaper to access.

If a hardware platform needs time to do this, it has to come from somewhere. I don't think
maintaining an epoch-based list of which monitor secretly belongs to a PARTID+PMG in the
hope user-space reads the file again 'quickly enough' is going to be maintainable.

If returning errors early is an important use-case, I can suggest ensuring the MPAM driver
allocates CSU monitors up-front if there are enough (today it only does this for MBWU
monitors). We then have to hope that folk who care about this also build hardware
platforms with enough monitors.


Thanks,

James
  
Peter Newman March 9, 2023, 1:41 p.m. UTC | #7
Hi James,

On Wed, Mar 8, 2023 at 6:45 PM James Morse <james.morse@arm.com> wrote:
> On 06/03/2023 13:14, Peter Newman wrote:
> > On Mon, Mar 6, 2023 at 12:34 PM James Morse <james.morse@arm.com> wrote:
>
> > Instead, when configuring a counter, could you use the firmware table
> > value to compute the time when the counter will next be valid and return
> > errors on read requests received before that?
>
> The monitor might get re-allocated, re-programmed and become valid for a different
> PARTID+PMG in the mean time. I don't think these things should remain allocated over a
> return to user-space. Without doing that I don't see how we can return-early and make
> progress.
>
> How long should a CSU monitor remain allocated to a PARTID+PMG? Currently its only for the
> duration of the read() syscall on the file.
>
>
> The problem with MPAM is too much of it is optional. This particular behaviour is only
> valid for CSU monitors, (llc_occupancy), and then, only if your hardware designers didn't
> have a value to hand when the monitor is programmed, and need to do a scan of the cache to
> come up with a result. The retry is only triggered if the hardware sets NRDY.
> This is also only necessary if there aren't enough monitors for every RMID/(PARTID*PMG) to
> have its own. If there were enough, the monitors could be allocated and programmed at
> startup, and the whole thing becomes cheaper to access.
>
> If a hardware platform needs time to do this, it has to come from somewhere. I don't think
> maintaining an epoch based list of which monitor secretly belongs to a PARTID+PMG in the
> hope user-space reads the file again 'quickly enough' is going to be maintainable.
>
> If returning errors early is an important use-case, I can suggest ensuring the MPAM driver
> allocates CSU monitors up-front if there are enough (today it only does this for MBWU
> monitors). We then have to hope that folk who care about this also build hardware
> platforms with enough monitors.

Thanks, this makes more sense now. Since CSU data isn't cumulative, I
see how synchronously collecting a snapshot is useful in this situation.
I was more concerned about understanding the need for the new behavior
than getting errors back quickly.

However, I do want to be sure that MBWU counters will never be silently
deallocated because we will never be able to trust the data unless we
know that the counter has been watching the group's tasks for the
entirety of the measurement window.

Unlike on AMD, MPAM allows software to control which PARTID+PMG the
monitoring hardware is watching. Could we instead make the user
explicitly request the mbm_{total,local}_bytes events be allocated to
monitoring groups after creating them? Or even just allocating the
events on monitoring group creation only when they're available could
also be marginally usable if a single user agent is managing rdtgroups.

Thanks!
-Peter
  
James Morse March 9, 2023, 5:35 p.m. UTC | #8
Hi Peter,

On 09/03/2023 13:41, Peter Newman wrote:
> On Wed, Mar 8, 2023 at 6:45 PM James Morse <james.morse@arm.com> wrote:
>> On 06/03/2023 13:14, Peter Newman wrote:
>>> On Mon, Mar 6, 2023 at 12:34 PM James Morse <james.morse@arm.com> wrote:
>>
>>> Instead, when configuring a counter, could you use the firmware table
>>> value to compute the time when the counter will next be valid and return
>>> errors on read requests received before that?
>>
>> The monitor might get re-allocated, re-programmed and become valid for a different
>> PARTID+PMG in the mean time. I don't think these things should remain allocated over a
>> return to user-space. Without doing that I don't see how we can return-early and make
>> progress.
>>
>> How long should a CSU monitor remain allocated to a PARTID+PMG? Currently its only for the
>> duration of the read() syscall on the file.
>>
>>
>> The problem with MPAM is too much of it is optional. This particular behaviour is only
>> valid for CSU monitors, (llc_occupancy), and then, only if your hardware designers didn't
>> have a value to hand when the monitor is programmed, and need to do a scan of the cache to
>> come up with a result. The retry is only triggered if the hardware sets NRDY.
>> This is also only necessary if there aren't enough monitors for every RMID/(PARTID*PMG) to
>> have its own. If there were enough, the monitors could be allocated and programmed at
>> startup, and the whole thing becomes cheaper to access.
>>
>> If a hardware platform needs time to do this, it has to come from somewhere. I don't think
>> maintaining an epoch based list of which monitor secretly belongs to a PARTID+PMG in the
>> hope user-space reads the file again 'quickly enough' is going to be maintainable.
>>
>> If returning errors early is an important use-case, I can suggest ensuring the MPAM driver
>> allocates CSU monitors up-front if there are enough (today it only does this for MBWU
>> monitors). We then have to hope that folk who care about this also build hardware
>> platforms with enough monitors.
> 
> Thanks, this makes more sense now. Since CSU data isn't cumulative, I
> see how synchronously collecting a snapshot is useful in this situation.
> I was more concerned about understanding the need for the new behavior
> than getting errors back quickly.
> 
> However, I do want to be sure that MBWU counters will never be silently
> deallocated because we will never be able to trust the data unless we
> know that the counter has been watching the group's tasks for the
> entirety of the measurement window.

Absolutely.

The MPAM driver requires the number of monitors to match the value of
resctrl_arch_system_num_rmid_idx(), otherwise 'mbm_local' won't be offered via resctrl.
(see class_has_usable_mbwu() in [0])

If the files exist in resctrl, then a monitor was reserved for this PARTID+PMG, and won't
get allocated for anything else.


[0]
https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/commit/?h=mpam/snapshot/v6.2&id=f28d3fefdcf7022a49f62752acbecf180ea7d32f
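
The gist of that check, approximated from the description here rather than
quoted from the branch (the field name is illustrative):

static bool class_has_usable_mbwu(struct mpam_class *class)
{
	/*
	 * Only offer the MBM events via resctrl if a bandwidth monitor can
	 * be permanently reserved for every PARTID+PMG (RMID index), so a
	 * group's counter is never silently reallocated.
	 */
	return class->num_mbwu_mon >= resctrl_arch_system_num_rmid_idx();
}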


> Unlike on AMD, MPAM allows software to control which PARTID+PMG the
> monitoring hardware is watching. Could we instead make the user
> explicitly request the mbm_{total,local}_bytes events be allocated to
> monitoring groups after creating them? Or even just allocating the
> events on monitoring group creation only when they're available could
> also be marginably usable if a single user agent is managing rdtgroups.

Hmmmm, what would that look like to user-space?

I'm against inventing anything new here until there is feature-parity where possible
upstream. It's a walk, then run kind of thing.

I worry that extra steps to set up the monitoring on MPAM:resctrl will be missing or broken
in many (all?) software projects if they're not also required on Intel:resctrl.

My plan for hardware with insufficient counters is to make the counters accessible via
perf, and do that in a way that works on Intel and AMD too.


Thanks,

James
  
Peter Newman March 10, 2023, 9:28 a.m. UTC | #9
Hi James,

On Thu, Mar 9, 2023 at 6:35 PM James Morse <james.morse@arm.com> wrote:
> On 09/03/2023 13:41, Peter Newman wrote:
> > However, I do want to be sure that MBWU counters will never be silently
> > deallocated because we will never be able to trust the data unless we
> > know that the counter has been watching the group's tasks for the
> > entirety of the measurement window.
>
> Absolutely.
>
> The MPAM driver requires the number of monitors to match the value of
> resctrl_arch_system_num_rmid_idx(), otherwise 'mbm_local' won't be offered via resctrl.
> (see class_has_usable_mbwu() in [0])
>
> If the files exist in resctrl, then a monitor was reserved for this PARTID+PMG, and won't
> get allocated for anything else.
>
>
> [0]
> https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/commit/?h=mpam/snapshot/v6.2&id=f28d3fefdcf7022a49f62752acbecf180ea7d32f
>
>
> > Unlike on AMD, MPAM allows software to control which PARTID+PMG the
> > monitoring hardware is watching. Could we instead make the user
> > explicitly request the mbm_{total,local}_bytes events be allocated to
> > monitoring groups after creating them? Or even just allocating the
> > events on monitoring group creation only when they're available could
> > also be marginably usable if a single user agent is managing rdtgroups.
>
> Hmmmm, what would that look like to user-space?
>
> I'm against inventing anything new here until there is feature-parity where possible
> upstream. It's a walk, then run kind of thing.
>
> I worry that extra steps to setup the monitoring on MPAM:resctrl will be missing or broken
> in many (all?) software projects if they're not also required on Intel:resctrl.
>
> My plan for hardware with insufficient counters is to make the counters accessible via
> perf, and do that in a way that works on Intel and AMD too.

In the interest of enabling MPAM functionality, I think the low-effort
approach is to only allocate an MBWU monitor to a newly-created MON or
CTRL_MON group if one is available. On Intel and AMD, the resources are
simply always available.

The downside on monitor-poor (or PARTID-rich) hardware is the user gets
maximually-featureful monitoring groups first, whether they want them or
not, but I think it's workable. Perhaps in a later change we can make an
interface to prevent monitors from being allocated to new groups or one
to release them when they're not needed after group creation.

At least in this approach there's still a way to use MBWU with resctrl
when systems have more PARTIDs than monitors.

This also seems like less work than making resctrl able to interface
with the perf subsystem.

-Peter
  
James Morse March 20, 2023, 5:12 p.m. UTC | #10
Hi Peter,

On 10/03/2023 09:28, Peter Newman wrote:
> On Thu, Mar 9, 2023 at 6:35 PM James Morse <james.morse@arm.com> wrote:
>> On 09/03/2023 13:41, Peter Newman wrote:
>>> However, I do want to be sure that MBWU counters will never be silently
>>> deallocated because we will never be able to trust the data unless we
>>> know that the counter has been watching the group's tasks for the
>>> entirety of the measurement window.
>>
>> Absolutely.
>>
>> The MPAM driver requires the number of monitors to match the value of
>> resctrl_arch_system_num_rmid_idx(), otherwise 'mbm_local' won't be offered via resctrl.
>> (see class_has_usable_mbwu() in [0])
>>
>> If the files exist in resctrl, then a monitor was reserved for this PARTID+PMG, and won't
>> get allocated for anything else.
>>
>>
>> [0]
>> https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/commit/?h=mpam/snapshot/v6.2&id=f28d3fefdcf7022a49f62752acbecf180ea7d32f
>>
>>
>>> Unlike on AMD, MPAM allows software to control which PARTID+PMG the
>>> monitoring hardware is watching. Could we instead make the user
>>> explicitly request the mbm_{total,local}_bytes events be allocated to
>>> monitoring groups after creating them? Or even just allocating the
>>> events on monitoring group creation only when they're available could
>>> also be marginably usable if a single user agent is managing rdtgroups.
>>
>> Hmmmm, what would that look like to user-space?
>>
>> I'm against inventing anything new here until there is feature-parity where possible
>> upstream. It's a walk, then run kind of thing.
>>
>> I worry that extra steps to setup the monitoring on MPAM:resctrl will be missing or broken
>> in many (all?) software projects if they're not also required on Intel:resctrl.
>>
>> My plan for hardware with insufficient counters is to make the counters accessible via
>> perf, and do that in a way that works on Intel and AMD too.

> In the interest of enabling MPAM functionality, I think the low-effort
> approach is to only allocate an MBWU monitor to a newly-created MON or
> CTRL_MON group if one is available. On Intel and AMD, the resources are
> simply always available.

I agree it's low-effort, but I think the result is not worth having.

What does user-space get when it reads 'mbm_total_bytes'? Returning an error here sucks.
How is user-space supposed to identify the groups it wants to monitor, and those it
doesn't care about?

Taking "the only way to win is not to play" means the MPAM driver will only offer those
'mbm_total_bytes' files if they are going to work in the same way they do today. (as you
said, on Intel and AMD the resources are simply always available).

I agree those files have always been able to return errors - but I've never managed to
make the Intel system I have do it... so I bet user-space doesn't expect errors here.
(let alone persistent errors)


My fix for those systems that don't have enough monitors is to expose the counters via
perf, which lets user-space say which groups it wants to monitor (and when!). To make it
easier to use I've done it so the perf stuff works on Intel and AMD too.


> The downside on monitor-poor (or PARTID-rich) hardware is the user gets
> maximually-featureful monitoring groups first, whether they want them or
> not, but I think it's workable.

Really? Depending on the order groups are created in is a terrible user-space interface!
If someone needing to be monitored comes along later, you have to delete the existing
groups ... wait for limbo to do its thing ... then re-create them in some new order.


> Perhaps in a later change we can make an
> interface to prevent monitors from being allocated to new groups or one
> to release them when they're not needed after group creation.
> 
> At least in this approach there's still a way to use MBWU with resctrl
> when systems have more PARTIDs than monitors.
> 
> This also seems like less work than making resctrl able to interface
> with the perf subsystem.

Not correctly supporting resctrl (the mbm_total_bytes files exist, but persistently return an
error) means some user-space users of this thing are broken on those systems.
Adding extra knobs to indicate when the underlying monitor hardware needs to be allocated
is in practice going to be missing if it's only needed for MPAM, and is most likely to
bit-rot as codebases that do use it don't regularly test it.


This patch to allow resctrl_arch_rmid_read() to sleep is about MPAM's CSU NRDY and the
high likelihood that folk build systems where MSCs are sliced up and private to something
smaller than the resctrl:domain. Without the perf support, this would still be necessary.

The changes needed for perf support are to make resctrl_arch_rmid_read() re-entrant, and
for the domain list to be protected by RCU. Neither of these are as onerous as changes to
the user-space interface, and the associated risk of breaking programs that work on other
platforms.


Thanks,

James
  
Peter Newman March 22, 2023, 1:21 p.m. UTC | #11
Hi James,

On Mon, Mar 20, 2023 at 6:12 PM James Morse <james.morse@arm.com> wrote:
> On 10/03/2023 09:28, Peter Newman wrote:
> > In the interest of enabling MPAM functionality, I think the low-effort
> > approach is to only allocate an MBWU monitor to a newly-created MON or
> > CTRL_MON group if one is available. On Intel and AMD, the resources are
> > simply always available.
>
> I agree its low-effort, but I think the result is not worth having.
>
> What does user-space get when it reads 'mbm_total_bytes'? Returning an error here sucks.
> How is user-space supposed to identify the groups it wants to monitor, and those it
> doesn't care about?
>
> Taking "the only way to win is not to play" means the MPAM driver will only offer those
> 'mbm_total_bytes' files if they are going to work in the same way they do today. (as you
> said, on Intel and AMD the resources are simply always available).

I told you that only Intel so far has resources for all RMIDs. AMD
implementations allocate MBW monitors on demand, even reallocating ones
that are actively in use.

> I agree those files have always been able to return errors - but I've never managed to
> make the Intel system I have do it... so I bet user-space doesn't expect errors here.
> (let alone persistent errors)

Find some AMD hardware. It's very easy to get persistent errors due to
no counters being allocated for an RMID:

(this is an 'AMD Ryzen Threadripper PRO 3995WX 64-Cores')

# cd /sys/fs/resctrl/mon_groups
# mkdir test
# cat test/mon_data/*/mbm_total_bytes
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
# cat test/mon_data/*/mbm_total_bytes
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable
Unavailable


> This patch to allow resctrl_arch_rmid_read() to sleep is about MPAM's CSU NRDY and the
> high likelyhood that folk build systems where MSCs are sliced up and private to something
> smaller than the resctrl:domain. Without the perf support, this would still be necessary.

I was worried about the blocking more when I thought you were doing it
for MBWU monitoring. Serializing access to limited CSU monitors makes
more sense.

> The changes needed for perf support are to make resctrl_arch_rmid_read() re-entrant, and
> for the domain list to be protected by RCU. Neither of these are as onerous as changes to
> the user-space interface, and the associated risk of breaking programs that work on other
> platforms.

I went ahead and tried to rebase my reliable-MBM-on-AMD changes onto
your series and they seemed to work with less difficulty than I was
expecting, so I'll try to stop worrying about the churn of this series
now.

-Peter
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index d309b830aeb2..d6ae4b713801 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -206,17 +206,19 @@  static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
 	return chunks >> shift;
 }
 
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
-			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
-			   u64 *val)
+struct __rmid_read_arg
 {
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
-	struct arch_mbm_state *am;
-	u64 msr_val, chunks;
+	u32 rmid;
+	enum resctrl_event_id eventid;
 
-	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
-		return -EINVAL;
+	u64 msr_val;
+};
+
+static void __rmid_read(void *arg)
+{
+	enum resctrl_event_id eventid = ((struct __rmid_read_arg *)arg)->eventid;
+	u32 rmid = ((struct __rmid_read_arg *)arg)->rmid;
+	u64 msr_val;
 
 	/*
 	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -229,6 +231,28 @@  int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
 	rdmsrl(MSR_IA32_QM_CTR, msr_val);
 
+	((struct __rmid_read_arg *)arg)->msr_val = msr_val;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+			   u32 closid, u32 rmid, enum resctrl_event_id eventid,
+			   u64 *val)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	struct __rmid_read_arg arg;
+	struct arch_mbm_state *am;
+	u64 msr_val, chunks;
+	int err;
+
+	arg.rmid = rmid;
+	arg.eventid = eventid;
+
+	err = smp_call_function_any(&d->cpu_mask, __rmid_read, &arg, true);
+	if (err)
+		return err;
+
+	msr_val = arg.msr_val;
 	if (msr_val & RMID_VAL_ERROR)
 		return -EIO;
 	if (msr_val & RMID_VAL_UNAVAIL)
@@ -383,23 +407,18 @@  static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
 	struct rdt_domain *d;
-	int cpu, err;
 	u64 val = 0;
 	u32 idx;
+	int err;
 
 	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
 
 	entry->busy = 0;
-	cpu = get_cpu();
 	list_for_each_entry(d, &r->domains, list) {
-		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
-			err = resctrl_arch_rmid_read(r, d, entry->closid,
-						     entry->rmid,
-						     QOS_L3_OCCUP_EVENT_ID,
-						     &val);
-			if (err || val <= resctrl_rmid_realloc_threshold)
-				continue;
-		}
+		err = resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
+					     QOS_L3_OCCUP_EVENT_ID, &val);
+		if (err || val <= resctrl_rmid_realloc_threshold)
+			continue;
 
 		/*
 		 * For the first limbo RMID in the domain,
@@ -410,7 +429,6 @@  static void add_rmid_to_limbo(struct rmid_entry *entry)
 		set_bit(idx, d->rmid_busy_llc);
 		entry->busy++;
 	}
-	put_cpu();
 
 	if (entry->busy)
 		rmid_limbo_count++;