[v8,08/24] x86/resctrl: Track the number of dirty RMID a CLOSID has

Message ID 20231215174343.13872-9-james.morse@arm.com
State New
Headers
Series x86/resctrl: monitored closid+rmid together, separate arch/fs locking |

Commit Message

James Morse Dec. 15, 2023, 5:43 p.m. UTC
  MPAM's PMG bits extend its PARTID space, meaning the same PMG value can be
used for different control groups.

This means once a CLOSID is allocated, all its monitoring ids may still be
dirty, and held in limbo.

Keep track of the number of RMID held in limbo each CLOSID has. This will
allow a future helper to find the 'cleanest' CLOSID when allocating.

The array is only needed when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is
defined. This will never be the case on x86.

Signed-off-by: James Morse <james.morse@arm.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Peter Newman <peternewman@google.com>
Tested-by: Babu Moger <babu.moger@amd.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
---
Changes since v4:
 * Moved closid_num_dirty_rmid[] update under entry->busy check
 * Take the mutex in dom_data_init() as the caller doesn't.

Changes since v5:
 * Added braces after an else.
 * Made closid_num_dirty_rmid an unsigned int.
 * Moved mutex_lock() in dom_data_init() to cover the whole function.

Changes since v6:
 * Made closid_num_dirty_rmid[] and associated tmp variables u32.

Changes since v7:
 * Clobber kfree()d variable with NULL.
 * Guard the use of closid_num_dirty_rmid with IS_ENABLED() so it can be
   optimised out on x86.
---
 arch/x86/kernel/cpu/resctrl/monitor.c | 69 +++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 10 deletions(-)
  

Comments

Moger, Babu Jan. 3, 2024, 7:43 p.m. UTC | #1
On 12/15/23 11:43, James Morse wrote:
> MPAM's PMG bits extend its PARTID space, meaning the same PMG value can be
> used for different control groups.
> 
> This means once a CLOSID is allocated, all its monitoring ids may still be
> dirty, and held in limbo.
> 
> Keep track of the number of RMID held in limbo each CLOSID has. This will
> allow a future helper to find the 'cleanest' CLOSID when allocating.
> 
> The array is only needed when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is
> defined. This will never be the case on x86.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
> Tested-by: Peter Newman <peternewman@google.com>
> Tested-by: Babu Moger <babu.moger@amd.com>
> Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
> Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>

Reviewed-by: Babu Moger <babu.moger@amd.com>

> ---
> Changes since v4:
>  * Moved closid_num_dirty_rmid[] update under entry->busy check
>  * Take the mutex in dom_data_init() as the caller doesn't.
> 
> Changes since v5:
>  * Added braces after an else.
>  * Made closid_num_dirty_rmid an unsigned int.
>  * Moved mutex_lock() in dom_data_init() to cover the whole function.
> 
> Changes since v6:
>  * Made closid_num_dirty_rmid[] and associated tmp variables u32.
> 
> Changes since v7:
>  * Clobber kfree()d variable with NULL.
>  * Guard the use of closid_num_dirty_rmid with IS_ENABLED() so it can be
>    optimised out on x86.
> ---
>  arch/x86/kernel/cpu/resctrl/monitor.c | 69 +++++++++++++++++++++++----
>  1 file changed, 59 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
> index 1f371b108a74..6dfc68c800c8 100644
> --- a/arch/x86/kernel/cpu/resctrl/monitor.c
> +++ b/arch/x86/kernel/cpu/resctrl/monitor.c
> @@ -50,6 +50,13 @@ struct rmid_entry {
>   */
>  static LIST_HEAD(rmid_free_lru);
>  
> +/**
> + * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
> + *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
> + *     Indexed by CLOSID. Protected by rdtgroup_mutex.
> + */
> +static u32 *closid_num_dirty_rmid;
> +
>  /*
>   * @rmid_limbo_count - count of currently unused but (potentially)
>   *     dirty RMIDs.
> @@ -292,6 +299,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
>  	return 0;
>  }
>  
> +static void limbo_release_entry(struct rmid_entry *entry)
> +{
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
> +	rmid_limbo_count--;
> +	list_add_tail(&entry->list, &rmid_free_lru);
> +
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> +		closid_num_dirty_rmid[entry->closid]--;
> +}
> +
>  /*
>   * Check the RMIDs that are marked as busy for this domain. If the
>   * reported LLC occupancy is below the threshold clear the busy bit and
> @@ -328,10 +346,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
>  
>  		if (force_free || !rmid_dirty) {
>  			clear_bit(idx, d->rmid_busy_llc);
> -			if (!--entry->busy) {
> -				rmid_limbo_count--;
> -				list_add_tail(&entry->list, &rmid_free_lru);
> -			}
> +			if (!--entry->busy)
> +				limbo_release_entry(entry);
>  		}
>  		cur_idx = idx + 1;
>  	}
> @@ -398,6 +414,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
>  	u64 val = 0;
>  	u32 idx;
>  
> +	lockdep_assert_held(&rdtgroup_mutex);
> +
>  	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
>  
>  	entry->busy = 0;
> @@ -423,10 +441,13 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
>  	}
>  	put_cpu();
>  
> -	if (entry->busy)
> +	if (entry->busy) {
>  		rmid_limbo_count++;
> -	else
> +		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
> +			closid_num_dirty_rmid[entry->closid]++;
> +	} else {
>  		list_add_tail(&entry->list, &rmid_free_lru);
> +	}
>  }
>  
>  void free_rmid(u32 closid, u32 rmid)
> @@ -792,13 +813,33 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
>  static int dom_data_init(struct rdt_resource *r)
>  {
>  	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +	u32 num_closid = resctrl_arch_get_num_closid(r);
>  	struct rmid_entry *entry = NULL;
> +	int err = 0, i;
>  	u32 idx;
> -	int i;
> +
> +	mutex_lock(&rdtgroup_mutex);
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +		u32 *tmp;
> +
> +		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
> +		if (!tmp) {
> +			err = -ENOMEM;
> +			goto out_unlock;
> +		}
> +
> +		closid_num_dirty_rmid = tmp;
> +	}
>  
>  	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
> -	if (!rmid_ptrs)
> -		return -ENOMEM;
> +	if (!rmid_ptrs) {
> +		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +			kfree(closid_num_dirty_rmid);
> +			closid_num_dirty_rmid = NULL;
> +		}
> +		err = -ENOMEM;
> +		goto out_unlock;
> +	}
>  
>  	for (i = 0; i < idx_limit; i++) {
>  		entry = &rmid_ptrs[i];
> @@ -818,13 +859,21 @@ static int dom_data_init(struct rdt_resource *r)
>  	entry = __rmid_entry(idx);
>  	list_del(&entry->list);
>  
> -	return 0;
> +out_unlock:
> +	mutex_unlock(&rdtgroup_mutex);
> +
> +	return err;
>  }
>  
>  static void __exit dom_data_exit(struct rdt_resource *r)
>  {
>  	mutex_lock(&rdtgroup_mutex);
>  
> +	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
> +		kfree(closid_num_dirty_rmid);
> +		closid_num_dirty_rmid = NULL;
> +	}
> +
>  	kfree(rmid_ptrs);
>  	rmid_ptrs = NULL;
>
  
Peter Newman Jan. 4, 2024, 7:13 p.m. UTC | #2
Hi James,

On Fri, Dec 15, 2023 at 9:44 AM James Morse <james.morse@arm.com> wrote:
>  void free_rmid(u32 closid, u32 rmid)
> @@ -792,13 +813,33 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
>  static int dom_data_init(struct rdt_resource *r)
>  {
>         u32 idx_limit = resctrl_arch_system_num_rmid_idx();
> +       u32 num_closid = resctrl_arch_get_num_closid(r);

Which resource is this again? Surely the one with the smallest number
of CLOSIDs?

It's not much harm if the array is bigger than it needs to be, but
I've become curious about how The Monitoring Resource is used in the
code when there are later changes[1] which would cause this function
to be called on RDT_RESOURCE_L3, RDT_RESOURCE_MBA, or both.

Given that we have hardware with event counters residing at different
levels of the topology and possibly being associated with different
rdt_resources, more attention needs to be paid to how these parameters
are used in code related to monitoring.

-Peter

[1] https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/commit/drivers/platform/mpam/mpam_resctrl.c?h=mpam/snapshot/v6.7-rc2&id=a0ab3a6c26002
  
James Morse Jan. 22, 2024, 6:05 p.m. UTC | #3
Hi Peter,

On 04/01/2024 19:13, Peter Newman wrote:
> On Fri, Dec 15, 2023 at 9:44 AM James Morse <james.morse@arm.com> wrote:
>>  void free_rmid(u32 closid, u32 rmid)
>> @@ -792,13 +813,33 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
>>  static int dom_data_init(struct rdt_resource *r)
>>  {
>>         u32 idx_limit = resctrl_arch_system_num_rmid_idx();
>> +       u32 num_closid = resctrl_arch_get_num_closid(r);

> Which resource is this again? Surely the one with the smallest number
> of CLOSIDs?

Today it's implicitly L3 because that is the only one resctrl supports monitoring on.


> It's not much harm if the array is bigger than it needs to be, but

Heh, the use of this variable is behind those IS_ENABLED() checks, which means it gets removed
unless you are on an MPAM system. MPAM always has to sanitise these fields as not all the
hardware is exposed to resctrl.
(e.g. L3 and MB might support 16 CLOSID, but if there is an invisible system-cache in
between them that only supports 8 CLOSID, the system-wide value has to be 8, regardless of
what the hardware supports.)

The MPAM driver finds the system wide value here:
https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/platform/mpam/mpam_devices.c?h=mpam/snapshot/v6.7-rc2#n757

And regardless of which resource you select, returns that value here:
https://git.kernel.org/pub/scm/linux/kernel/git/morse/linux.git/tree/drivers/platform/mpam/mpam_resctrl.c?h=mpam/snapshot/v6.7-rc2#n128

On x86 the helper returns the hardware num CLOSID so that the resctrl sanitisation does
the right thing.

I'll add a comment that this may over-allocate if the architecture isn't pre-sanitising
this field:
|                /*
|                 * If the architecture hasn't provided a sanitised value here,
|                 * this may result in larger arrays than necessary. Resctrl will
|                 * use a smaller system wide value based on the resources in
|                 * use.
|                 */


> I've become curious about how The Monitoring Resource is used in the
> code when there are later changes[1] which would cause this function
> to be called on RDT_RESOURCE_L3, RDT_RESOURCE_MBA, or both.

I need to digest Tony's series. Today the event names all have L3 in them - the MPAM
driver is ignoring both this and the resources, and relying on heuristics to pick
something to back these counters with. Something is better than nothing.
I agree it can be improved as resctrl allows more things to be exposed.


> Given that we have hardware with event counters residing at different
> levels of the topology and possibly being associated with different
> rdt_resources, more attention needs to be paid to how these parameters
> are used in code related to monitoring.

Certainly there is likely to be weirdness in what the MPAM driver picks here. Those
patches are marked untested for a reason! I have nothing I can test the bandwidth counters on.

My intention here is that 'things that look like a Xeon' should behave equivalently as far
as resctrl can see. That gets any existing software working. Beyond that we can talk about
extending what we have to better cover the hardware people have built.

I'm coming to the conclusion that results vary depending on {ingress,egress} of {L3, SLC,
Memory-Side-Cache, Memory-Controller} - even when only one is implemented, and that hiding
this in resctrl isn't helpful. Using perf's platform-specific json files to identify
counters may be a better approach.


Thanks,

James
  
James Morse Jan. 22, 2024, 6:05 p.m. UTC | #4
Hi Babu,

On 03/01/2024 19:43, Moger, Babu wrote:
> On 12/15/23 11:43, James Morse wrote:
>> MPAM's PMG bits extend its PARTID space, meaning the same PMG value can be
>> used for different control groups.
>>
>> This means once a CLOSID is allocated, all its monitoring ids may still be
>> dirty, and held in limbo.
>>
>> Keep track of the number of RMID held in limbo each CLOSID has. This will
>> allow a future helper to find the 'cleanest' CLOSID when allocating.
>>
>> The array is only needed when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is
>> defined. This will never be the case on x86.

> Reviewed-by: Babu Moger <babu.moger@amd.com>

Thanks!

James
  

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 1f371b108a74..6dfc68c800c8 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -50,6 +50,13 @@  struct rmid_entry {
  */
 static LIST_HEAD(rmid_free_lru);
 
+/**
+ * @closid_num_dirty_rmid    The number of dirty RMID each CLOSID has.
+ *     Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
+ *     Indexed by CLOSID. Protected by rdtgroup_mutex.
+ */
+static u32 *closid_num_dirty_rmid;
+
 /*
  * @rmid_limbo_count - count of currently unused but (potentially)
  *     dirty RMIDs.
@@ -292,6 +299,17 @@  int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	return 0;
 }
 
+static void limbo_release_entry(struct rmid_entry *entry)
+{
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	rmid_limbo_count--;
+	list_add_tail(&entry->list, &rmid_free_lru);
+
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+		closid_num_dirty_rmid[entry->closid]--;
+}
+
 /*
  * Check the RMIDs that are marked as busy for this domain. If the
  * reported LLC occupancy is below the threshold clear the busy bit and
@@ -328,10 +346,8 @@  void __check_limbo(struct rdt_domain *d, bool force_free)
 
 		if (force_free || !rmid_dirty) {
 			clear_bit(idx, d->rmid_busy_llc);
-			if (!--entry->busy) {
-				rmid_limbo_count--;
-				list_add_tail(&entry->list, &rmid_free_lru);
-			}
+			if (!--entry->busy)
+				limbo_release_entry(entry);
 		}
 		cur_idx = idx + 1;
 	}
@@ -398,6 +414,8 @@  static void add_rmid_to_limbo(struct rmid_entry *entry)
 	u64 val = 0;
 	u32 idx;
 
+	lockdep_assert_held(&rdtgroup_mutex);
+
 	idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
 
 	entry->busy = 0;
@@ -423,10 +441,13 @@  static void add_rmid_to_limbo(struct rmid_entry *entry)
 	}
 	put_cpu();
 
-	if (entry->busy)
+	if (entry->busy) {
 		rmid_limbo_count++;
-	else
+		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+			closid_num_dirty_rmid[entry->closid]++;
+	} else {
 		list_add_tail(&entry->list, &rmid_free_lru);
+	}
 }
 
 void free_rmid(u32 closid, u32 rmid)
@@ -792,13 +813,33 @@  void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
 static int dom_data_init(struct rdt_resource *r)
 {
 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+	u32 num_closid = resctrl_arch_get_num_closid(r);
 	struct rmid_entry *entry = NULL;
+	int err = 0, i;
 	u32 idx;
-	int i;
+
+	mutex_lock(&rdtgroup_mutex);
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+		u32 *tmp;
+
+		tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+		if (!tmp) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+
+		closid_num_dirty_rmid = tmp;
+	}
 
 	rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
-	if (!rmid_ptrs)
-		return -ENOMEM;
+	if (!rmid_ptrs) {
+		if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+			kfree(closid_num_dirty_rmid);
+			closid_num_dirty_rmid = NULL;
+		}
+		err = -ENOMEM;
+		goto out_unlock;
+	}
 
 	for (i = 0; i < idx_limit; i++) {
 		entry = &rmid_ptrs[i];
@@ -818,13 +859,21 @@  static int dom_data_init(struct rdt_resource *r)
 	entry = __rmid_entry(idx);
 	list_del(&entry->list);
 
-	return 0;
+out_unlock:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return err;
 }
 
 static void __exit dom_data_exit(struct rdt_resource *r)
 {
 	mutex_lock(&rdtgroup_mutex);
 
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+		kfree(closid_num_dirty_rmid);
+		closid_num_dirty_rmid = NULL;
+	}
+
 	kfree(rmid_ptrs);
 	rmid_ptrs = NULL;