[v5,2/3] x86/resctrl: Implement rename op for mon groups
Commit Message
To change the resources allocated to a large group of tasks, such as an
application container, a container manager must write all of the tasks'
IDs into the tasks file interface of the new control group.
If the container manager is using monitoring groups to separately
track the bandwidth of containers assigned to the same control group,
it must first move the container's tasks to the default monitoring
group of the new control group before it can move these tasks into the
container's replacement monitoring groups under the destination
control group. This is undesirable because it makes bandwidth usage
during the move unattributable to the correct tasks and resets monitoring
event counters and cache usage information for the group.
Implement the rename operation only for resctrlfs monitor groups to
enable users to move a monitoring group from one control group to
another. This effects a change in resources allocated to all the tasks
in the monitoring group while otherwise leaving the monitoring data
intact.
Signed-off-by: Peter Newman <peternewman@google.com>
---
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 128 +++++++++++++++++++++++++
1 file changed, 128 insertions(+)
Comments
Hi Peter,
On 3/30/2023 6:55 AM, Peter Newman wrote:
> To change the resources allocated to a large group of tasks, such as an
> application container, a container manager must write all of the tasks'
> IDs into the tasks file interface of the new control group.
>
> If a container manager is additionally tracking containers' bandwidth
> usage by placing tasks from each into their own monitoring group, it
The above sentence seems to be missing something after the "for each".
It seems to still parse if "for each" is removed.
> must first move the tasks to the default monitoring group of the new
> control group before it can move the tasks into their new monitoring
> groups. This is undesirable because it makes bandwidth usage during the
> move unattributable to the correct tasks and resets monitoring event
> counters and cache usage information for the group.
>
> Implement the rename operation only for resctrlfs monitor groups to
> enable users to move a monitoring group from one control group to
> another. This effects a change in resources allocated to all the tasks
> in the monitoring group while otherwise leaving the monitoring data
> intact.
>
> Signed-off-by: Peter Newman <peternewman@google.com>
> ---
> arch/x86/kernel/cpu/resctrl/rdtgroup.c | 128 +++++++++++++++++++++++++
> 1 file changed, 128 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 51b869149e76..86de22d8e23a 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -3514,6 +3514,133 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> return ret;
> }
>
> +/**
> + * mongrp_reparent() - replace parent CTRL_MON group of a MON group
> + * @rdtgrp: the MON group whose parent should be replaced
> + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
> + * @cpus: cpumask provided by the caller for use during this call
> + *
> + * Replaces the parent CTRL_MON group for a MON group, resulting in all member
> + * tasks' CLOSID immediately changing to that of the new parent group.
> + * Monitoring data for the group is unaffected by this operation.
> + */
> +static void mongrp_reparent(struct rdtgroup *rdtgrp,
> + struct rdtgroup *new_prdtgrp,
> + cpumask_var_t cpus)
> +{
> + struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> +
> + WARN_ON(rdtgrp->type != RDTMON_GROUP);
> + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
> +
> + /* Nothing to do when simply renaming a MON group. */
> + if (prdtgrp == new_prdtgrp)
> + return;
> +
> + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> + list_move_tail(&rdtgrp->mon.crdtgrp_list,
> + &new_prdtgrp->mon.crdtgrp_list);
> +
> + rdtgrp->mon.parent = new_prdtgrp;
> + rdtgrp->closid = new_prdtgrp->closid;
> +
> + /* Propagate updated closid to all tasks in this group. */
> + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
> +
> + update_closid_rmid(cpus, NULL);
> +}
> +
> +static int rdtgroup_rename(struct kernfs_node *kn,
> + struct kernfs_node *new_parent, const char *new_name)
> +{
> + struct rdtgroup *new_prdtgrp;
> + struct rdtgroup *rdtgrp;
> + cpumask_var_t tmpmask;
> + int ret;
> +
> + rdtgrp = kernfs_to_rdtgroup(kn);
> + new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> + if (!rdtgrp || !new_prdtgrp)
> + return -ENOENT;
> +
> + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
> + rdtgroup_kn_get(rdtgrp, kn);
> + rdtgroup_kn_get(new_prdtgrp, new_parent);
> +
> + mutex_lock(&rdtgroup_mutex);
> +
> + rdt_last_cmd_clear();
> +
> + /*
> + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
> + * either kernfs_node is a file.
> + */
> + if (kernfs_type(kn) != KERNFS_DIR ||
> + kernfs_type(new_parent) != KERNFS_DIR) {
> + rdt_last_cmd_puts("Source and destination must be group directories");
I do not think it is obvious what a "group directory" is. The source must be a
monitoring group and the destination must be the "mon_groups" directory. Maybe
the "group" term can just be dropped to read "Source and destination must be
directories" (which is exactly what is tested for).
> + ret = -EPERM;
> + goto out;
> + }
> +
> + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
> + !is_mon_groups(kn->parent, kn->name)) {
> + rdt_last_cmd_puts("Source must be a MON group\n");
> + ret = -EPERM;
> + goto out;
> + }
> +
> + if (!is_mon_groups(new_parent, new_name)) {
> + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
> + ret = -EPERM;
> + goto out;
> + }
> +
Thanks. I think using these terms ("MON" and "mon_groups") in the error messages
are useful since it gives the user something to search for in the documentation.
> + /*
> + * If the MON group is monitoring CPUs, the CPUs must be assigned to the
> + * current parent CTRL_MON group and therefore cannot be assigned to
> + * the new parent, making the move illegal.
> + */
> + if (!cpumask_empty(&rdtgrp->cpu_mask) &&
> + (rdtgrp->mon.parent != new_prdtgrp)) {
You can remove the extra parentheses so that this patch can get a clean slate
from "checkpatch.pl --strict" done as this work moves to the next level.
> + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
> + ret = -EPERM;
> + goto out;
> + }
> +
> + /*
> + * Allocate the cpumask for use in mongrp_reparent() to avoid the
> + * possibility of failing to allocate it after kernfs_rename() has
> + * succeeded.
> + */
> + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + /*
> + * Perform all input validation and allocations needed to ensure
> + * mongrp_reparent() will succeed before calling kernfs_rename(),
> + * otherwise it would be necessary to revert this call if
> + * mongrp_reparent() failed.
> + */
> + ret = kernfs_rename(kn, new_parent, new_name);
> + if (!ret)
> + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
> +
> + free_cpumask_var(tmpmask);
> +
> +out:
> + mutex_unlock(&rdtgroup_mutex);
> + rdtgroup_kn_put(rdtgrp, kn);
> + rdtgroup_kn_put(new_prdtgrp, new_parent);
> + return ret;
> +}
> +
> static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
> {
> if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
> @@ -3531,6 +3658,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
> static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
> .mkdir = rdtgroup_mkdir,
> .rmdir = rdtgroup_rmdir,
> + .rename = rdtgroup_rename,
> .show_options = rdtgroup_show_options,
> };
>
Thank you very much.
Just the few minor comments. With those addressed:
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Reinette
Hi Reinette,
On Tue, Apr 18, 2023 at 11:53 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 3/30/2023 6:55 AM, Peter Newman wrote:
> > If a container manager is additionally tracking containers' bandwidth
> > usage by placing tasks from each into their own monitoring group, it
>
> The above sentence seems to be missing something after the "for each".
> It seems to still parse if "for each" is removed.
Did you mean "from each"? In any case, I'll further disambiguate to
this in my next update:
"If the container manager is using monitoring groups to separately
track the bandwidth of containers assigned to the same control group,
it must first move the container's tasks to the default monitoring
group of the new control group before it can move these tasks into the
container's replacement monitoring groups under the destination
control group."
> > + /*
> > + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
> > + * either kernfs_node is a file.
> > + */
> > + if (kernfs_type(kn) != KERNFS_DIR ||
> > + kernfs_type(new_parent) != KERNFS_DIR) {
> > + rdt_last_cmd_puts("Source and destination must be group directories");
>
> I do not think it is obvious what a "group directory" is. The source must be a
> monitoring group and the destination must be the "mon_groups" directory. Maybe
> the "group" term can just be dropped to read "Source and destination must be
> directories" (which is exactly what is tested for).
Sounds good.
>
> > + ret = -EPERM;
> > + goto out;
> > + }
> > +
> > + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> > + ret = -ENOENT;
> > + goto out;
> > + }
> > +
> > + if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
> > + !is_mon_groups(kn->parent, kn->name)) {
> > + rdt_last_cmd_puts("Source must be a MON group\n");
> > + ret = -EPERM;
> > + goto out;
> > + }
> > +
> > + if (!is_mon_groups(new_parent, new_name)) {
> > + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
> > + ret = -EPERM;
> > + goto out;
> > + }
> > +
>
> Thanks. I think using these terms ("MON" and "mon_groups") in the error messages
> are useful since it gives the user something to search for in the documentation.
>
> > + /*
> > + * If the MON group is monitoring CPUs, the CPUs must be assigned to the
> > + * current parent CTRL_MON group and therefore cannot be assigned to
> > + * the new parent, making the move illegal.
> > + */
> > + if (!cpumask_empty(&rdtgrp->cpu_mask) &&
> > + (rdtgrp->mon.parent != new_prdtgrp)) {
>
> You can remove the extra parentheses so that this patch can get a clean slate
> from "checkpatch.pl --strict" done as this work moves to the next level.
Ok
>
> Thank you very much.
>
> Just the few minor comments. With those addressed:
>
> Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Thanks again for your careful review. Also thank you for suggesting
this solution. It's a big improvement in maintainability over what
we've been using downstream.
-Peter
Hi Peter,
On 4/19/2023 2:38 AM, Peter Newman wrote:
> On Tue, Apr 18, 2023 at 11:53 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 3/30/2023 6:55 AM, Peter Newman wrote:
>>> If a container manager is additionally tracking containers' bandwidth
>>> usage by placing tasks from each into their own monitoring group, it
>>
>> The above sentence seems to be missing something after the "for each".
>> It seems to still parse if "for each" is removed.
>
> Did you mean "from each"?
I did, yes. Thanks for converting it to what I intended to write.
> In any case, I'll further disambiguate to
> this in my next update:
>
> "If the container manager is using monitoring groups to separately
> track the bandwidth of containers assigned to the same control group,
> it must first move the container's tasks to the default monitoring
> group of the new control group before it can move these tasks into the
> container's replacement monitoring groups under the destination
> control group."
Looks good to me. Thank you.
Reinette
@@ -3514,6 +3514,133 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
return ret;
}
+/**
+ * mongrp_reparent() - replace parent CTRL_MON group of a MON group
+ * @rdtgrp: the MON group whose parent should be replaced
+ * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
+ * @cpus: cpumask provided by the caller for use during this call
+ *
+ * Moves @rdtgrp onto @new_prdtgrp's child list and copies the new parent's
+ * CLOSID, so member tasks' CLOSID immediately becomes that of the new parent.
+ * No-op if the parent is unchanged. Monitoring data for the group is unaffected.
+ */
+static void mongrp_reparent(struct rdtgroup *rdtgrp,
+ struct rdtgroup *new_prdtgrp,
+ cpumask_var_t cpus)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+
+ WARN_ON(rdtgrp->type != RDTMON_GROUP);
+ WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
+
+ /* Same parent means a plain rename: lists and closid stay as they are. */
+ if (prdtgrp == new_prdtgrp)
+ return;
+
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_move_tail(&rdtgrp->mon.crdtgrp_list,
+ &new_prdtgrp->mon.crdtgrp_list);
+
+ rdtgrp->mon.parent = new_prdtgrp;
+ rdtgrp->closid = new_prdtgrp->closid;
+
+ /* Propagate the closid assigned above to all tasks in this group. */
+ rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
+
+ update_closid_rmid(cpus, NULL);
+}
+
+static int rdtgroup_rename(struct kernfs_node *kn,
+ struct kernfs_node *new_parent, const char *new_name)
+{
+ struct rdtgroup *new_prdtgrp;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret;
+
+ rdtgrp = kernfs_to_rdtgroup(kn);
+ new_prdtgrp = kernfs_to_rdtgroup(new_parent);
+ if (!rdtgrp || !new_prdtgrp)
+ return -ENOENT;
+
+ /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
+ rdtgroup_kn_get(rdtgrp, kn);
+ rdtgroup_kn_get(new_prdtgrp, new_parent);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ /*
+ * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
+ * either kernfs_node is a file.
+ */
+ if (kernfs_type(kn) != KERNFS_DIR ||
+ kernfs_type(new_parent) != KERNFS_DIR) {
+ rdt_last_cmd_puts("Source and destination must be directories\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (rdtgrp->type != RDTMON_GROUP || !kn->parent ||
+ !is_mon_groups(kn->parent, kn->name)) {
+ rdt_last_cmd_puts("Source must be a MON group\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (!is_mon_groups(new_parent, new_name)) {
+ rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the MON group is monitoring CPUs, the CPUs must be assigned to the
+ * current parent CTRL_MON group and therefore cannot be assigned to
+ * the new parent, making the move illegal.
+ */
+ if (!cpumask_empty(&rdtgrp->cpu_mask) &&
+ rdtgrp->mon.parent != new_prdtgrp) {
+ rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * Allocate the cpumask for use in mongrp_reparent() to avoid the
+ * possibility of failing to allocate it after kernfs_rename() has
+ * succeeded.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Perform all input validation and allocations needed to ensure
+ * mongrp_reparent() will succeed before calling kernfs_rename(),
+ * otherwise it would be necessary to revert this call if
+ * mongrp_reparent() failed.
+ */
+ ret = kernfs_rename(kn, new_parent, new_name);
+ if (!ret)
+ mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
+
+ free_cpumask_var(tmpmask);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ rdtgroup_kn_put(rdtgrp, kn);
+ rdtgroup_kn_put(new_prdtgrp, new_parent);
+ return ret;
+}
+
static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
@@ -3531,6 +3658,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
.mkdir = rdtgroup_mkdir,
.rmdir = rdtgroup_rmdir,
+ .rename = rdtgroup_rename, /* returns -EPERM unless the source is a MON group */
.show_options = rdtgroup_show_options,
};