diff mbox series

[v3,3/3] x86/resctrl: Implement rename op for mon groups

Message ID	20230125101334.1069060-4-peternewman@google.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20; Date: Wed, 25 Jan 2023 11:13:34 +0100 In-Reply-To: <20230125101334.1069060-1-peternewman@google.com> Mime-Version: 1.0 References: <20230125101334.1069060-1-peternewman@google.com> Message-ID: <20230125101334.1069060-4-peternewman@google.com> Subject: [PATCH v3 3/3] x86/resctrl: Implement rename op for mon groups From: Peter Newman <peternewman@google.com> To: reinette.chatre@intel.com, fenghua.yu@intel.com Cc: Babu.Moger@amd.com, bp@alien8.de, dave.hansen@linux.intel.com, eranian@google.com, gupasani@google.com, hpa@zytor.com, james.morse@arm.com, linux-kernel@vger.kernel.org, mingo@redhat.com, skodak@google.com, tony.luck@intel.com, tglx@linutronix.de, x86@kernel.org, Peter Newman <peternewman@google.com> Content-Type: text/plain; charset="UTF-8" Precedence: bulk
Series	x86/resctrl: Implement rename to help move containers' tasks \| [v3,0/3] x86/resctrl: Implement rename to help move containers' tasks [v3,1/3] x86/resctrl: Factor rdtgroup lock for multi-file ops [v3,2/3] x86/resctrl: Factor work to update task CLOSID/RMID [v3,3/3] x86/resctrl: Implement rename op for mon groups

Commit Message

Peter Newman Jan. 25, 2023, 10:13 a.m. UTC

  To change the class of service for a large group of tasks, such as an
application container, a container manager must write all of the tasks'
IDs into the tasks file interface of the new control group.

If a container manager is tracking containers' bandwidth usage by
placing tasks from each into their own monitoring group, it must first
move the tasks to the default monitoring group of the new control group
before it can move the tasks into their new monitoring groups. This is
undesirable because it makes bandwidth usage during the move
unattributable to the correct tasks and resets monitoring event counters
and cache usage information for the group.

To address this, implement the rename operation for resctrlfs mon groups
to effect a change in CLOSID for a MON group while otherwise leaving the
monitoring group intact.

Signed-off-by: Peter Newman <peternewman@google.com>
---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 75 ++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

Comments

Reinette Chatre Feb. 10, 2023, 11:21 p.m. UTC | #1

Hi Peter,

On 1/25/2023 2:13 AM, Peter Newman wrote:
> To change the class of service for a large group of tasks, such as an
> application container, a container manager must write all of the tasks'
> IDs into the tasks file interface of the new control group.
> 
> If a container manager is tracking containers' bandwidth usage by
> placing tasks from each into their own monitoring group, it must first
> move the tasks to the default monitoring group of the new control group
> before it can move the tasks into their new monitoring groups. This is
> undesirable because it makes bandwidth usage during the move
> unattributable to the correct tasks and resets monitoring event counters
> and cache usage information for the group.
> 
> To address this, implement the rename operation for resctrlfs mon groups
> to effect a change in CLOSID for a MON group while otherwise leaving the
> monitoring group intact.
> 
> Signed-off-by: Peter Newman <peternewman@google.com>
> ---
>  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 75 ++++++++++++++++++++++++++
>  1 file changed, 75 insertions(+)
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index b2081bc1bbfd..595f83a517c6 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -3238,6 +3238,80 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
>  	return ret;
>  }
>  
> +static void mongrp_move(struct rdtgroup *rdtgrp, struct rdtgroup *new_prdtgrp,
> +			cpumask_var_t cpus)

Could you please add some function comments on what is moved and how it is accomplished?

> +{
> +	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
> +	struct task_struct *p, *t;
> +
> +	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
> +	list_del(&rdtgrp->mon.crdtgrp_list);
> +
> +	list_add_tail(&rdtgrp->mon.crdtgrp_list,
> +		      &new_prdtgrp->mon.crdtgrp_list);
> +	rdtgrp->mon.parent = new_prdtgrp;
> +
> +	read_lock(&tasklist_lock);
> +	for_each_process_thread(p, t) {
> +		if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
> +			rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
> +					  cpus);
> +	}
> +	read_unlock(&tasklist_lock);

Can rdt_move_group_tasks() be used here?

> +
> +	update_closid_rmid(cpus, NULL);
> +}

I see the tasks in a monitor group handled. There is another way to create/manage
a monitor group. For example, by not writing tasks to the tasks file but instead
to write CPU ids to the CPU file. All tasks on a particular CPU will be monitored
by that group. One rule is that a MON group may only have CPUs that are owned by
the CTRL_MON group.
It is not clear to me how such a group is handled in this work. 


> +
> +static int rdtgroup_rename(struct kernfs_node *kn,
> +			   struct kernfs_node *new_parent, const char *new_name)
> +{
> +	struct rdtgroup *new_prdtgrp;
> +	struct rdtgroup *rdtgrp;
> +	cpumask_var_t tmpmask;
> +	int ret;
> +
> +	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	rdtgrp = kernfs_to_rdtgroup(kn);
> +	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> +	if (!rdtgrp || !new_prdtgrp) {
> +		free_cpumask_var(tmpmask);
> +		return -EPERM;
> +	}
> +

How robust is this against user space attempting to move files? 

> +	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
> +	rdtgroup_kn_get(rdtgrp, kn);
> +	rdtgroup_kn_get(new_prdtgrp, new_parent);
> +
> +	mutex_lock(&rdtgroup_mutex);
> +
> +	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> +		ret = -ESRCH;
> +		goto out;
> +	}
> +
> +	/* Only a mon group can be moved to a new mon_groups directory. */
> +	if (rdtgrp->type != RDTMON_GROUP ||
> +	    !is_mon_groups(new_parent, kn->name)) {
> +		ret = -EPERM;
> +		goto out;
> +	}
> +

Should in-place moves be allowed?

> +	ret = kernfs_rename(kn, new_parent, new_name);
> +	if (ret)
> +		goto out;
> +
> +	mongrp_move(rdtgrp, new_prdtgrp, tmpmask);
> +

Can tmpmask allocation/free be done in mongrp_move()?

> +out:
> +	mutex_unlock(&rdtgroup_mutex);
> +	rdtgroup_kn_put(rdtgrp, kn);
> +	rdtgroup_kn_put(new_prdtgrp, new_parent);
> +	free_cpumask_var(tmpmask);
> +	return ret;
> +}
> +
>  static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
>  {
>  	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
> @@ -3255,6 +3329,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
>  static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
>  	.mkdir		= rdtgroup_mkdir,
>  	.rmdir		= rdtgroup_rmdir,
> +	.rename		= rdtgroup_rename,
>  	.show_options	= rdtgroup_show_options,
>  };
>  

Reinette

Peter Newman March 2, 2023, 2:26 p.m. UTC | #2

Hi Reinette,

On Sat, Feb 11, 2023 at 12:21 AM Reinette Chatre
<reinette.chatre@intel.com> wrote:

> On 1/25/2023 2:13 AM, Peter Newman wrote:

> > --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> > @@ -3238,6 +3238,80 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
> >       return ret;
> >  }
> >
> > +static void mongrp_move(struct rdtgroup *rdtgrp, struct rdtgroup *new_prdtgrp,
> > +                     cpumask_var_t cpus)
>
> Could you please add some function comments on what is moved and how it is accomplished?

Sure, I think I should also make the name more descriptive. I think
"move" is too high-level here.

> > +     for_each_process_thread(p, t) {
> > +             if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
> > +                     rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
> > +                                       cpus);
> > +     }
> > +     read_unlock(&tasklist_lock);
>
> Can rdt_move_group_tasks() be used here?

As it is today, rdt_move_group_tasks() would move too many tasks.
mongrp_move() needs both the CLOSID and RMID to match, while
rdt_move_group_tasks() only needs 0-1 of the two to match.

I tried adding more parameters to rdt_move_group_tasks() to change the
move condition, but I couldn't make the resulting code not look gross
and after factoring the tricky stuff into rdt_move_one_task(),
rdt_move_group_tasks() didn't look interesting enough to reuse.

>
> > +
> > +     update_closid_rmid(cpus, NULL);
> > +}
>
> I see the tasks in a monitor group handled. There is another way to create/manage
> a monitor group. For example, by not writing tasks to the tasks file but instead
> to write CPU ids to the CPU file. All tasks on a particular CPU will be monitored
> by that group. One rule is that a MON group may only have CPUs that are owned by
> the CTRL_MON group.
> It is not clear to me how such a group is handled in this work.

Right, I overlooked this form of monitoring.

I don't think it makes sense to move such a monitoring group, because a
CPU can only be assigned to one CTRL_MON group. Therefore a move of a MON
group with an assigned CPU will always violate the parent CTRL_MON group
rule after the move.

Based on this, I think rename of a MON group should fail when
rdtgrp->cpu_mask is non-zero.

>
>
> > +
> > +static int rdtgroup_rename(struct kernfs_node *kn,
> > +                        struct kernfs_node *new_parent, const char *new_name)
> > +{
> > +     struct rdtgroup *new_prdtgrp;
> > +     struct rdtgroup *rdtgrp;
> > +     cpumask_var_t tmpmask;
> > +     int ret;
> > +
> > +     if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
> > +             return -ENOMEM;
> > +
> > +     rdtgrp = kernfs_to_rdtgroup(kn);
> > +     new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> > +     if (!rdtgrp || !new_prdtgrp) {
> > +             free_cpumask_var(tmpmask);
> > +             return -EPERM;
> > +     }
> > +
>
> How robust is this against user space attempting to move files?

I'm not sure I understand the question. Can you be more specific?

>
> > +     /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
> > +     rdtgroup_kn_get(rdtgrp, kn);
> > +     rdtgroup_kn_get(new_prdtgrp, new_parent);
> > +
> > +     mutex_lock(&rdtgroup_mutex);
> > +
> > +     if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
> > +             ret = -ESRCH;
> > +             goto out;
> > +     }
> > +
> > +     /* Only a mon group can be moved to a new mon_groups directory. */
> > +     if (rdtgrp->type != RDTMON_GROUP ||
> > +         !is_mon_groups(new_parent, kn->name)) {
> > +             ret = -EPERM;
> > +             goto out;
> > +     }
> > +
>
> Should in-place moves be allowed?

I don't think it's useful in the context of the intended use case.
Also, it looks like kernfs_rename() would fail with EEXIST if I tried.

If it were important to someone, I could make it succeed before calling
into kernfs_rename().

>
> > +     ret = kernfs_rename(kn, new_parent, new_name);
> > +     if (ret)
> > +             goto out;
> > +
> > +     mongrp_move(rdtgrp, new_prdtgrp, tmpmask);
> > +
>
> Can tmpmask allocation/free be done in mongrp_move()?

Yes, but it looked like most other functions in this file allocate the
cpumask up front before validating parameters. If you have a preference
for internalizing its allocation within mongrp_move(), I can try it.

Thank you for your review.

-Peter

Reinette Chatre March 2, 2023, 10:26 p.m. UTC | #3

Hi Peter,

On 3/2/2023 6:26 AM, Peter Newman wrote:
> On Sat, Feb 11, 2023 at 12:21 AM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
> 
>> On 1/25/2023 2:13 AM, Peter Newman wrote:
> 
>>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>>> @@ -3238,6 +3238,80 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
>>>       return ret;
>>>  }
>>>
>>> +static void mongrp_move(struct rdtgroup *rdtgrp, struct rdtgroup *new_prdtgrp,
>>> +                     cpumask_var_t cpus)
>>
>> Could you please add some function comments on what is moved and how it is accomplished?
> 
> Sure, I think I should also make the name more descriptive. I think
> "move" is too high-level here.
> 
> 
>>> +     for_each_process_thread(p, t) {
>>> +             if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
>>> +                     rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
>>> +                                       cpus);
>>> +     }
>>> +     read_unlock(&tasklist_lock);
>>
>> Can rdt_move_group_tasks() be used here?
> 
> As it is today, rdt_move_group_tasks() would move too many tasks.
> mongrp_move() needs both the CLOSID and RMID to match, while
> rdt_move_group_tasks() only needs 0-1 of the two to match.
> 
> I tried adding more parameters to rdt_move_group_tasks() to change the
> move condition, but I couldn't make the resulting code not look gross
> and after factoring the tricky stuff into rdt_move_one_task(),
> rdt_move_group_tasks() didn't look interesting enough to reuse.

Could it be made readable by adding a compare function as parameter to
rdt_move_group_tasks() that is used to determine if a task should be moved?

>>> +
>>> +     update_closid_rmid(cpus, NULL);
>>> +}
>>
>> I see the tasks in a monitor group handled. There is another way to create/manage
>> a monitor group. For example, by not writing tasks to the tasks file but instead
>> to write CPU ids to the CPU file. All tasks on a particular CPU will be monitored
>> by that group. One rule is that a MON group may only have CPUs that are owned by
>> the CTRL_MON group.
>> It is not clear to me how such a group is handled in this work.
> 
> Right, I overlooked this form of monitoring.
> 
> I don't think it makes sense to move such a monitoring group, because a
> CPU can only be assigned to one CTRL_MON group. Therefore a move of a MON
> group with an assigned CPU will always violate the parent CTRL_MON group
> rule after the move.
> 
> Based on this, I think rename of a MON group should fail when
> rdtgrp->cpu_mask is non-zero.

I think this is fair. Also please write something useful in the last
command status buffer.

>>> +
>>> +static int rdtgroup_rename(struct kernfs_node *kn,
>>> +                        struct kernfs_node *new_parent, const char *new_name)
>>> +{
>>> +     struct rdtgroup *new_prdtgrp;
>>> +     struct rdtgroup *rdtgrp;
>>> +     cpumask_var_t tmpmask;
>>> +     int ret;
>>> +
>>> +     if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
>>> +             return -ENOMEM;
>>> +
>>> +     rdtgrp = kernfs_to_rdtgroup(kn);
>>> +     new_prdtgrp = kernfs_to_rdtgroup(new_parent);
>>> +     if (!rdtgrp || !new_prdtgrp) {
>>> +             free_cpumask_var(tmpmask);
>>> +             return -EPERM;
>>> +     }
>>> +
>>
>> How robust is this against user space attempting to move files?
> 
> I'm not sure I understand the question. Can you be more specific?

This commit adds support for rename to resctrl. I thus expect this
function to be called when user space attempts to move _any_ of
the files and/or directories within resctrl. This could be out of
curiosity, buggy, or maliciousness. I would like to understand how
robust this code would be against such attempts because I do not see
much checking before the preparation to do the move is started.

>>> +     /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
>>> +     rdtgroup_kn_get(rdtgrp, kn);
>>> +     rdtgroup_kn_get(new_prdtgrp, new_parent);
>>> +
>>> +     mutex_lock(&rdtgroup_mutex);
>>> +
>>> +     if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
>>> +             ret = -ESRCH;
>>> +             goto out;
>>> +     }
>>> +
>>> +     /* Only a mon group can be moved to a new mon_groups directory. */
>>> +     if (rdtgrp->type != RDTMON_GROUP ||
>>> +         !is_mon_groups(new_parent, kn->name)) {
>>> +             ret = -EPERM;
>>> +             goto out;
>>> +     }
>>> +
>>
>> Should in-place moves be allowed?
> 
> I don't think it's useful in the context of the intended use case.
> Also, it looks like kernfs_rename() would fail with EEXIST if I tried.
> 

From what I can tell kernfs_rename() will return EEXIST if there
is an attempt to create file/directory with the same name at the same place.
What I am asking about is if user space requests to change the name
of a monitoring group without moving it to a new resource group. This seems
to be supported by this code but if it is supported it could likely be
done more efficiently since no tasks need to be moved because neither
closid nor rmid needs to change.

> If it were important to someone, I could make it succeed before calling
> into kernfs_rename().
> 
> 
>>
>>> +     ret = kernfs_rename(kn, new_parent, new_name);
>>> +     if (ret)
>>> +             goto out;
>>> +
>>> +     mongrp_move(rdtgrp, new_prdtgrp, tmpmask);
>>> +
>>
>> Can tmpmask allocation/free be done in mongrp_move()?
> 
> Yes, but it looked like most other functions in this file allocate the
> cpumask up front before validating parameters. If you have a preference
> for internalizing its allocation within mongrp_move(), I can try it.

Could you please elaborate what the concern is? From what I can tell
mongrp_move() is the only user of the cpumask so it is not clear to
me why it cannot take care of the allocation and free.

When referring to existing code I assume you mean rdtgroup_rmdir(). This
is the only code I could find in this file with this pattern. I looked
back at the history and indeed the cpumask was allocated where it was
used but the flow was changed to the current when support for monitoring
groups were added. See f9049547f7e7 ("x86/intel_rdt: Separate the ctrl bits from rmdir")
I do not see a requirement for doing the allocations in that way.

Reinette

Peter Newman March 3, 2023, 3:10 p.m. UTC | #4

Hi Reinette,

On Thu, Mar 2, 2023 at 11:27 PM Reinette Chatre
<reinette.chatre@intel.com> wrote:
> On 3/2/2023 6:26 AM, Peter Newman wrote:
> > On Sat, Feb 11, 2023 at 12:21 AM Reinette Chatre
> > <reinette.chatre@intel.com> wrote:
> >
> >> On 1/25/2023 2:13 AM, Peter Newman wrote:
> >>> +     for_each_process_thread(p, t) {
> >>> +             if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
> >>> +                     rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
> >>> +                                       cpus);
> >>> +     }
> >>> +     read_unlock(&tasklist_lock);
> >>
> >> Can rdt_move_group_tasks() be used here?
> >
> > As it is today, rdt_move_group_tasks() would move too many tasks.
> > mongrp_move() needs both the CLOSID and RMID to match, while
> > rdt_move_group_tasks() only needs 0-1 of the two to match.
> >
> > I tried adding more parameters to rdt_move_group_tasks() to change the
> > move condition, but I couldn't make the resulting code not look gross
> > and after factoring the tricky stuff into rdt_move_one_task(),
> > rdt_move_group_tasks() didn't look interesting enough to reuse.
>
> Could it be made readable by adding a compare function as parameter to
> rdt_move_group_tasks() that is used to determine if a task should be moved?

Yes, I think that would be ok in this case. That shouldn't have any
cost if these are all static functions.

As long as we have an rdt_move_group_tasks() function, it's a liability
to have a separate task-moving loop for someone to miss in the future.

Should I still bother with factoring out rdt_move_one_task() in the
parent patch? It was motivated by my creating a new task-moving loop
in this patch.


> >>> +
> >>> +     rdtgrp = kernfs_to_rdtgroup(kn);
> >>> +     new_prdtgrp = kernfs_to_rdtgroup(new_parent);
> >>> +     if (!rdtgrp || !new_prdtgrp) {
> >>> +             free_cpumask_var(tmpmask);
> >>> +             return -EPERM;
> >>> +     }
> >>> +
> >>
> >> How robust is this against user space attempting to move files?
> >
> > I'm not sure I understand the question. Can you be more specific?
>
> This commit adds support for rename to resctrl. I thus expect this
> function to be called when user space attempts to move _any_ of
> the files and/or directories within resctrl. This could be out of
> curiosity, buggy, or maliciousness. I would like to understand how
> robust this code would be against such attempts because I do not see
> much checking before the preparation to do the move is started.

Now I see, thanks.

kernfs_to_rdtgroup() will return the parent rdtgroup when
kn or new_parent is a file, which will lead to kernfs_rename() moving
a file out of a group or clobbering another file node. I'll need to
enforce that kn and new_parent are rdtgroup directories and not file
nodes.

Assuming that the paths of kn and new_parent exactly match their
rdtgroup directories, I believe the checks below are sufficient to
constrain the operation to only moving MON groups to existing
mon_groups directories.


> >> Should in-place moves be allowed?
> >
> > I don't think it's useful in the context of the intended use case.
> > Also, it looks like kernfs_rename() would fail with EEXIST if I tried.
> >
>
> From what I can tell kernfs_rename() will return EEXIST if there
> is an attempt to create file/directory with the same name at the same place.
> What I am asking about is if user space requests to change the name
> of a monitoring group without moving it to a new resource group. This seems
> to be supported by this code but if it is supported it could likely be
> done more efficiently since no tasks need to be moved because neither
> closid nor rmid needs to change.

Yes, I see now. I'll try skipping the mongrp_move() call below when
new_parent is already the parent of rdtgrp to optimize the simple
rename use case.


> >> Can tmpmask allocation/free be done in mongrp_move()?
> >
> > Yes, but it looked like most other functions in this file allocate the
> > cpumask up front before validating parameters. If you have a preference
> > for internalizing its allocation within mongrp_move(), I can try it.
>
> Could you please elaborate what the concern is? From what I can tell
> mongrp_move() is the only user of the cpumask so it is not clear to
> me why it cannot take care of the allocation and free.
>
> When referring to existing code I assume you mean rdtgroup_rmdir(). This
> is the only code I could find in this file with this pattern. I looked
> back at the history and indeed the cpumask was allocated where it was
> used but the flow was changed to the current when support for monitoring
> groups were added. See f9049547f7e7 ("x86/intel_rdt: Separate the ctrl bits from rmdir")
> I do not see a requirement for doing the allocations in that way.

I looked over this again in more detail...

I need to choose whether to call kernfs_rename() or mongrp_move() first.
If the second call fails, the first needs to be reverted. It's feasible
to ensure that a mongrp_move() call will be successful before calling
kernfs_rename(), but not the other way around.

If I allow mongrp_move() to fail, kernfs_rename() should be reversible
thanks to the prior checks validating this use case, but I would prefer
to eliminate the need for a revert on cleanup entirely.

-Peter

Reinette Chatre March 3, 2023, 7:05 p.m. UTC | #5

Hi Peter,

On 3/3/2023 7:10 AM, Peter Newman wrote:
> On Thu, Mar 2, 2023 at 11:27 PM Reinette Chatre
> <reinette.chatre@intel.com> wrote:
>> On 3/2/2023 6:26 AM, Peter Newman wrote:
>>> On Sat, Feb 11, 2023 at 12:21 AM Reinette Chatre
>>> <reinette.chatre@intel.com> wrote:
>>>
>>>> On 1/25/2023 2:13 AM, Peter Newman wrote:
>>>>> +     for_each_process_thread(p, t) {
>>>>> +             if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
>>>>> +                     rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
>>>>> +                                       cpus);
>>>>> +     }
>>>>> +     read_unlock(&tasklist_lock);
>>>>
>>>> Can rdt_move_group_tasks() be used here?
>>>
>>> As it is today, rdt_move_group_tasks() would move too many tasks.
>>> mongrp_move() needs both the CLOSID and RMID to match, while
>>> rdt_move_group_tasks() only needs 0-1 of the two to match.
>>>
>>> I tried adding more parameters to rdt_move_group_tasks() to change the
>>> move condition, but I couldn't make the resulting code not look gross
>>> and after factoring the tricky stuff into rdt_move_one_task(),
>>> rdt_move_group_tasks() didn't look interesting enough to reuse.
>>
>> Could it be made readable by adding a compare function as parameter to
>> rdt_move_group_tasks() that is used to determine if a task should be moved?
> 
> Yes, I think that would be ok in this case. That shouldn't have any
> cost if these are all static functions.
> 
> As long as we have an rdt_move_group_tasks() function, it's a liability
> to have a separate task-moving loop for someone to miss in the future.

Agreed.
 
> Should I still bother with factoring out rdt_move_one_task() in the
> parent patch? It was motivated by my creating a new task-moving loop
> in this patch.

I do not think that refactoring is necessary if you go this route.
 
>>>>> +
>>>>> +     rdtgrp = kernfs_to_rdtgroup(kn);
>>>>> +     new_prdtgrp = kernfs_to_rdtgroup(new_parent);
>>>>> +     if (!rdtgrp || !new_prdtgrp) {
>>>>> +             free_cpumask_var(tmpmask);
>>>>> +             return -EPERM;
>>>>> +     }
>>>>> +
>>>>
>>>> How robust is this against user space attempting to move files?
>>>
>>> I'm not sure I understand the question. Can you be more specific?
>>
>> This commit adds support for rename to resctrl. I thus expect this
>> function to be called when user space attempts to move _any_ of
>> the files and/or directories within resctrl. This could be out of
>> curiosity, buggy, or maliciousness. I would like to understand how
>> robust this code would be against such attempts because I do not see
>> much checking before the preparation to do the move is started.
> 
> Now I see, thanks.
> 
> kernfs_to_rdtgroup() will return the parent rdtgroup when
> kn or new_parent is a file, which will lead to kernfs_rename() moving
> a file out of a group or clobbering another file node. I'll need to
> enforce that kn and new_parent are rdtgroup directories and not file
> nodes.

Perhaps additionally that the source directories have mon_groups as parent,
similar to the destination check you already have.

Reinette

diff mbox series

Patch

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index b2081bc1bbfd..595f83a517c6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -3238,6 +3238,80 @@  static int rdtgroup_rmdir(struct kernfs_node *kn)
 	return ret;
 }
 
+static void mongrp_move(struct rdtgroup *rdtgrp, struct rdtgroup *new_prdtgrp,
+			cpumask_var_t cpus)
+{
+	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+	struct task_struct *p, *t;
+
+	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+	list_del(&rdtgrp->mon.crdtgrp_list);
+
+	list_add_tail(&rdtgrp->mon.crdtgrp_list,
+		      &new_prdtgrp->mon.crdtgrp_list);
+	rdtgrp->mon.parent = new_prdtgrp;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(p, t) {
+		if (is_closid_match(t, prdtgrp) && is_rmid_match(t, rdtgrp))
+			rdt_move_one_task(t, new_prdtgrp->closid, t->rmid,
+					  cpus);
+	}
+	read_unlock(&tasklist_lock);
+
+	update_closid_rmid(cpus, NULL);
+}
+
+static int rdtgroup_rename(struct kernfs_node *kn,
+			   struct kernfs_node *new_parent, const char *new_name)
+{
+	struct rdtgroup *new_prdtgrp;
+	struct rdtgroup *rdtgrp;
+	cpumask_var_t tmpmask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rdtgrp = kernfs_to_rdtgroup(kn);
+	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
+	if (!rdtgrp || !new_prdtgrp) {
+		free_cpumask_var(tmpmask);
+		return -EPERM;
+	}
+
+	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
+	rdtgroup_kn_get(rdtgrp, kn);
+	rdtgroup_kn_get(new_prdtgrp, new_parent);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
+		ret = -ESRCH;
+		goto out;
+	}
+
+	/* Only a mon group can be moved to a new mon_groups directory. */
+	if (rdtgrp->type != RDTMON_GROUP ||
+	    !is_mon_groups(new_parent, kn->name)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	ret = kernfs_rename(kn, new_parent, new_name);
+	if (ret)
+		goto out;
+
+	mongrp_move(rdtgrp, new_prdtgrp, tmpmask);
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+	rdtgroup_kn_put(rdtgrp, kn);
+	rdtgroup_kn_put(new_prdtgrp, new_parent);
+	free_cpumask_var(tmpmask);
+	return ret;
+}
+
 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
 {
 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
@@ -3255,6 +3329,7 @@  static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
 	.mkdir		= rdtgroup_mkdir,
 	.rmdir		= rdtgroup_rmdir,
+	.rename		= rdtgroup_rename,
 	.show_options	= rdtgroup_show_options,
 };