[5/6] cgroup/cpuset: Free DL BW in case can_attach() fails

Message ID 20230329125558.255239-6-juri.lelli@redhat.com
State New
Series sched/deadline: cpuset: Rework DEADLINE bandwidth restoration

Commit Message

Juri Lelli March 29, 2023, 12:55 p.m. UTC
  From: Dietmar Eggemann <dietmar.eggemann@arm.com>

cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks
have been checked. DL BW is not allocated per-task but as a sum over
all DL tasks migrating.

If multiple controllers are attached to the cgroup next to the cpuset
controller, a non-cpuset can_attach() can fail. In this case, free DL BW
in cpuset_cancel_attach().

Finally, update cpuset DL task count (nr_deadline_tasks) only in
cpuset_attach().

Suggested-by: Waiman Long <longman@redhat.com>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
---
 include/linux/sched.h  |  2 +-
 kernel/cgroup/cpuset.c | 55 ++++++++++++++++++++++++++++++++++++++----
 kernel/sched/core.c    | 17 ++-----------
 3 files changed, 53 insertions(+), 21 deletions(-)
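
In short, the patch splits DL bandwidth handling across the three attach
callbacks. A schematic summary of the resulting flow (a sketch only; the
authoritative code is in the full patch below):

	/*
	 * cpuset_can_attach()
	 *	- sum task->dl.dl_bw over all migrating DL tasks
	 *	- a single dl_bw_alloc(cpu, cs->sum_migrate_dl_bw) for the total
	 * cpuset_cancel_attach()
	 *	- dl_bw_free() the total if another controller's can_attach() failed
	 * cpuset_attach()
	 *	- fold nr_migrate_dl_tasks into nr_deadline_tasks
	 *	- reset_migrate_dl_data() clears the temporary per-cpuset state
	 */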
  

Comments

Waiman Long March 29, 2023, 2:25 p.m. UTC | #1
On 3/29/23 08:55, Juri Lelli wrote:
> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
>
> cpuset_can_attach() can fail. Postpone DL BW allocation until all tasks
> have been checked. DL BW is not allocated per-task but as a sum over
> all DL tasks migrating.
>
> If multiple controllers are attached to the cgroup next to the cuset
Typo: "cuset" => "cpuset"
> controller a non-cpuset can_attach() can fail. In this case free DL BW
> in cpuset_cancel_attach().
>
> [...]
  
Waiman Long March 29, 2023, 2:31 p.m. UTC | #2
On 3/29/23 10:25, Waiman Long wrote:
>
> On 3/29/23 08:55, Juri Lelli wrote:
>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>

[...]

>> @@ -2518,11 +2547,21 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>>   static void cpuset_cancel_attach(struct cgroup_taskset *tset)
>>   {
>>       struct cgroup_subsys_state *css;
>> +    struct cpuset *cs;
>>         cgroup_taskset_first(tset, &css);
>> +    cs = css_cs(css);
>>         mutex_lock(&cpuset_mutex);
>> -    css_cs(css)->attach_in_progress--;
>> +    cs->attach_in_progress--;
>> +
>> +    if (cs->nr_migrate_dl_tasks) {
>> +        int cpu = cpumask_any(cs->effective_cpus);
>> +
>> +        dl_bw_free(cpu, cs->sum_migrate_dl_bw);
>> +        reset_migrate_dl_data(cs);
>> +    }
>> +

Another nit that I have is that you may also have to record the cpu
where the DL bandwidth is allocated in cpuset_can_attach() and free the
bandwidth back to that cpu, or there can be an underflow if another cpu
is chosen.

Cheers,
Longman
  
Dietmar Eggemann March 29, 2023, 4:39 p.m. UTC | #3
On 29/03/2023 16:31, Waiman Long wrote:
> On 3/29/23 10:25, Waiman Long wrote:
>>
>> On 3/29/23 08:55, Juri Lelli wrote:
>>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>

[...]

>>> @@ -2518,11 +2547,21 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
>>>   static void cpuset_cancel_attach(struct cgroup_taskset *tset)
>>>   {
>>>       struct cgroup_subsys_state *css;
>>> +    struct cpuset *cs;
>>>         cgroup_taskset_first(tset, &css);
>>> +    cs = css_cs(css);
>>>         mutex_lock(&cpuset_mutex);
>>> -    css_cs(css)->attach_in_progress--;
>>> +    cs->attach_in_progress--;
>>> +
>>> +    if (cs->nr_migrate_dl_tasks) {
>>> +        int cpu = cpumask_any(cs->effective_cpus);
>>> +
>>> +        dl_bw_free(cpu, cs->sum_migrate_dl_bw);
>>> +        reset_migrate_dl_data(cs);
>>> +    }
>>> +
> 
> Another nit that I have is that you may also have to record the cpu
> where the DL bandwidth is allocated in cpuset_can_attach() and free the
> bandwidth back to that cpu, or there can be an underflow if another cpu
> is chosen.

Many thanks for the review!

But isn't the DL BW control `struct dl_bw` per `struct root_domain`,
which is per exclusive cpuset? So as long as the cpu is from
`cs->effective_cpus`, shouldn't this be fine?
  
Waiman Long March 29, 2023, 6:09 p.m. UTC | #4
On 3/29/23 12:39, Dietmar Eggemann wrote:
> On 29/03/2023 16:31, Waiman Long wrote:
>> On 3/29/23 10:25, Waiman Long wrote:
>>> On 3/29/23 08:55, Juri Lelli wrote:
>>>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
> [...]
>
>> Another nit that I have is that you may also have to record the cpu
>> where the DL bandwidth is allocated in cpuset_can_attach() and free the
>> bandwidth back to that cpu, or there can be an underflow if another cpu
>> is chosen.
> Many thanks for the review!
>
> But isn't the DL BW control `struct dl_bw` per `struct root_domain`,
> which is per exclusive cpuset? So as long as the cpu is from
> `cs->effective_cpus`, shouldn't this be fine?

Sorry for my ignorance on how the deadline bandwidth operations work. I
checked the bandwidth code and found that we are storing the bandwidth
information in the root domain, not on the cpu. That shouldn't be a
concern then.

However, I still have some questions on how that works when dealing
with cpusets. First of all, not all the CPUs in a given root domain are
in the cpuset. So there may be enough bandwidth on the root domain, but
that doesn't mean there will be enough bandwidth in the set of CPUs in
a particular cpuset. Secondly, how do you deal with isolated CPUs that
do not have a corresponding root domain? It is now possible to create a
cpuset with isolated CPUs.

Cheers,
Longman
  
Dietmar Eggemann March 30, 2023, 3:14 p.m. UTC | #5
On 29/03/2023 20:09, Waiman Long wrote:
> On 3/29/23 12:39, Dietmar Eggemann wrote:
>> On 29/03/2023 16:31, Waiman Long wrote:
>>> On 3/29/23 10:25, Waiman Long wrote:
>>>> On 3/29/23 08:55, Juri Lelli wrote:
>>>>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
>> [...]
>>
>>> Another nit that I have is that you may also have to record the cpu
>>> where the DL bandwidth is allocated in cpuset_can_attach() and free the
>>> bandwidth back to that cpu, or there can be an underflow if another cpu
>>> is chosen.
>> Many thanks for the review!
>>
>> But isn't the DL BW control `struct dl_bw` per `struct root_domain`,
>> which is per exclusive cpuset? So as long as the cpu is from
>> `cs->effective_cpus`, shouldn't this be fine?
> 
> Sorry for my ignorance on how the deadline bandwidth operations work. I
> checked the bandwidth code and found that we are storing the bandwidth
> information in the root domain, not on the cpu. That shouldn't be a
> concern then.
>
> However, I still have some questions on how that works when dealing
> with cpusets. First of all, not all the CPUs in a given root domain are
> in the cpuset. So there may be enough bandwidth on the root domain, but
> that doesn't mean there will be enough bandwidth in the set of CPUs in
> a particular cpuset. Secondly, how do you deal with isolated CPUs that
> do not have a corresponding root domain? It is now possible to create a
> cpuset with isolated CPUs.

Sorry, I overlooked this email somehow.

IMHO, this is only done for exclusive cpusets:

  cpuset_can_attach()

    if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus))

So they should have their own root_domain congruent to their cpumask.
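
For reference, the cpu argument here only selects a root_domain; a
simplified sketch of the mainline helper in kernel/sched/deadline.c
(the RCU lockdep assertion is elided):

	static inline struct dl_bw *dl_bw_of(int i)
	{
		/* @i picks a runqueue only to reach its root_domain:
		 * dl_bw_alloc() and dl_bw_free() therefore operate on
		 * the same dl_bw for any cpu of that root_domain.
		 */
		return &cpu_rq(i)->rd->dl_bw;
	}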
  
Waiman Long March 30, 2023, 4:13 p.m. UTC | #6
On 3/30/23 11:14, Dietmar Eggemann wrote:
> On 29/03/2023 20:09, Waiman Long wrote:
>> On 3/29/23 12:39, Dietmar Eggemann wrote:
>>> On 29/03/2023 16:31, Waiman Long wrote:
>>>> On 3/29/23 10:25, Waiman Long wrote:
>>>>> On 3/29/23 08:55, Juri Lelli wrote:
>>>>>> From: Dietmar Eggemann <dietmar.eggemann@arm.com>
>>> [...]
>>>
>>>> Another nit that I have is that you may also have to record the cpu
>>>> where the DL bandwidth is allocated in cpuset_can_attach() and free the
>>>> bandwidth back to that cpu, or there can be an underflow if another cpu
>>>> is chosen.
>>> Many thanks for the review!
>>>
>>> But isn't the DL BW control `struct dl_bw` per `struct root_domain`,
>>> which is per exclusive cpuset? So as long as the cpu is from
>>> `cs->effective_cpus`, shouldn't this be fine?
>> Sorry for my ignorance on how the deadline bandwidth operations work. I
>> checked the bandwidth code and found that we are storing the bandwidth
>> information in the root domain, not on the cpu. That shouldn't be a
>> concern then.
>>
>> However, I still have some questions on how that works when dealing
>> with cpusets. First of all, not all the CPUs in a given root domain are
>> in the cpuset. So there may be enough bandwidth on the root domain, but
>> that doesn't mean there will be enough bandwidth in the set of CPUs in
>> a particular cpuset. Secondly, how do you deal with isolated CPUs that
>> do not have a corresponding root domain? It is now possible to create a
>> cpuset with isolated CPUs.
> Sorry, I overlooked this email somehow.
>
> IMHO, this is only done for exclusive cpusets:
>
>    cpuset_can_attach()
>
>      if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus))
>
> So they should have their own root_domain congruent to their cpumask.

I am sorry that I missed that check.

Parallel attach is actually an existing problem in cpuset, as there is
a shared cpuset_attach_old_cs variable used by cpuset between
cpuset_can_attach() and cpuset_attach(). Any parallel attach can
therefore corrupt this common data and produce incorrect results. This
problem is not specific to this patch series, so please ignore this
issue for now; it has to be addressed separately.
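
Schematically, the hazard described above, assuming two attaches could
overlap (whether the cgroup core actually permits that is the open
question here):

	/*
	 * Hypothetical interleaving (names as in kernel/cgroup/cpuset.c):
	 *
	 *   attach A: cpuset_can_attach() -> cpuset_attach_old_cs = old_A
	 *   attach B: cpuset_can_attach() -> cpuset_attach_old_cs = old_B
	 *   attach A: cpuset_attach()     -> reads old_B: wrong old cpuset
	 */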

Cheers,
Longman
  

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6f3d84e0ed08..50cbbfefbe11 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1847,7 +1847,7 @@  current_restore_flags(unsigned long orig_flags, unsigned long flags)
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
+extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);
 #ifdef CONFIG_SMP
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index eb0854ef9757..f8ebec66da51 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -198,6 +198,8 @@  struct cpuset {
 	 * know when to rebuild associated root domain bandwidth information.
 	 */
 	int nr_deadline_tasks;
+	int nr_migrate_dl_tasks;
+	u64 sum_migrate_dl_bw;
 
 	/* Invalid partition error code, not lock protected */
 	enum prs_errcode prs_err;
@@ -2464,16 +2466,23 @@  static int fmeter_getrate(struct fmeter *fmp)
 
 static struct cpuset *cpuset_attach_old_cs;
 
+static void reset_migrate_dl_data(struct cpuset *cs)
+{
+	cs->nr_migrate_dl_tasks = 0;
+	cs->sum_migrate_dl_bw = 0;
+}
+
 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
 static int cpuset_can_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
-	struct cpuset *cs;
+	struct cpuset *cs, *oldcs;
 	struct task_struct *task;
 	int ret;
 
 	/* used later by cpuset_attach() */
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+	oldcs = cpuset_attach_old_cs;
 	cs = css_cs(css);
 
 	mutex_lock(&cpuset_mutex);
@@ -2491,7 +2500,7 @@  static int cpuset_can_attach(struct cgroup_taskset *tset)
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, css, tset) {
-		ret = task_can_attach(task, cs->effective_cpus);
+		ret = task_can_attach(task);
 		if (ret)
 			goto out_unlock;
 		ret = security_task_setscheduler(task);
@@ -2499,11 +2508,31 @@  static int cpuset_can_attach(struct cgroup_taskset *tset)
 			goto out_unlock;
 
 		if (dl_task(task)) {
-			cs->nr_deadline_tasks++;
-			cpuset_attach_old_cs->nr_deadline_tasks--;
+			cs->nr_migrate_dl_tasks++;
+			cs->sum_migrate_dl_bw += task->dl.dl_bw;
+		}
+	}
+
+	if (!cs->nr_migrate_dl_tasks)
+		goto out_succes;
+
+	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+		if (unlikely(cpu >= nr_cpu_ids)) {
+			reset_migrate_dl_data(cs);
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+
+		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+		if (ret) {
+			reset_migrate_dl_data(cs);
+			goto out_unlock;
 		}
 	}
 
+out_succes:
 	/*
 	 * Mark attach is in progress.  This makes validate_change() fail
 	 * changes which zero cpus/mems_allowed.
@@ -2518,11 +2547,21 @@  static int cpuset_can_attach(struct cgroup_taskset *tset)
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
+	struct cpuset *cs;
 
 	cgroup_taskset_first(tset, &css);
+	cs = css_cs(css);
 
 	mutex_lock(&cpuset_mutex);
-	css_cs(css)->attach_in_progress--;
+	cs->attach_in_progress--;
+
+	if (cs->nr_migrate_dl_tasks) {
+		int cpu = cpumask_any(cs->effective_cpus);
+
+		dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+		reset_migrate_dl_data(cs);
+	}
+
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2617,6 +2656,12 @@  static void cpuset_attach(struct cgroup_taskset *tset)
 out:
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
+	if (cs->nr_migrate_dl_tasks) {
+		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+		reset_migrate_dl_data(cs);
+	}
+
 	cs->attach_in_progress--;
 	if (!cs->attach_in_progress)
 		wake_up(&cpuset_attach_wq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c83dae6b8586..10454980e830 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9269,8 +9269,7 @@  int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 	return ret;
 }
 
-int task_can_attach(struct task_struct *p,
-		    const struct cpumask *cs_effective_cpus)
+int task_can_attach(struct task_struct *p)
 {
 	int ret = 0;
 
@@ -9283,21 +9282,9 @@  int task_can_attach(struct task_struct *p,
 	 * success of set_cpus_allowed_ptr() on all attached tasks
 	 * before cpus_mask may be changed.
 	 */
-	if (p->flags & PF_NO_SETAFFINITY) {
+	if (p->flags & PF_NO_SETAFFINITY)
 		ret = -EINVAL;
-		goto out;
-	}
-
-	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_effective_cpus)) {
-		int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
 
-		if (unlikely(cpu >= nr_cpu_ids))
-			return -EINVAL;
-		ret = dl_bw_alloc(cpu, p->dl.dl_bw);
-	}
-
-out:
 	return ret;
 }