[v5,01/18] cgroup/misc: Add per resource callbacks for CSS events

Message ID 20230923030657.16148-2-haitao.huang@linux.intel.com
State New
Headers
Series Add Cgroup support for SGX EPC memory |

Commit Message

Haitao Huang Sept. 23, 2023, 3:06 a.m. UTC
  From: Kristen Carlson Accardi <kristen@linux.intel.com>

The misc cgroup controller (subsystem) currently does not perform
resource type specific action for Cgroups Subsystem State (CSS) events:
the 'css_alloc' event when a cgroup is created and the 'css_free' event
when a cgroup is destroyed, or in event of user writing the max value to
the misc.max file to set the usage limit of a specific resource
[admin-guide/cgroup-v2.rst, 5-9. Misc].

Define callbacks for those events and allow resource providers to
register the callbacks per resource type as needed. This will be
utilized later by the EPC misc cgroup support implemented in the SGX
driver:
- On css_alloc, allocate and initialize necessary structures for EPC
reclaiming, e.g., LRU list, work queue, etc.
- On css_free, cleanup and free those structures created in alloc.
- On max_write, trigger EPC reclaiming if the new limit is at or below
current usage.

Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
---
V5:
- Remove prefixes from the callback names (tj)
- Update commit message (Jarkko)

V4:
- Moved this to the front of the series.
- Applies on cgroup/for-6.6 with the overflow fix for misc.

V3:
- Removed the released() callback
---
 include/linux/misc_cgroup.h |  5 +++++
 kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 3 deletions(-)
  

Comments

Jarkko Sakkinen Sept. 25, 2023, 5:09 p.m. UTC | #1
On Sat Sep 23, 2023 at 6:06 AM EEST, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@linux.intel.com>
>
> The misc cgroup controller (subsystem) currently does not perform
> resource type specific action for Cgroups Subsystem State (CSS) events:
> the 'css_alloc' event when a cgroup is created and the 'css_free' event
> when a cgroup is destroyed, or in event of user writing the max value to
> the misc.max file to set the usage limit of a specific resource
> [admin-guide/cgroup-v2.rst, 5-9. Misc].
>
> Define callbacks for those events and allow resource providers to
> register the callbacks per resource type as needed. This will be
> utilized later by the EPC misc cgroup support implemented in the SGX
> driver:
> - On css_alloc, allocate and initialize necessary structures for EPC
> reclaiming, e.g., LRU list, work queue, etc.
> - On css_free, cleanup and free those structures created in alloc.
> - On max_write, trigger EPC reclaiming if the new limit is at or below
> current usage.
>
> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
> V5:
> - Remove prefixes from the callback names (tj)
> - Update commit message (Jarkko)
>
> V4:
> - Moved this to the front of the series.
> - Applies on cgroup/for-6.6 with the overflow fix for misc.
>
> V3:
> - Removed the released() callback
> ---
>  include/linux/misc_cgroup.h |  5 +++++
>  kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
>  2 files changed, 34 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
> index e799b1f8d05b..96a88822815a 100644
> --- a/include/linux/misc_cgroup.h
> +++ b/include/linux/misc_cgroup.h
> @@ -37,6 +37,11 @@ struct misc_res {
>  	u64 max;
>  	atomic64_t usage;
>  	atomic64_t events;
> +
> +	/* per resource callback ops */
> +	int (*alloc)(struct misc_cg *cg);
> +	void (*free)(struct misc_cg *cg);
> +	void (*max_write)(struct misc_cg *cg);
>  };
>  
>  /**
> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
> index 79a3717a5803..62c9198dee21 100644
> --- a/kernel/cgroup/misc.c
> +++ b/kernel/cgroup/misc.c
> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
>  
>  	cg = css_misc(of_css(of));
>  
> -	if (READ_ONCE(misc_res_capacity[type]))
> +	if (READ_ONCE(misc_res_capacity[type])) {
>  		WRITE_ONCE(cg->res[type].max, max);
> -	else
> +		if (cg->res[type].max_write)
> +			cg->res[type].max_write(cg);
> +	} else {
>  		ret = -EINVAL;
> +	}
>  
>  	return ret ? ret : nbytes;
>  }
> @@ -383,23 +386,39 @@ static struct cftype misc_cg_files[] = {
>  static struct cgroup_subsys_state *
>  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
>  {
> +	struct misc_cg *parent_cg;
>  	enum misc_res_type i;
>  	struct misc_cg *cg;
> +	int ret;
>  
>  	if (!parent_css) {
>  		cg = &root_cg;
> +		parent_cg = &root_cg;
>  	} else {
>  		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
>  		if (!cg)
>  			return ERR_PTR(-ENOMEM);
> +		parent_cg = css_misc(parent_css);
>  	}
>  
>  	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
>  		WRITE_ONCE(cg->res[i].max, MAX_NUM);
>  		atomic64_set(&cg->res[i].usage, 0);
> +		if (parent_cg->res[i].alloc) {
> +			ret = parent_cg->res[i].alloc(cg);
> +			if (ret)
> +				goto alloc_err;
> +		}
>  	}
>  
>  	return &cg->css;
> +
> +alloc_err:
> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> +		if (parent_cg->res[i].free)
> +			cg->res[i].free(cg);
> +	kfree(cg);
> +	return ERR_PTR(ret);
>  }
>  
>  /**
> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
>   */
>  static void misc_cg_free(struct cgroup_subsys_state *css)
>  {
> -	kfree(css_misc(css));
> +	struct misc_cg *cg = css_misc(css);
> +	enum misc_res_type i;
> +
> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> +		if (cg->res[i].free)
> +			cg->res[i].free(cg);
> +
> +	kfree(cg);
>  }
>  
>  /* Cgroup controller callbacks */
> -- 
> 2.25.1

Since the only existing client feature requires all callbacks, should
this not have that as an invariant?

I.e. it might be better to fail unless *all* ops are non-nil (e.g. to
catch issues in the kernel code).

BR, Jarkko
  
Haitao Huang Sept. 26, 2023, 3:04 a.m. UTC | #2
Hi Jarkko

On Mon, 25 Sep 2023 12:09:21 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
wrote:

> On Sat Sep 23, 2023 at 6:06 AM EEST, Haitao Huang wrote:
>> From: Kristen Carlson Accardi <kristen@linux.intel.com>
>>
>> The misc cgroup controller (subsystem) currently does not perform
>> resource type specific action for Cgroups Subsystem State (CSS) events:
>> the 'css_alloc' event when a cgroup is created and the 'css_free' event
>> when a cgroup is destroyed, or in event of user writing the max value to
>> the misc.max file to set the usage limit of a specific resource
>> [admin-guide/cgroup-v2.rst, 5-9. Misc].
>>
>> Define callbacks for those events and allow resource providers to
>> register the callbacks per resource type as needed. This will be
>> utilized later by the EPC misc cgroup support implemented in the SGX
>> driver:
>> - On css_alloc, allocate and initialize necessary structures for EPC
>> reclaiming, e.g., LRU list, work queue, etc.
>> - On css_free, cleanup and free those structures created in alloc.
>> - On max_write, trigger EPC reclaiming if the new limit is at or below
>> current usage.
>>
>> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
>> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
>> ---
>> V5:
>> - Remove prefixes from the callback names (tj)
>> - Update commit message (Jarkko)
>>
>> V4:
>> - Moved this to the front of the series.
>> - Applies on cgroup/for-6.6 with the overflow fix for misc.
>>
>> V3:
>> - Removed the released() callback
>> ---
>>  include/linux/misc_cgroup.h |  5 +++++
>>  kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
>>  2 files changed, 34 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
>> index e799b1f8d05b..96a88822815a 100644
>> --- a/include/linux/misc_cgroup.h
>> +++ b/include/linux/misc_cgroup.h
>> @@ -37,6 +37,11 @@ struct misc_res {
>>  	u64 max;
>>  	atomic64_t usage;
>>  	atomic64_t events;
>> +
>> +	/* per resource callback ops */
>> +	int (*alloc)(struct misc_cg *cg);
>> +	void (*free)(struct misc_cg *cg);
>> +	void (*max_write)(struct misc_cg *cg);
>>  };
>>
>>  /**
>> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
>> index 79a3717a5803..62c9198dee21 100644
>> --- a/kernel/cgroup/misc.c
>> +++ b/kernel/cgroup/misc.c
>> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct  
>> kernfs_open_file *of, char *buf,
>>
>>  	cg = css_misc(of_css(of));
>>
>> -	if (READ_ONCE(misc_res_capacity[type]))
>> +	if (READ_ONCE(misc_res_capacity[type])) {
>>  		WRITE_ONCE(cg->res[type].max, max);
>> -	else
>> +		if (cg->res[type].max_write)
>> +			cg->res[type].max_write(cg);
>> +	} else {
>>  		ret = -EINVAL;
>> +	}
>>
>>  	return ret ? ret : nbytes;
>>  }
>> @@ -383,23 +386,39 @@ static struct cftype misc_cg_files[] = {
>>  static struct cgroup_subsys_state *
>>  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
>>  {
>> +	struct misc_cg *parent_cg;
>>  	enum misc_res_type i;
>>  	struct misc_cg *cg;
>> +	int ret;
>>
>>  	if (!parent_css) {
>>  		cg = &root_cg;
>> +		parent_cg = &root_cg;
>>  	} else {
>>  		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
>>  		if (!cg)
>>  			return ERR_PTR(-ENOMEM);
>> +		parent_cg = css_misc(parent_css);
>>  	}
>>
>>  	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
>>  		WRITE_ONCE(cg->res[i].max, MAX_NUM);
>>  		atomic64_set(&cg->res[i].usage, 0);
>> +		if (parent_cg->res[i].alloc) {
>> +			ret = parent_cg->res[i].alloc(cg);
>> +			if (ret)
>> +				goto alloc_err;
>> +		}
>>  	}
>>
>>  	return &cg->css;
>> +
>> +alloc_err:
>> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
>> +		if (parent_cg->res[i].free)
>> +			cg->res[i].free(cg);
>> +	kfree(cg);
>> +	return ERR_PTR(ret);
>>  }
>>
>>  /**
>> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state  
>> *parent_css)
>>   */
>>  static void misc_cg_free(struct cgroup_subsys_state *css)
>>  {
>> -	kfree(css_misc(css));
>> +	struct misc_cg *cg = css_misc(css);
>> +	enum misc_res_type i;
>> +
>> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
>> +		if (cg->res[i].free)
>> +			cg->res[i].free(cg);
>> +
>> +	kfree(cg);
>>  }
>>
>>  /* Cgroup controller callbacks */
>> --
>> 2.25.1
>
> Since the only existing client feature requires all callbacks, should
> this not have that as an invariant?
>
> I.e. it might be better to fail unless *all* ops are non-nil (e.g. to
> catch issues in the kernel code).
>

These callbacks are chained from cgroup_subsys, and they are defined  
separately so it'd be better follow the same pattern.  Or change together  
with cgroup_subsys if we want to do that. Reasonable?

Thanks
Haitao
  
Jarkko Sakkinen Sept. 26, 2023, 1:10 p.m. UTC | #3
On Tue Sep 26, 2023 at 6:04 AM EEST, Haitao Huang wrote:
> Hi Jarkko
>
> On Mon, 25 Sep 2023 12:09:21 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
> wrote:
>
> > On Sat Sep 23, 2023 at 6:06 AM EEST, Haitao Huang wrote:
> >> From: Kristen Carlson Accardi <kristen@linux.intel.com>
> >>
> >> The misc cgroup controller (subsystem) currently does not perform
> >> resource type specific action for Cgroups Subsystem State (CSS) events:
> >> the 'css_alloc' event when a cgroup is created and the 'css_free' event
> >> when a cgroup is destroyed, or in event of user writing the max value to
> >> the misc.max file to set the usage limit of a specific resource
> >> [admin-guide/cgroup-v2.rst, 5-9. Misc].
> >>
> >> Define callbacks for those events and allow resource providers to
> >> register the callbacks per resource type as needed. This will be
> >> utilized later by the EPC misc cgroup support implemented in the SGX
> >> driver:
> >> - On css_alloc, allocate and initialize necessary structures for EPC
> >> reclaiming, e.g., LRU list, work queue, etc.
> >> - On css_free, cleanup and free those structures created in alloc.
> >> - On max_write, trigger EPC reclaiming if the new limit is at or below
> >> current usage.
> >>
> >> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> >> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> >> ---
> >> V5:
> >> - Remove prefixes from the callback names (tj)
> >> - Update commit message (Jarkko)
> >>
> >> V4:
> >> - Moved this to the front of the series.
> >> - Applies on cgroup/for-6.6 with the overflow fix for misc.
> >>
> >> V3:
> >> - Removed the released() callback
> >> ---
> >>  include/linux/misc_cgroup.h |  5 +++++
> >>  kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
> >>  2 files changed, 34 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
> >> index e799b1f8d05b..96a88822815a 100644
> >> --- a/include/linux/misc_cgroup.h
> >> +++ b/include/linux/misc_cgroup.h
> >> @@ -37,6 +37,11 @@ struct misc_res {
> >>  	u64 max;
> >>  	atomic64_t usage;
> >>  	atomic64_t events;
> >> +
> >> +	/* per resource callback ops */
> >> +	int (*alloc)(struct misc_cg *cg);
> >> +	void (*free)(struct misc_cg *cg);
> >> +	void (*max_write)(struct misc_cg *cg);
> >>  };
> >>
> >>  /**
> >> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
> >> index 79a3717a5803..62c9198dee21 100644
> >> --- a/kernel/cgroup/misc.c
> >> +++ b/kernel/cgroup/misc.c
> >> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct  
> >> kernfs_open_file *of, char *buf,
> >>
> >>  	cg = css_misc(of_css(of));
> >>
> >> -	if (READ_ONCE(misc_res_capacity[type]))
> >> +	if (READ_ONCE(misc_res_capacity[type])) {
> >>  		WRITE_ONCE(cg->res[type].max, max);
> >> -	else
> >> +		if (cg->res[type].max_write)
> >> +			cg->res[type].max_write(cg);
> >> +	} else {
> >>  		ret = -EINVAL;
> >> +	}
> >>
> >>  	return ret ? ret : nbytes;
> >>  }
> >> @@ -383,23 +386,39 @@ static struct cftype misc_cg_files[] = {
> >>  static struct cgroup_subsys_state *
> >>  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
> >>  {
> >> +	struct misc_cg *parent_cg;
> >>  	enum misc_res_type i;
> >>  	struct misc_cg *cg;
> >> +	int ret;
> >>
> >>  	if (!parent_css) {
> >>  		cg = &root_cg;
> >> +		parent_cg = &root_cg;
> >>  	} else {
> >>  		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
> >>  		if (!cg)
> >>  			return ERR_PTR(-ENOMEM);
> >> +		parent_cg = css_misc(parent_css);
> >>  	}
> >>
> >>  	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
> >>  		WRITE_ONCE(cg->res[i].max, MAX_NUM);
> >>  		atomic64_set(&cg->res[i].usage, 0);
> >> +		if (parent_cg->res[i].alloc) {
> >> +			ret = parent_cg->res[i].alloc(cg);
> >> +			if (ret)
> >> +				goto alloc_err;
> >> +		}
> >>  	}
> >>
> >>  	return &cg->css;
> >> +
> >> +alloc_err:
> >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> >> +		if (parent_cg->res[i].free)
> >> +			cg->res[i].free(cg);
> >> +	kfree(cg);
> >> +	return ERR_PTR(ret);
> >>  }
> >>
> >>  /**
> >> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state  
> >> *parent_css)
> >>   */
> >>  static void misc_cg_free(struct cgroup_subsys_state *css)
> >>  {
> >> -	kfree(css_misc(css));
> >> +	struct misc_cg *cg = css_misc(css);
> >> +	enum misc_res_type i;
> >> +
> >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> >> +		if (cg->res[i].free)
> >> +			cg->res[i].free(cg);
> >> +
> >> +	kfree(cg);
> >>  }
> >>
> >>  /* Cgroup controller callbacks */
> >> --
> >> 2.25.1
> >
> > Since the only existing client feature requires all callbacks, should
> > this not have that as an invariant?
> >
> > I.e. it might be better to fail unless *all* ops are non-nil (e.g. to
> > catch issues in the kernel code).
> >
>
> These callbacks are chained from cgroup_subsys, and they are defined  
> separately so it'd be better follow the same pattern.  Or change together  
> with cgroup_subsys if we want to do that. Reasonable?

I noticed this one later:

It would better to create a separate ops struct and declare the instance
as const at minimum.

Then there is no need for dynamic assigment of ops and all that is in
rodata. This is improves both security and also allows static analysis
bit better.

Now you have to dynamically trace the struct instance, e.g. in case of
a bug. If this one done, it would be already in the vmlinux.

BR, Jarkko
  
Jarkko Sakkinen Sept. 26, 2023, 1:13 p.m. UTC | #4
On Tue Sep 26, 2023 at 4:10 PM EEST, Jarkko Sakkinen wrote:
> On Tue Sep 26, 2023 at 6:04 AM EEST, Haitao Huang wrote:
> > Hi Jarkko
> >
> > On Mon, 25 Sep 2023 12:09:21 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
> > wrote:
> >
> > > On Sat Sep 23, 2023 at 6:06 AM EEST, Haitao Huang wrote:
> > >> From: Kristen Carlson Accardi <kristen@linux.intel.com>
> > >>
> > >> The misc cgroup controller (subsystem) currently does not perform
> > >> resource type specific action for Cgroups Subsystem State (CSS) events:
> > >> the 'css_alloc' event when a cgroup is created and the 'css_free' event
> > >> when a cgroup is destroyed, or in event of user writing the max value to
> > >> the misc.max file to set the usage limit of a specific resource
> > >> [admin-guide/cgroup-v2.rst, 5-9. Misc].
> > >>
> > >> Define callbacks for those events and allow resource providers to
> > >> register the callbacks per resource type as needed. This will be
> > >> utilized later by the EPC misc cgroup support implemented in the SGX
> > >> driver:
> > >> - On css_alloc, allocate and initialize necessary structures for EPC
> > >> reclaiming, e.g., LRU list, work queue, etc.
> > >> - On css_free, cleanup and free those structures created in alloc.
> > >> - On max_write, trigger EPC reclaiming if the new limit is at or below
> > >> current usage.
> > >>
> > >> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> > >> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> > >> ---
> > >> V5:
> > >> - Remove prefixes from the callback names (tj)
> > >> - Update commit message (Jarkko)
> > >>
> > >> V4:
> > >> - Moved this to the front of the series.
> > >> - Applies on cgroup/for-6.6 with the overflow fix for misc.
> > >>
> > >> V3:
> > >> - Removed the released() callback
> > >> ---
> > >>  include/linux/misc_cgroup.h |  5 +++++
> > >>  kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
> > >>  2 files changed, 34 insertions(+), 3 deletions(-)
> > >>
> > >> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
> > >> index e799b1f8d05b..96a88822815a 100644
> > >> --- a/include/linux/misc_cgroup.h
> > >> +++ b/include/linux/misc_cgroup.h
> > >> @@ -37,6 +37,11 @@ struct misc_res {
> > >>  	u64 max;
> > >>  	atomic64_t usage;
> > >>  	atomic64_t events;
> > >> +
> > >> +	/* per resource callback ops */
> > >> +	int (*alloc)(struct misc_cg *cg);
> > >> +	void (*free)(struct misc_cg *cg);
> > >> +	void (*max_write)(struct misc_cg *cg);
> > >>  };
> > >>
> > >>  /**
> > >> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
> > >> index 79a3717a5803..62c9198dee21 100644
> > >> --- a/kernel/cgroup/misc.c
> > >> +++ b/kernel/cgroup/misc.c
> > >> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct  
> > >> kernfs_open_file *of, char *buf,
> > >>
> > >>  	cg = css_misc(of_css(of));
> > >>
> > >> -	if (READ_ONCE(misc_res_capacity[type]))
> > >> +	if (READ_ONCE(misc_res_capacity[type])) {
> > >>  		WRITE_ONCE(cg->res[type].max, max);
> > >> -	else
> > >> +		if (cg->res[type].max_write)
> > >> +			cg->res[type].max_write(cg);
> > >> +	} else {
> > >>  		ret = -EINVAL;
> > >> +	}
> > >>
> > >>  	return ret ? ret : nbytes;
> > >>  }
> > >> @@ -383,23 +386,39 @@ static struct cftype misc_cg_files[] = {
> > >>  static struct cgroup_subsys_state *
> > >>  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
> > >>  {
> > >> +	struct misc_cg *parent_cg;
> > >>  	enum misc_res_type i;
> > >>  	struct misc_cg *cg;
> > >> +	int ret;
> > >>
> > >>  	if (!parent_css) {
> > >>  		cg = &root_cg;
> > >> +		parent_cg = &root_cg;
> > >>  	} else {
> > >>  		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
> > >>  		if (!cg)
> > >>  			return ERR_PTR(-ENOMEM);
> > >> +		parent_cg = css_misc(parent_css);
> > >>  	}
> > >>
> > >>  	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
> > >>  		WRITE_ONCE(cg->res[i].max, MAX_NUM);
> > >>  		atomic64_set(&cg->res[i].usage, 0);
> > >> +		if (parent_cg->res[i].alloc) {
> > >> +			ret = parent_cg->res[i].alloc(cg);
> > >> +			if (ret)
> > >> +				goto alloc_err;
> > >> +		}
> > >>  	}
> > >>
> > >>  	return &cg->css;
> > >> +
> > >> +alloc_err:
> > >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> > >> +		if (parent_cg->res[i].free)
> > >> +			cg->res[i].free(cg);
> > >> +	kfree(cg);
> > >> +	return ERR_PTR(ret);
> > >>  }
> > >>
> > >>  /**
> > >> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state  
> > >> *parent_css)
> > >>   */
> > >>  static void misc_cg_free(struct cgroup_subsys_state *css)
> > >>  {
> > >> -	kfree(css_misc(css));
> > >> +	struct misc_cg *cg = css_misc(css);
> > >> +	enum misc_res_type i;
> > >> +
> > >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> > >> +		if (cg->res[i].free)
> > >> +			cg->res[i].free(cg);
> > >> +
> > >> +	kfree(cg);
> > >>  }
> > >>
> > >>  /* Cgroup controller callbacks */
> > >> --
> > >> 2.25.1
> > >
> > > Since the only existing client feature requires all callbacks, should
> > > this not have that as an invariant?
> > >
> > > I.e. it might be better to fail unless *all* ops are non-nil (e.g. to
> > > catch issues in the kernel code).
> > >
> >
> > These callbacks are chained from cgroup_subsys, and they are defined  
> > separately so it'd be better follow the same pattern.  Or change together  
> > with cgroup_subsys if we want to do that. Reasonable?
>
> I noticed this one later:
>
> It would better to create a separate ops struct and declare the instance
> as const at minimum.
>
> Then there is no need for dynamic assigment of ops and all that is in
> rodata. This is improves both security and also allows static analysis
> bit better.
>
> Now you have to dynamically trace the struct instance, e.g. in case of
> a bug. If this one done, it would be already in the vmlinux.

I.e. then in the driver you can have static const struct declaration
with *all* pointers pre-assigned.

Not sure if cgroups follows this or not but it is *objectively*
better. Previous work is not always best possible work...

BR, Jarkko
  
Haitao Huang Sept. 27, 2023, 1:56 a.m. UTC | #5
On Tue, 26 Sep 2023 08:13:18 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
wrote:

...
>> > >>  /**
>> > >> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state
>> > >> *parent_css)
>> > >>   */
>> > >>  static void misc_cg_free(struct cgroup_subsys_state *css)
>> > >>  {
>> > >> -	kfree(css_misc(css));
>> > >> +	struct misc_cg *cg = css_misc(css);
>> > >> +	enum misc_res_type i;
>> > >> +
>> > >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
>> > >> +		if (cg->res[i].free)
>> > >> +			cg->res[i].free(cg);
>> > >> +
>> > >> +	kfree(cg);
>> > >>  }
>> > >>
>> > >>  /* Cgroup controller callbacks */
>> > >> --
>> > >> 2.25.1
>> > >
>> > > Since the only existing client feature requires all callbacks,  
>> should
>> > > this not have that as an invariant?
>> > >
>> > > I.e. it might be better to fail unless *all* ops are non-nil (e.g.  
>> to
>> > > catch issues in the kernel code).
>> > >
>> >
>> > These callbacks are chained from cgroup_subsys, and they are defined
>> > separately so it'd be better follow the same pattern.  Or change  
>> together
>> > with cgroup_subsys if we want to do that. Reasonable?
>>
>> I noticed this one later:
>>
>> It would better to create a separate ops struct and declare the instance
>> as const at minimum.
>>
>> Then there is no need for dynamic assigment of ops and all that is in
>> rodata. This is improves both security and also allows static analysis
>> bit better.
>>
>> Now you have to dynamically trace the struct instance, e.g. in case of
>> a bug. If this one done, it would be already in the vmlinux.
>I.e. then in the driver you can have static const struct declaration
> with *all* pointers pre-assigned.
>
> Not sure if cgroups follows this or not but it is *objectively*
> better. Previous work is not always best possible work...
>

IIUC, like vm_ops field in vma structs. Although function pointers in  
vm_ops are assigned statically, but you still need dynamically assign  
vm_ops for each instance of vma.

So the code will look like this:

if (parent_cg->res[i].misc_ops && parent_cg->res[i].misc_ops->alloc)
{
...
}

I don't see this is the pattern used in cgroups and no strong opinion  
either way.

TJ, do you have preference on this?

Thanks
Haitao
  
Kai Huang Sept. 27, 2023, 9:20 a.m. UTC | #6
On Fri, 2023-09-22 at 20:06 -0700, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@linux.intel.com>
> 
> The misc cgroup controller (subsystem) currently does not perform
> resource type specific action for Cgroups Subsystem State (CSS) events:
> the 'css_alloc' event when a cgroup is created and the 'css_free' event
> when a cgroup is destroyed, or in event of user writing the max value to
> the misc.max file to set the usage limit of a specific resource
> [admin-guide/cgroup-v2.rst, 5-9. Misc].
> 
> Define callbacks for those events and allow resource providers to
> register the callbacks per resource type as needed. This will be
> utilized later by the EPC misc cgroup support implemented in the SGX
> driver:
> - On css_alloc, allocate and initialize necessary structures for EPC
> reclaiming, e.g., LRU list, work queue, etc.
> - On css_free, cleanup and free those structures created in alloc.
> - On max_write, trigger EPC reclaiming if the new limit is at or below
> current usage.

Nit:

Wondering why we should trigger EPC reclaiming if the new limit is *at* current
usage?

I actually don't quite care about why here, but writing these details in the
changelog may bring unnecessary confusion.  I guess you can just remove all the
details about what SGX driver needs to do on these callbacks.

> 
> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
> V5:
> - Remove prefixes from the callback names (tj)
> - Update commit message (Jarkko)
> 
> V4:
> - Moved this to the front of the series.
> - Applies on cgroup/for-6.6 with the overflow fix for misc.
> 
> V3:
> - Removed the released() callback
> ---
>  include/linux/misc_cgroup.h |  5 +++++
>  kernel/cgroup/misc.c        | 32 +++++++++++++++++++++++++++++---
>  2 files changed, 34 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
> index e799b1f8d05b..96a88822815a 100644
> --- a/include/linux/misc_cgroup.h
> +++ b/include/linux/misc_cgroup.h
> @@ -37,6 +37,11 @@ struct misc_res {
>  	u64 max;
>  	atomic64_t usage;
>  	atomic64_t events;
> +
> +	/* per resource callback ops */

Nit:

This comment isn't quite useful IMHO.  And it seems you should just expand the
existing comment for the 'struct misc_res', which already covers the existing
members.

Or as Jarkko suggested, maybe you can introduce another structure 'misc_res_ops'
and comment more details for all these callbacks just like 'struct misc_res'.

Anyway it's cgroup maintainer's call.

> +	int (*alloc)(struct misc_cg *cg);
> +	void (*free)(struct misc_cg *cg);
> +	void (*max_write)(struct misc_cg *cg);
>  };
>  
>  /**
> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
> index 79a3717a5803..62c9198dee21 100644
> --- a/kernel/cgroup/misc.c
> +++ b/kernel/cgroup/misc.c
> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
>  
>  	cg = css_misc(of_css(of));
>  
> -	if (READ_ONCE(misc_res_capacity[type]))
> +	if (READ_ONCE(misc_res_capacity[type])) {
>  		WRITE_ONCE(cg->res[type].max, max);
> -	else
> +		if (cg->res[type].max_write)
> +			cg->res[type].max_write(cg);
> +	} else {
>  		ret = -EINVAL;
> +	}
>  
>  	return ret ? ret : nbytes;
>  }
> @@ -383,23 +386,39 @@ static struct cftype misc_cg_files[] = {
>  static struct cgroup_subsys_state *
>  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
>  {
> +	struct misc_cg *parent_cg;

Nit: 

The below variable '*cg' can be moved here together with 'parent_cg'.
 
>  	enum misc_res_type i;
>  	struct misc_cg *cg;
> +	int ret;
>  
>  	if (!parent_css) {
>  		cg = &root_cg;
> +		parent_cg = &root_cg;

Nit:
		parent_cg = cg = &root_cg;
?

>  	} else {
>  		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
>  		if (!cg)
>  			return ERR_PTR(-ENOMEM);
> +		parent_cg = css_misc(parent_css);
>  	}
>  
>  	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
>  		WRITE_ONCE(cg->res[i].max, MAX_NUM);
>  		atomic64_set(&cg->res[i].usage, 0);
> +		if (parent_cg->res[i].alloc) {
> +			ret = parent_cg->res[i].alloc(cg);
> +			if (ret)
> +				goto alloc_err;
> +		}
>  	}
>  
>  	return &cg->css;
> +
> +alloc_err:
> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> +		if (parent_cg->res[i].free)
> +			cg->res[i].free(cg);
> +	kfree(cg);
> +	return ERR_PTR(ret);
>  }
>  
>  /**
> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state *parent_css)
>   */
>  static void misc_cg_free(struct cgroup_subsys_state *css)
>  {
> -	kfree(css_misc(css));
> +	struct misc_cg *cg = css_misc(css);
> +	enum misc_res_type i;
> +
> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> +		if (cg->res[i].free)
> +			cg->res[i].free(cg);
> +
> +	kfree(cg);
>  }
>  
>  /* Cgroup controller callbacks */
  
Jarkko Sakkinen Oct. 2, 2023, 10:47 p.m. UTC | #7
On Wed Sep 27, 2023 at 4:56 AM EEST, Haitao Huang wrote:
> On Tue, 26 Sep 2023 08:13:18 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
> wrote:
>
> ...
> >> > >>  /**
> >> > >> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state
> >> > >> *parent_css)
> >> > >>   */
> >> > >>  static void misc_cg_free(struct cgroup_subsys_state *css)
> >> > >>  {
> >> > >> -	kfree(css_misc(css));
> >> > >> +	struct misc_cg *cg = css_misc(css);
> >> > >> +	enum misc_res_type i;
> >> > >> +
> >> > >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> >> > >> +		if (cg->res[i].free)
> >> > >> +			cg->res[i].free(cg);
> >> > >> +
> >> > >> +	kfree(cg);
> >> > >>  }
> >> > >>
> >> > >>  /* Cgroup controller callbacks */
> >> > >> --
> >> > >> 2.25.1
> >> > >
> >> > > Since the only existing client feature requires all callbacks,  
> >> should
> >> > > this not have that as an invariant?
> >> > >
> >> > > I.e. it might be better to fail unless *all* ops are non-nil (e.g.  
> >> to
> >> > > catch issues in the kernel code).
> >> > >
> >> >
> >> > These callbacks are chained from cgroup_subsys, and they are defined
> >> > separately so it'd be better follow the same pattern.  Or change  
> >> together
> >> > with cgroup_subsys if we want to do that. Reasonable?
> >>
> >> I noticed this one later:
> >>
> >> It would better to create a separate ops struct and declare the instance
> >> as const at minimum.
> >>
> >> Then there is no need for dynamic assigment of ops and all that is in
> >> rodata. This is improves both security and also allows static analysis
> >> bit better.
> >>
> >> Now you have to dynamically trace the struct instance, e.g. in case of
> >> a bug. If this one done, it would be already in the vmlinux.
> >I.e. then in the driver you can have static const struct declaration
> > with *all* pointers pre-assigned.
> >
> > Not sure if cgroups follows this or not but it is *objectively*
> > better. Previous work is not always best possible work...
> >
>
> IIUC, like vm_ops field in vma structs. Although function pointers in  
> vm_ops are assigned statically, but you still need dynamically assign  
> vm_ops for each instance of vma.
>
> So the code will look like this:
>
> if (parent_cg->res[i].misc_ops && parent_cg->res[i].misc_ops->alloc)
> {
> ...
> }
>
> I don't see this is the pattern used in cgroups and no strong opinion  
> either way.
>
> TJ, do you have preference on this?

I do have strong opinion on this. In the client side we want as much
things declared statically as we can because it gives more tools for
statical analysis.

I don't want to see dynamic assignments in the SGX driver, when they
are not actually needed, no matter things are done in cgroups.

> Thanks
> Haitao

BR, Jarkko
  
Jarkko Sakkinen Oct. 2, 2023, 10:55 p.m. UTC | #8
On Tue Oct 3, 2023 at 1:47 AM EEST, Jarkko Sakkinen wrote:
> On Wed Sep 27, 2023 at 4:56 AM EEST, Haitao Huang wrote:
> > On Tue, 26 Sep 2023 08:13:18 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
> > wrote:
> >
> > ...
> > >> > >>  /**
> > >> > >> @@ -410,7 +429,14 @@ misc_cg_alloc(struct cgroup_subsys_state
> > >> > >> *parent_css)
> > >> > >>   */
> > >> > >>  static void misc_cg_free(struct cgroup_subsys_state *css)
> > >> > >>  {
> > >> > >> -	kfree(css_misc(css));
> > >> > >> +	struct misc_cg *cg = css_misc(css);
> > >> > >> +	enum misc_res_type i;
> > >> > >> +
> > >> > >> +	for (i = 0; i < MISC_CG_RES_TYPES; i++)
> > >> > >> +		if (cg->res[i].free)
> > >> > >> +			cg->res[i].free(cg);
> > >> > >> +
> > >> > >> +	kfree(cg);
> > >> > >>  }
> > >> > >>
> > >> > >>  /* Cgroup controller callbacks */
> > >> > >> --
> > >> > >> 2.25.1
> > >> > >
> > >> > > Since the only existing client feature requires all callbacks,  
> > >> should
> > >> > > this not have that as an invariant?
> > >> > >
> > >> > > I.e. it might be better to fail unless *all* ops are non-nil (e.g.  
> > >> to
> > >> > > catch issues in the kernel code).
> > >> > >
> > >> >
> > >> > These callbacks are chained from cgroup_subsys, and they are defined
> > >> > separately so it'd be better follow the same pattern.  Or change  
> > >> together
> > >> > with cgroup_subsys if we want to do that. Reasonable?
> > >>
> > >> I noticed this one later:
> > >>
> > >> It would better to create a separate ops struct and declare the instance
> > >> as const at minimum.
> > >>
> > >> Then there is no need for dynamic assigment of ops and all that is in
> > >> rodata. This is improves both security and also allows static analysis
> > >> bit better.
> > >>
> > >> Now you have to dynamically trace the struct instance, e.g. in case of
> > >> a bug. If this one done, it would be already in the vmlinux.
> > >I.e. then in the driver you can have static const struct declaration
> > > with *all* pointers pre-assigned.
> > >
> > > Not sure if cgroups follows this or not but it is *objectively*
> > > better. Previous work is not always best possible work...
> > >
> >
> > IIUC, like vm_ops field in vma structs. Although function pointers in  
> > vm_ops are assigned statically, but you still need dynamically assign  
> > vm_ops for each instance of vma.
> >
> > So the code will look like this:
> >
> > if (parent_cg->res[i].misc_ops && parent_cg->res[i].misc_ops->alloc)
> > {
> > ...
> > }
> >
> > I don't see this is the pattern used in cgroups and no strong opinion  
> > either way.
> >
> > TJ, do you have preference on this?
>
> I do have strong opinion on this. In the client side we want as much
> things declared statically as we can because it gives more tools for
> statical analysis.
>
> I don't want to see dynamic assignments in the SGX driver, when they
> are not actually needed, no matter things are done in cgroups.

I.e. I don't really even care what crazy things cgroups subsystem
might do or not do. It's not my problem.

All I care is that we *do not* have any use for assigning those
pointers at run-time. So do whatever you want with cgroups side
as long as this is not the case.

BR, Jarkko
  
Haitao Huang Oct. 3, 2023, 2:29 p.m. UTC | #9
On Wed, 27 Sep 2023 04:20:55 -0500, Huang, Kai <kai.huang@intel.com> wrote:

> On Fri, 2023-09-22 at 20:06 -0700, Haitao Huang wrote:
>> From: Kristen Carlson Accardi <kristen@linux.intel.com>
>>
>> The misc cgroup controller (subsystem) currently does not perform
>> resource type specific action for Cgroups Subsystem State (CSS) events:
>> the 'css_alloc' event when a cgroup is created and the 'css_free' event
>> when a cgroup is destroyed, or in event of user writing the max value to
>> the misc.max file to set the usage limit of a specific resource
>> [admin-guide/cgroup-v2.rst, 5-9. Misc].
>>
>> Define callbacks for those events and allow resource providers to
>> register the callbacks per resource type as needed. This will be
>> utilized later by the EPC misc cgroup support implemented in the SGX
>> driver:
>> - On css_alloc, allocate and initialize necessary structures for EPC
>> reclaiming, e.g., LRU list, work queue, etc.
>> - On css_free, cleanup and free those structures created in alloc.
>> - On max_write, trigger EPC reclaiming if the new limit is at or below
>> current usage.
>
> Nit:
>
> Wondering why we should trigger EPC reclaiming if the new limit is *at*  
> current
> usage?
>
> I actually don't quite care about why here, but writing these details in  
> the
> changelog may bring unnecessary confusion.  I guess you can just remove  
> all the
> details about what SGX driver needs to do on these callbacks.
>
>
Okay, I'll remove the three bullets on the SGX drive implementation  
details.

Thanks
Haitao
  
Haitao Huang Oct. 4, 2023, 3:45 p.m. UTC | #10
Hi Jarkko

On Mon, 02 Oct 2023 17:55:14 -0500, Jarkko Sakkinen <jarkko@kernel.org>  
wrote:
...
>> > >> I noticed this one later:
>> > >>
>> > >> It would better to create a separate ops struct and declare the  
>> instance
>> > >> as const at minimum.
>> > >>
>> > >> Then there is no need for dynamic assigment of ops and all that is  
>> in
>> > >> rodata. This is improves both security and also allows static  
>> analysis
>> > >> bit better.
>> > >>
>> > >> Now you have to dynamically trace the struct instance, e.g. in  
>> case of
>> > >> a bug. If this one done, it would be already in the vmlinux.
>> > >I.e. then in the driver you can have static const struct declaration
>> > > with *all* pointers pre-assigned.
>> > >
>> > > Not sure if cgroups follows this or not but it is *objectively*
>> > > better. Previous work is not always best possible work...
>> > >
>> >
>> > IIUC, like vm_ops field in vma structs. Although function pointers in
>> > vm_ops are assigned statically, but you still need dynamically assign
>> > vm_ops for each instance of vma.
>> >
>> > So the code will look like this:
>> >
>> > if (parent_cg->res[i].misc_ops && parent_cg->res[i].misc_ops->alloc)
>> > {
>> > ...
>> > }
>> >
>> > I don't see this is the pattern used in cgroups and no strong opinion
>> > either way.
>> >
>> > TJ, do you have preference on this?
>>
>> I do have strong opinion on this. In the client side we want as much
>> things declared statically as we can because it gives more tools for
>> statical analysis.
>>
>> I don't want to see dynamic assignments in the SGX driver, when they
>> are not actually needed, no matter things are done in cgroups.
>
> I.e. I don't really even care what crazy things cgroups subsystem
> might do or not do. It's not my problem.
>
> All I care is that we *do not* have any use for assigning those
> pointers at run-time. So do whatever you want with cgroups side
> as long as this is not the case.
>


So I will update to something like following. Let me know if that's  
correct understanding.
@tj, I'd appreciate for your input on whether this is acceptable from  
cgroups side.

--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -31,22 +31,26 @@ struct misc_cg;

  #include <linux/cgroup.h>

+/* per resource callback ops */
+struct misc_operations_struct {
+       int (*alloc)(struct misc_cg *cg);
+       void (*free)(struct misc_cg *cg);
+       void (*max_write)(struct misc_cg *cg);
+};
  /**
   * struct misc_res: Per cgroup per misc type resource
   * @max: Maximum limit on the resource.
   * @usage: Current usage of the resource.
   * @events: Number of times, the resource limit exceeded.
+ * @priv: resource specific data.
+ * @misc_ops: resource specific operations.
   */
  struct misc_res {
         u64 max;
         atomic64_t usage;
         atomic64_t events;
         void *priv;
-
-       /* per resource callback ops */
-       int (*alloc)(struct misc_cg *cg);
-       void (*free)(struct misc_cg *cg);
-       void (*max_write)(struct misc_cg *cg);
+       const struct misc_operations_struct *misc_ops;
  };

...
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 4633b8629e63..500415087643 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -277,8 +277,8 @@ static ssize_t misc_cg_max_write(struct  
kernfs_open_file *of, char *buf,

         if (READ_ONCE(misc_res_capacity[type])) {
                 WRITE_ONCE(cg->res[type].max, max);
-               if (cg->res[type].max_write)
-                       cg->res[type].max_write(cg);
+               if (cg->res[type].misc_ops &&  
cg->res[type].misc_ops->max_write)
+                       cg->res[type].misc_ops->max_write(cg);

[skip other similar changes in misc.c]

And on SGX side, it'll be updated like this:

--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -376,6 +376,14 @@ static void sgx_epc_cgroup_max_write(struct misc_cg  
*cg)
         queue_work(sgx_epc_cg_wq, &rc.epc_cg->reclaim_work);
  }

+static int sgx_epc_cgroup_alloc(struct misc_cg *cg);
+
+const struct misc_operations_struct sgx_epc_cgroup_ops = {
+        .alloc = sgx_epc_cgroup_alloc,
+        .free = sgx_epc_cgroup_free,
+        .max_write = sgx_epc_cgroup_max_write,
+};
+
  static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
  {
         struct sgx_epc_cgroup *epc_cg;
@@ -386,12 +394,7 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)

         sgx_lru_init(&epc_cg->lru);
         INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
-       cg->res[MISC_CG_RES_SGX_EPC].alloc = sgx_epc_cgroup_alloc;
-       cg->res[MISC_CG_RES_SGX_EPC].free = sgx_epc_cgroup_free;
-       cg->res[MISC_CG_RES_SGX_EPC].max_write = sgx_epc_cgroup_max_write;
-       cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
-       epc_cg->cg = cg;
-
+       cg->res[MISC_CG_RES_SGX_EPC].misc_ops = &sgx_epc_cgroup_ops;
         return 0;
  }


Thanks again to all of you for feedback.

Haitao
  
Tejun Heo Oct. 4, 2023, 5:18 p.m. UTC | #11
Hello,

On Wed, Oct 04, 2023 at 10:45:08AM -0500, Haitao Huang wrote:
> So I will update to something like following. Let me know if that's correct
> understanding.
> @tj, I'd appreciate for your input on whether this is acceptable from
> cgroups side.

Yeah, that's fine by me and I can't tell what actual differences the two
would have in practice.

Thanks.
  
Michal Koutný Oct. 17, 2023, 6:55 p.m. UTC | #12
On Fri, Sep 22, 2023 at 08:06:40PM -0700, Haitao Huang <haitao.huang@linux.intel.com> wrote:
> @@ -276,10 +276,13 @@ static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
>  
>  	cg = css_misc(of_css(of));
>  
> -	if (READ_ONCE(misc_res_capacity[type]))
> +	if (READ_ONCE(misc_res_capacity[type])) {
>  		WRITE_ONCE(cg->res[type].max, max);
> -	else
> +		if (cg->res[type].max_write)
> +			cg->res[type].max_write(cg);
> +	} else {
>  		ret = -EINVAL;
>
> +	}

Is it time for a misc_cg_mutex? This given no synchronization guarantees
to implementors of max_write. (Alternatively, document it that the
callback must implement own synchronization.)


Thanks,
Michal
  

Patch

diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index e799b1f8d05b..96a88822815a 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -37,6 +37,11 @@  struct misc_res {
 	u64 max;
 	atomic64_t usage;
 	atomic64_t events;
+
+	/* per resource callback ops */
+	int (*alloc)(struct misc_cg *cg);
+	void (*free)(struct misc_cg *cg);
+	void (*max_write)(struct misc_cg *cg);
 };
 
 /**
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 79a3717a5803..62c9198dee21 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -276,10 +276,13 @@  static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
 
 	cg = css_misc(of_css(of));
 
-	if (READ_ONCE(misc_res_capacity[type]))
+	if (READ_ONCE(misc_res_capacity[type])) {
 		WRITE_ONCE(cg->res[type].max, max);
-	else
+		if (cg->res[type].max_write)
+			cg->res[type].max_write(cg);
+	} else {
 		ret = -EINVAL;
+	}
 
 	return ret ? ret : nbytes;
 }
@@ -383,23 +386,39 @@  static struct cftype misc_cg_files[] = {
 static struct cgroup_subsys_state *
 misc_cg_alloc(struct cgroup_subsys_state *parent_css)
 {
+	struct misc_cg *parent_cg;
 	enum misc_res_type i;
 	struct misc_cg *cg;
+	int ret;
 
 	if (!parent_css) {
 		cg = &root_cg;
+		parent_cg = &root_cg;
 	} else {
 		cg = kzalloc(sizeof(*cg), GFP_KERNEL);
 		if (!cg)
 			return ERR_PTR(-ENOMEM);
+		parent_cg = css_misc(parent_css);
 	}
 
 	for (i = 0; i < MISC_CG_RES_TYPES; i++) {
 		WRITE_ONCE(cg->res[i].max, MAX_NUM);
 		atomic64_set(&cg->res[i].usage, 0);
+		if (parent_cg->res[i].alloc) {
+			ret = parent_cg->res[i].alloc(cg);
+			if (ret)
+				goto alloc_err;
+		}
 	}
 
 	return &cg->css;
+
+alloc_err:
+	for (i = 0; i < MISC_CG_RES_TYPES; i++)
+		if (parent_cg->res[i].free)
+			cg->res[i].free(cg);
+	kfree(cg);
+	return ERR_PTR(ret);
 }
 
 /**
@@ -410,7 +429,14 @@  misc_cg_alloc(struct cgroup_subsys_state *parent_css)
  */
 static void misc_cg_free(struct cgroup_subsys_state *css)
 {
-	kfree(css_misc(css));
+	struct misc_cg *cg = css_misc(css);
+	enum misc_res_type i;
+
+	for (i = 0; i < MISC_CG_RES_TYPES; i++)
+		if (cg->res[i].free)
+			cg->res[i].free(cg);
+
+	kfree(cg);
 }
 
 /* Cgroup controller callbacks */