[v9,08/15] x86/sgx: Implement EPC reclamation flows for cgroup

Message ID 20240205210638.157741-9-haitao.huang@linux.intel.com
State New
Series Add Cgroup support for SGX EPC memory

Commit Message

Haitao Huang Feb. 5, 2024, 9:06 p.m. UTC
  From: Kristen Carlson Accardi <kristen@linux.intel.com>

Implement the reclamation flow for cgroups, encapsulated in the top-level
function sgx_epc_cgroup_reclaim_pages(). It performs a pre-order walk of
the subtree rooted at the given cgroup and calls sgx_reclaim_pages() at
each node, passing in that node's LRU. It keeps track of the total number
of pages reclaimed and the number of pages left to attempt, and stops the
walk once the desired number of pages has been attempted.

In some contexts, e.g., page fault handling, only asynchronous
reclamation is allowed. Create a workqueue, plus the corresponding work
item and function definitions, to support asynchronous reclamation. Both
the synchronous and asynchronous flows invoke the same top-level reclaim
function, and will be triggered later by sgx_epc_cgroup_try_charge()
when usage of the cgroup is at or near its limit.

Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Co-developed-by: Haitao Huang <haitao.huang@linux.intel.com>
Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
---
V9:
- Add comments for static variables. (Jarkko)

V8:
- Remove alignment for substructure variables. (Jarkko)

V7:
- Split this out from the big patch, #10 in V6. (Dave, Kai)
---
 arch/x86/kernel/cpu/sgx/epc_cgroup.c | 181 ++++++++++++++++++++++++++-
 arch/x86/kernel/cpu/sgx/epc_cgroup.h |   3 +
 2 files changed, 183 insertions(+), 1 deletion(-)
  

Comments

Jarkko Sakkinen Feb. 12, 2024, 7:35 p.m. UTC | #1
On Mon Feb 5, 2024 at 11:06 PM EET, Haitao Huang wrote:
> From: Kristen Carlson Accardi <kristen@linux.intel.com>
>
> Implement the reclamation flow for cgroups, encapsulated in the top-level
> function sgx_epc_cgroup_reclaim_pages(). It performs a pre-order walk of
> the subtree rooted at the given cgroup and calls sgx_reclaim_pages() at
> each node, passing in that node's LRU. It keeps track of the total number
> of pages reclaimed and the number of pages left to attempt, and stops the
> walk once the desired number of pages has been attempted.
>
> In some contexts, e.g., page fault handling, only asynchronous
> reclamation is allowed. Create a workqueue, plus the corresponding work
> item and function definitions, to support asynchronous reclamation. Both
> the synchronous and asynchronous flows invoke the same top-level reclaim
> function, and will be triggered later by sgx_epc_cgroup_try_charge()
> when usage of the cgroup is at or near its limit.
>
> Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
> Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
> Co-developed-by: Haitao Huang <haitao.huang@linux.intel.com>
> Signed-off-by: Haitao Huang <haitao.huang@linux.intel.com>
> ---
> V9:
> - Add comments for static variables. (Jarkko)
>
> V8:
> - Remove alignment for substructure variables. (Jarkko)
>
> V7:
> - Split this out from the big patch, #10 in V6. (Dave, Kai)
> ---
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 181 ++++++++++++++++++++++++++-
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |   3 +
>  2 files changed, 183 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> index f4a37ace67d7..16b6d9f909eb 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
> @@ -8,9 +8,180 @@
>  /* The root EPC cgroup */
>  static struct sgx_epc_cgroup epc_cg_root;
>  
> +/*
> + * The work queue that reclaims EPC pages in the background for cgroups.
> + *
> + * A cgroup schedules a work item into this queue to reclaim pages within the
> + * same cgroup when its usage limit is reached and synchronous reclamation is not
> + * an option, e.g., in a fault handler.
> + */

Here I get a bit confused by the text because of "e.g., in a fault
handler".  Can we enumerate broadly the sites where this stimulus could
happen?

Does not have to be complete but at least the most common locations
would make this comment *actually* useful for later maintenance.

BR, Jarkko
  
Kai Huang Feb. 20, 2024, 9:52 a.m. UTC | #2
> +/*
> + * Get the lower bound of limits of a cgroup and its ancestors.  Used in
> + * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup is
> + * over its limit or its ancestors' hence reclamation is needed.
> + */
> +static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
> +{
> +	struct misc_cg *i = epc_cg->cg;
> +	u64 m = U64_MAX;
> +
> +	while (i) {
> +		m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
> +		i = misc_cg_parent(i);
> +	}
> +
> +	return m / PAGE_SIZE;
> +}

I am not sure, but is it possible or legal for an ancestor to have a lower
limit than its children?

> +
>  /**
> - * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
> + * @root:	Root of the tree to check
>   *
> + * Return: %true if all cgroups under the specified root have empty LRU lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
> +{
> +	struct cgroup_subsys_state *css_root;
> +	struct cgroup_subsys_state *pos;
> +	struct sgx_epc_cgroup *epc_cg;
> +	bool ret = true;
> +
> +	/*
> +	 * Caller ensure css_root ref acquired
> +	 */
> +	css_root = &root->css;
> +
> +	rcu_read_lock();
> +	css_for_each_descendant_pre(pos, css_root) {
> +		if (!css_tryget(pos))
> +			break;
> +
> +		rcu_read_unlock();
> +
> +		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> +		spin_lock(&epc_cg->lru.lock);
> +		ret = list_empty(&epc_cg->lru.reclaimable);
> +		spin_unlock(&epc_cg->lru.lock);
> +
> +		rcu_read_lock();
> +		css_put(pos);
> +		if (!ret)
> +			break;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
> +
> +/**
> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
> + * @root:	Root of the tree to start walking from.
> + * Return:	Number of pages reclaimed.

Just wondering, do you need to return @cnt given this function is called w/o
checking the return value?

> + */
> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> +{
> +	/*
> +	 * Attempting to reclaim only a few pages will often fail and is
> +	 * inefficient, while reclaiming a huge number of pages can result in
> +	 * soft lockups due to holding various locks for an extended duration.
> +	 */

Not sure we need this comment, given it's already implied in
sgx_reclaim_pages().  You cannot pass a value > SGX_NR_TO_SCAN anyway.

> +	unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> +	struct cgroup_subsys_state *css_root;
> +	struct cgroup_subsys_state *pos;
> +	struct sgx_epc_cgroup *epc_cg;
> +	unsigned int cnt;
> +
> +	 /* Caller ensure css_root ref acquired */
> +	css_root = &root->css;
> +
> +	cnt = 0;
> +	rcu_read_lock();
> +	css_for_each_descendant_pre(pos, css_root) {
> +		if (!css_tryget(pos))
> +			break;
> +		rcu_read_unlock();
> +
> +		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
> +
> +		rcu_read_lock();
> +		css_put(pos);
> +		if (!nr_to_scan)
> +			break;
> +	}
> +
> +	rcu_read_unlock();
> +	return cnt;
> +}

Here the @nr_to_scan is reduced by the number of pages that are isolated, but
not actually reclaimed (which is reflected by @cnt).

IIUC, looks like you want to make this function do "each cycle" as what
you mentioned in the v8 [1]:

	I tested with that approach and found we can only target number of
	pages attempted to reclaim not pages actually reclaimed due to the
	uncertainty of how long it takes to reclaim pages. Besides targeting
	number of scanned pages for each cycle is also what the ksgxd does.

	If we target actual number of pages, sometimes it just takes too
	long. I saw more timeouts with the default time limit when running
	parallel selftests.

I am not sure what "sometimes it just takes too long" means, but what I am
thinking is you are trying to do some perfect yet complicated code here.

For instance, I don't think selftests reflect real workloads, and I believe
adjusting the limit of a given EPC cgroup shouldn't be a frequent operation,
thus it is acceptable to use some easy-to-maintain but less perfect code.

Here I still think having @nr_to_scan as a pointer is over-complicated.  For
example, we can still let sgx_reclaim_pages() always scan SGX_NR_TO_SCAN
pages, but give up when enough pages have been reclaimed or when the EPC
cgroup and its descendants have been looped over:

unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
{
	unsigned int cnt = 0;
	...

	css_for_each_descendant_pre(pos, css_root) {
		...
		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
		cnt += sgx_reclaim_pages(&epc_cg->lru);

		if (cnt >= SGX_NR_TO_SCAN)
			break;
	}

	...
	return cnt;
}

Yeah it may reclaim more than SGX_NR_TO_SCAN when the loop actually reaches any
descendants, but that should be rare and we don't care that much, do we?

But I'll leave to maintainers to judge.

[1]
https://lore.kernel.org/linux-kernel/CZ3CM9ZE39Q0.222HRSEUF8RFP@kernel.org/T/#md7b062b43d249218369f921682dfa7f975735dd1

> +
> +/*
> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
> + * when the cgroup is at/near its maximum capacity
> + */

I don't see this being "scheduled by sgx_epc_cgroup_try_charge()" here.  Does it
make more sense to move that code change to this patch for better review?

> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
> +{
> +	struct sgx_epc_cgroup *epc_cg;
> +	u64 cur, max;
> +
> +	epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
> +
> +	for (;;) {
> +		max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
> +
> +		/*
> +		 * Adjust the limit down by one page, the goal is to free up
> +		 * pages for fault allocations, not to simply obey the limit.
> +		 * Conditionally decrementing max also means the cur vs. max
> +		 * check will correctly handle the case where both are zero.
> +		 */
> +		if (max)
> +			max--;

With the below max -= SGX_NR_TO_SCAN/2 stuff, do you still need this one?

> +
> +		/*
> +		 * Unless the limit is extremely low, in which case forcing
> +		 * reclaim will likely cause thrashing, force the cgroup to
> +		 * reclaim at least once if it's operating *near* its maximum
> +		 * limit by adjusting @max down by half the min reclaim size.

OK.  But why choose "SGX_NR_TO_SCAN * 2" as "extremely low"? E.g., could we
choose SGX_NR_TO_SCAN instead?

IMHO we should at least put a comment to mention this.

And maybe you can have a dedicated macro for that, in which case I believe
the code would be easier to understand?

> +		 * This work func is scheduled by sgx_epc_cgroup_try_charge

This has been mentioned in the function comment already.

> +		 * when it cannot directly reclaim due to being in an atomic
> +		 * context, e.g. EPC allocation in a fault handler.  
> 

Why is a fault handler an "atomic context"?  Just say "when it cannot
directly reclaim".

> Waiting
> +		 * to reclaim until the cgroup is actually at its limit is less
> +		 * performant as it means the faulting task is effectively
> +		 * blocked until a worker makes its way through the global work
> +		 * queue.
> +		 */
> +		if (max > SGX_NR_TO_SCAN * 2)
> +			max -= (SGX_NR_TO_SCAN / 2);
> +
> +		cur = sgx_epc_cgroup_page_counter_read(epc_cg);
> +
> +		if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
> +			break;
> +
> +		/* Keep reclaiming until above condition is met. */
> +		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);

Also, each loop here calls sgx_epc_cgroup_max_pages_to_root() and
sgx_epc_cgroup_lru_empty(), both of which loop over the given EPC cgroup and
its descendants.  If we still make sgx_reclaim_pages() always scan
SGX_NR_TO_SCAN pages, it seems we can reduce the number of loops here?

> +	}
> +}
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
>   * @epc_cg:	The EPC cgroup to be charged for the page.
>   * Return:
>   * * %0 - If successfully charged.
> @@ -38,6 +209,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
>  	if (!epc_cg)
>  		return;
>  
> +	cancel_work_sync(&epc_cg->reclaim_work);
>  	kfree(epc_cg);
>  }
>  
> @@ -50,6 +222,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>  
>  static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
>  {
> +	sgx_lru_init(&epc_cg->lru);
> +	INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
>  	cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
>  	epc_cg->cg = cg;
>  }
> @@ -69,6 +243,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>  
>  void sgx_epc_cgroup_init(void)
>  {
> +	sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
> +					WQ_UNBOUND | WQ_FREEZABLE,
> +					WQ_UNBOUND_MAX_ACTIVE);
> +	BUG_ON(!sgx_epc_cg_wq);

You cannot BUG_ON() simply because you are unable to allocate a workqueue.
You can use some way to mark the EPC cgroup as disabled but keep going.  A
static key is one way, although we cannot re-enable it at runtime.
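
E.g., something like this (just a sketch; a static key or a "disabled"
flag checked at the call sites may also be needed):

void sgx_epc_cgroup_init(void)
{
	struct workqueue_struct *wq;

	wq = alloc_workqueue("sgx_epc_cg_wq",
			     WQ_UNBOUND | WQ_FREEZABLE,
			     WQ_UNBOUND_MAX_ACTIVE);
	if (!wq) {
		pr_warn("sgx: alloc_workqueue() failed, EPC cgroup disabled.\n");
		return;
	}

	sgx_epc_cg_wq = wq;

	/* Only register the ops when the workqueue is available. */
	misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
	sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
}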

> +
>  	misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
>  	sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
>  }
> diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> index 6b664b4c321f..e3c6a08f0ee8 100644
> --- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> +++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
> @@ -34,6 +34,8 @@ static inline void sgx_epc_cgroup_init(void) { }
>  #else
>  struct sgx_epc_cgroup {
>  	struct misc_cg *cg;
> +	struct sgx_epc_lru_list lru;
> +	struct work_struct reclaim_work;
>  };

So you introduced the work/workqueue here but there's no place which actually
queues the work.  IMHO you can either:

1) move relevant code change here; or
2) focus on introducing core functions to reclaim certain pages from a given EPC
cgroup w/o workqueue and introduce the work/workqueue in later patch.

Makes sense?

>  
>  static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
> @@ -66,6 +68,7 @@ static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
>  
>  int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
>  void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
> +bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);

Not sure why this needs to be exposed.  Perhaps you should make this change when
needed.

>  void sgx_epc_cgroup_init(void);
>  
>  #endif
  
Michal Koutný Feb. 20, 2024, 1:18 p.m. UTC | #3
On Tue, Feb 20, 2024 at 09:52:39AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote:
> I am not sure, but is it possible or legal for an ancestor to have less limit
> than children?

Why not?
It is desired for proper hierarchical delegation, and the tightest limit
among the ancestors applies (cf. memory.max).
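
For example (illustrative numbers), with a hierarchy like:

	A     (sgx_epc max = 100 pages)
	└── B (sgx_epc max = 200 pages)

the effective limit of B is min(100, 200) = 100 pages, which is what
sgx_epc_cgroup_max_pages_to_root() computes in this patch. Assuming the
usual misc controller interface, such limits would be set with something
like:

	echo "sgx_epc 409600" > A/misc.max
	echo "sgx_epc 819200" > A/B/misc.max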

Michal
  
Kai Huang Feb. 20, 2024, 8:09 p.m. UTC | #4
On Tue, 2024-02-20 at 14:18 +0100, Michal Koutný wrote:
> On Tue, Feb 20, 2024 at 09:52:39AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote:
> > I am not sure, but is it possible or legal for an ancestor to have less limit
> > than children?
> 
> Why not?
> It is desired for proper hierarchical delegation, and the tightest limit
> among the ancestors applies (cf. memory.max).
> 

OK.  Thanks for the info.
  
Haitao Huang Feb. 21, 2024, 6:23 a.m. UTC | #5
Hi Kai
On Tue, 20 Feb 2024 03:52:39 -0600, Huang, Kai <kai.huang@intel.com> wrote:
[...]
>
> So you introduced the work/workqueue here but there's no place which  
> actually
> queues the work.  IMHO you can either:
>
> 1) move relevant code change here; or
> 2) focus on introducing core functions to reclaim certain pages from a  
> given EPC
> cgroup w/o workqueue and introduce the work/workqueue in later patch.
>
> Makes sense?
>

Starting in v7, I was trying to split the big patch, #10 in v6, as you and
others suggested. My thought process was to put the infrastructure needed
for per-cgroup reclaim in the front, then turn on per-cgroup reclaim in
[v9 13/15] at the end.

Before that, all reclaimables are tracked in the global LRU, so there
really is no "reclaim certain pages from a given EPC cgroup w/o workqueue"
or reclaim through the workqueue before that point, as suggested in #2.
This patch puts down the implementation for both flows, but neither is
used yet, as stated in the commit message.

#1 would force me to go back and merge the patches again.

Sorry, I feel kind of lost on this whole thing by now. It seems so random
to me. Are there hard rules on this?

I was hoping these statements would help reviewers follow the flow of the
patches.

At the end of [v9 04/15]:

For now, the EPC cgroup simply blocks additional EPC allocation in
sgx_alloc_epc_page() when the limit is reached. Reclaimable pages are
still tracked in the global active list, only reclaimed by the global
reclaimer when the total free page count is lower than a threshold.

Later patches will reorganize the tracking and reclamation code in the
global reclaimer and implement per-cgroup tracking and reclaiming.

At the end of [v9 06/15]:

Next patches will first get the cgroup reclamation flow ready while
keeping pages tracked in the global LRU and reclaimed by ksgxd before we
make the switch in the end for sgx_lru_list() to return per-cgroup
LRU.

At the end of [v9 08/15]:

Both the synchronous and asynchronous flows invoke the same top-level
reclaim function, and will be triggered later by sgx_epc_cgroup_try_charge()
when usage of the cgroup is at or near its limit.

At the end of [v9 10/15]:
Note at this point, all reclaimable EPC pages are still tracked in the
global LRU and per-cgroup LRUs are empty. So no per-cgroup reclamation
is activated yet.


Thanks
Haitao
  
Haitao Huang Feb. 21, 2024, 6:44 a.m. UTC | #6
[...]
>
> Here the @nr_to_scan is reduced by the number of pages that are isolated,
> but not actually reclaimed (which is reflected by @cnt).
>
> IIUC, looks like you want to make this function do "each cycle" as what
> you mentioned in the v8 [1]:
>
> 	I tested with that approach and found we can only target number of
> 	pages attempted to reclaim not pages actually reclaimed due to the
> 	uncertainty of how long it takes to reclaim pages. Besides targeting
> 	number of scanned pages for each cycle is also what the ksgxd does.
>
> 	If we target actual number of pages, sometimes it just takes too
> 	long. I saw more timeouts with the default time limit when running
> 	parallel selftests.
>
> I am not sure what "sometimes it just takes too long" means, but what I
> am thinking is you are trying to do some perfect yet complicated code
> here.

I think what I observed was that the try_charge() would block too long
before getting a chance to schedule() and yield, causing more timeouts
than necessary.
I'll do some re-testing to be sure.

>
> For instance, I don't think selftests reflect real workloads, and I
> believe adjusting the limit of a given EPC cgroup shouldn't be a frequent
> operation, thus it is acceptable to use some easy-to-maintain but less
> perfect code.
>
> Here I still think having @nr_to_scan as a pointer is over-complicated.
> For example, we can still let sgx_reclaim_pages() always scan
> SGX_NR_TO_SCAN pages, but give up when enough pages have been reclaimed
> or when the EPC cgroup and its descendants have been looped over:
>
> unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> {
> 	unsigned int cnt = 0;
> 	...
>
> 	css_for_each_descendant_pre(pos, css_root) {
> 		...
> 		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> 		cnt += sgx_reclaim_pages(&epc_cg->lru);
>
> 		if (cnt >= SGX_NR_TO_SCAN)
> 			break;
> 	}
>
> 	...
> 	return cnt;
> }
>
> Yeah it may reclaim more than SGX_NR_TO_SCAN when the loop actually
> reaches any descendants, but that should be rare and we don't care that
> much, do we?
>
I assume you meant @cnt here to be the number of pages actually reclaimed.
This could cause sgx_epc_cgroup_reclaim_pages() to block too long, as @cnt
may always be zero (all pages are too young) and you have to loop over all
descendants.

Thanks

Haitao
  
Kai Huang Feb. 21, 2024, 10:48 a.m. UTC | #7
On Wed, 2024-02-21 at 00:23 -0600, Haitao Huang wrote:
> Hi Kai
> On Tue, 20 Feb 2024 03:52:39 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> [...]
> > 
> > So you introduced the work/workqueue here but there's no place which  
> > actually
> > queues the work.  IMHO you can either:
> > 
> > 1) move relevant code change here; or
> > 2) focus on introducing core functions to reclaim certain pages from a  
> > given EPC
> > cgroup w/o workqueue and introduce the work/workqueue in later patch.
> > 
> > Makes sense?
> > 
> 
> Starting in v7, I was trying to split the big patch, #10 in v6 as you and  
> others suggested. My thought process was to put infrastructure needed for  
> per-cgroup reclaim in the front, then turn on per-cgroup reclaim in [v9  
> 13/15] in the end.

That's reasonable for sure.

> 
> Before that, all reclaimables are tracked in the global LRU so really  
> there is no "reclaim certain pages from a  given EPC cgroup w/o workqueue"  
> or reclaim through workqueue before that point, as suggested in #2. This  
> patch puts down the implementation for both flows but neither used yet, as  
> stated in the commit message.

I know it's not used yet.  The point is how to split patches to make them
more self-contained and easy to review.

For #2, sorry for not being explicit -- I meant it seems it's more reasonable to
split in this way:

Patch 1)
  a). change to sgx_reclaim_pages();
  b). introduce sgx_epc_cgroup_reclaim_pages();
  c). introduce sgx_epc_cgroup_reclaim_work_func() (use a better name), 
     which just takes an EPC cgroup as input w/o involving any work/workqueue.

These functions are all related to how to implement reclaiming pages from a
given EPC cgroup, and they are logically related in terms of implementation,
thus they are easier to review together.

Then you just need to justify the design/implementation in changelog/comments.

Patch 2) 
  - Introduce work/workqueue, and implement the logic to queue the work.

Now we all know there's a function to reclaim pages for a given EPC cgroup, then
we can focus on when that is called, either directly or indirectly.
	
> 
> #1 would force me go back and merge the patches again.

I don't think so.  I am not asking to put all things together, but only
asking to split in a better way (as I see it).

You mentioned some function is "Scheduled by sgx_epc_cgroup_try_charge() to
reclaim pages", but I am not seeing any code doing that in this patch.  This
needs fixing, either by moving relevant code here, or removing these not-done-
yet comments.

For instance (I am just giving an example), if after review we find the
queue_work() shouldn't be done in try_charge(), you will need to go back to
this patch and remove these comments.

That's not the best way.  Each patch needs to be self-contained.

> 
> Sorry, I feel kind of lost on this whole thing by now. It seems so random
> to me. Are there hard rules on this?

See above.  I am only offering my opinion on how to split patches in a
better way.

> 
> I was hoping these statements would help reviewers follow the flow of the
> patches.
> 
> At the end of [v9 04/15]:
> 
> For now, the EPC cgroup simply blocks additional EPC allocation in
> sgx_alloc_epc_page() when the limit is reached. Reclaimable pages are
> still tracked in the global active list, only reclaimed by the global
> reclaimer when the total free page count is lower than a threshold.
> 
> Later patches will reorganize the tracking and reclamation code in the
> global reclaimer and implement per-cgroup tracking and reclaiming.
> 
> At the end of [v9 06/15]:
> 
> Next patches will first get the cgroup reclamation flow ready while
> keeping pages tracked in the global LRU and reclaimed by ksgxd before we
> make the switch in the end for sgx_lru_list() to return per-cgroup
> LRU.
> 
> At the end of [v9 08/15]:
> 
> Both synchronous and asynchronous flows invoke the same top level reclaim
> function, and will be triggered later by sgx_epc_cgroup_try_charge()
> when usage of the cgroup is at or near its limit.
> 
> At the end of [v9 10/15]:
> Note at this point, all reclaimable EPC pages are still tracked in the
> global LRU and per-cgroup LRUs are empty. So no per-cgroup reclamation
> is activated yet.

They are useful in the changelog in each patch I suppose, but to me we are
discussing different things.

One pain I found in the review is that I have to jump back and forth many
times among multiple patches to see whether one patch is reasonable.  That's
why I am asking whether there's a better way to split the patches so that
each patch can be logically self-contained in some way and easier to review.
  
Kai Huang Feb. 21, 2024, 11 a.m. UTC | #8
On Wed, 2024-02-21 at 00:44 -0600, Haitao Huang wrote:
> [...]
> > 
> > Here the @nr_to_scan is reduced by the number of pages that are  
> > isolated, but
> > not actually reclaimed (which is reflected by @cnt).
> > 
> > IIUC, looks you want to make this function do "each cycle" as what you  
> > mentioned
> > in the v8 [1]:
> > 
> > 	I tested with that approach and found we can only target number of
> > 	pages attempted to reclaim not pages actually reclaimed due to the
> > 	uncertainty of how long it takes to reclaim pages. Besides targeting
> > 	number of scanned pages for each cycle is also what the ksgxd does.
> >
> > 	If we target actual number of pages, sometimes it just takes too
> > 	long. I saw more timeouts with the default time limit when running
> > 	parallel selftests.
> > 
> > I am not sure what does "sometimes it just takes too long" mean, but  
> > what I am
> > thinking is you are trying to do some perfect but yet complicated code  
> > here.
> 
> I think what I observed was that the try_charge() would block too long  
> before getting chance of schedule() to yield, causing more timeouts than  
> necessary.
> I'll do some re-test to be sure.

Looks like this is valid information that can be used to justify whatever
you are implementing in the EPC cgroup reclaiming function(s).

> 
> > 
> > For instance, I don't think selftest reflect the real workload, and I  
> > believe
> > adjusting the limit of a given EPC cgroup shouldn't be a frequent  
> > operation,
> > thus it is acceptable to use some easy-maintain code but less perfect  
> > code.
> > 
> > Here I still think having @nr_to_scan as a pointer is over-complicated.   
> > For
> > example, we can still let sgx_reclaim_pages() to always scan  
> > SGX_NR_TO_SCAN
> > pages, but give up when there's enough pages reclaimed or when the EPC  
> > cgroup
> > and its descendants have been looped:
> > 
> > unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
> > {
> > 	unsigned int cnt = 0;
> > 	...
> > 
> > 	css_for_each_descendant_pre(pos, css_root) {
> > 		...
> > 		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> > 		cnt += sgx_reclaim_pages(&epc_cg->lru);
> > 
> > 		if (cnt >= SGX_NR_TO_SCAN)
> > 			break;
> > 	}
> > 
> > 	...
> > 	return cnt;
> > }
> > 
> > Yeah it may reclaim more than SGX_NR_TO_SCAN when the loop actually  
> > reaches any
> > descendants, but that should be rare and we don't care that much, do we?
> > 
> I assume you meant @cnt here to be number of pages actually reclaimed.  

Yes.

> This could cause  sgx_epc_cgroup_reclaim_pages() block too long as @cnt  
> may always be zero (all pages are too young) and you have to loop all  
> descendants.

I am not sure whether this is a valid point.

For example, your change in patch 10 "x86/sgx: Add EPC reclamation in cgroup
try_charge()" already loops over all descendants in the code below:

+		if (sgx_epc_cgroup_lru_empty(epc_cg->cg))
+			return -ENOMEM;

Anyway, I can see all of these can be justifications for your
design/implementation.  My point is: please put these justifications in the
changelog/comments so that we can actually understand them.


Makes sense?
  
Haitao Huang Feb. 22, 2024, 5:20 p.m. UTC | #9
On Wed, 21 Feb 2024 05:00:27 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Wed, 2024-02-21 at 00:44 -0600, Haitao Huang wrote:
>> [...]
>> >
>> > Here the @nr_to_scan is reduced by the number of pages that are
>> > isolated, but
>> > not actually reclaimed (which is reflected by @cnt).
>> >
>> > IIUC, looks you want to make this function do "each cycle" as what you
>> > mentioned
>> > in the v8 [1]:
>> >
>> > 	I tested with that approach and found we can only target number of
>> > 	pages attempted to reclaim not pages actually reclaimed due to the
>> > 	uncertainty of how long it takes to reclaim pages. Besides targeting
>> > 	number of scanned pages for each cycle is also what the ksgxd does.
>> >
>> > 	If we target actual number of pages, sometimes it just takes too
>> > 	long. I saw more timeouts with the default time limit when running
>> > 	parallel selftests.
>> >
>> > I am not sure what does "sometimes it just takes too long" mean, but
>> > what I am
>> > thinking is you are trying to do some perfect but yet complicated code
>> > here.
>>
>> I think what I observed was that the try_charge() would block too long
>> before getting chance of schedule() to yield, causing more timeouts than
>> necessary.
>> I'll do some re-test to be sure.
>
> Looks this is a valid information that can be used to justify whatever  
> you are
> implementing in the EPC cgroup reclaiming function(s).
>
I'll add some comments. I was assuming this just follows the old design of
ksgxd.
There were some comments at the beginning of
sgx_epc_cgroup_reclaim_pages():
         /*
          * Attempting to reclaim only a few pages will often fail and is
          * inefficient, while reclaiming a huge number of pages can result
          * in soft lockups due to holding various locks for an extended
          * duration.
          */
         unsigned int nr_to_scan = SGX_NR_TO_SCAN;

I think it can be improved to emphasize that we only "attempt" to finish
scanning a fixed number of pages for reclamation, not enforce the number of
pages successfully reclaimed.
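
Perhaps something like:

         /*
          * Attempt to scan a fixed number (SGX_NR_TO_SCAN) of pages across
          * the subtree in this cycle; the number of pages actually
          * reclaimed may be smaller and is not enforced.
          */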

>>
>> >
>> > For instance, I don't think selftest reflect the real workload, and I
>> > believe
>> > adjusting the limit of a given EPC cgroup shouldn't be a frequent
>> > operation,
>> > thus it is acceptable to use some easy-maintain code but less perfect
>> > code.
>> >
>> > Here I still think having @nr_to_scan as a pointer is  
>> over-complicated.
>> > For
>> > example, we can still let sgx_reclaim_pages() to always scan
>> > SGX_NR_TO_SCAN
>> > pages, but give up when there's enough pages reclaimed or when the EPC
>> > cgroup
>> > and its descendants have been looped:
>> >
>> > unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
>> > {
>> > 	unsigned int cnt = 0;
>> > 	...
>> >
>> > 	css_for_each_descendant_pre(pos, css_root) {
>> > 		...
>> > 		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
>> > 		cnt += sgx_reclaim_pages(&epc_cg->lru);
>> >
>> > 		if (cnt >= SGX_NR_TO_SCAN)
>> > 			break;
>> > 	}
>> >
>> > 	...
>> > 	return cnt;
>> > }
>> >
>> > Yeah it may reclaim more than SGX_NR_TO_SCAN when the loop actually
>> > reaches any
>> > descendants, but that should be rare and we don't care that much, do  
>> we?
>> >
>> I assume you meant @cnt here to be number of pages actually reclaimed.
>
> Yes.
>
>> This could cause  sgx_epc_cgroup_reclaim_pages() block too long as @cnt
>> may always be zero (all pages are too young) and you have to loop all
>> descendants.
>
> I am not sure whether this is a valid point.
>
> For example, your change in patch 10 "x86/sgx: Add EPC reclamation in  
> cgroup
> try_charge()" already loops all descendants in below code:
>
> +		if (sgx_epc_cgroup_lru_empty(epc_cg->cg))
> +			return -ENOMEM;
>

I meant looping over all descendants for reclamation, which is expensive and
which we want to avoid, not just checking the emptiness of the LRUs.

> Anyway, I can see all these can be justification to your  
> design/implementation.
> My point is please put these justification in changelog/comments so that  
> we can
> actually understand.
>
Yes, will add clarifying comments.
Thanks
  
Haitao Huang Feb. 22, 2024, 6:09 p.m. UTC | #10
On Tue, 20 Feb 2024 03:52:39 -0600, Huang, Kai <kai.huang@intel.com> wrote:

>> +/**
>> + * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
>> + * @root:	Root of the tree to start walking from.
>> + * Return:	Number of pages reclaimed.
>
> Just wondering, do you need to return @cnt given this function is called
> w/o checking the return value?
>
Yes. Will add explicit commenting that we need to scan a fixed number of
pages for attempted reclamation.
>> + */
>> +unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
>> +{
>> +	/*
>> +	 * Attempting to reclaim only a few pages will often fail and is
>> +	 * inefficient, while reclaiming a huge number of pages can result in
>> +	 * soft lockups due to holding various locks for an extended duration.
>> +	 */
>
> Not sure we need this comment, given it's already implied in
> sgx_reclaim_pages().  You cannot pass a value > SGX_NR_TO_SCAN anyway.

Will rework these comments to make them more meaningful.
>

[other comments/questions addressed in separate email threads]
[...]
>> +
>> +/*
>> + * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
>> + * when the cgroup is at/near its maximum capacity
>> + */
>
> I don't see this being "scheduled by sgx_epc_cgroup_try_charge()" here.
> Does it make more sense to move that code change to this patch for better
> review?
>

Right. This comment was left over when I split the old patch.

>> +static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
>> +{
>> +	struct sgx_epc_cgroup *epc_cg;
>> +	u64 cur, max;
>> +
>> +	epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
>> +
>> +	for (;;) {
>> +		max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
>> +
>> +		/*
>> +		 * Adjust the limit down by one page, the goal is to free up
>> +		 * pages for fault allocations, not to simply obey the limit.
>> +		 * Conditionally decrementing max also means the cur vs. max
>> +		 * check will correctly handle the case where both are zero.
>> +		 */
>> +		if (max)
>> +			max--;
>
> With the below max -= SGX_NR_TO_SCAN/2 stuff, do you still need this one?
>

Logically it's still needed for the case max <= SGX_NR_TO_SCAN * 2.
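
E.g., assuming SGX_NR_TO_SCAN is 16 (as in the current SGX code): with
max = 20 (<= 32), only the max-- path applies, so we reclaim while
cur > 19; with max = 100, max becomes 99 and then 91, forcing reclaim
while operating near the limit.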

>> +
>> +		/*
>> +		 * Unless the limit is extremely low, in which case forcing
>> +		 * reclaim will likely cause thrashing, force the cgroup to
>> +		 * reclaim at least once if it's operating *near* its maximum
>> +		 * limit by adjusting @max down by half the min reclaim size.
>
> OK.  But why choose "SGX_NR_TO_SCAN * 2" as "extremely low"? E.g., could
> we choose SGX_NR_TO_SCAN instead?
> IMHO we should at least put a comment to mention this.
>
> And maybe you can have a dedicated macro for that, in which case I believe
> the code would be easier to understand?

Good point. I think the value is kind of arbitrary. We consider
enclaves/cgroups of 64K size to be very small. If such a cgroup ever reaches
its limit, then we don't aggressively reclaim to optimize #PF handling. The
user might as well just raise the limit if it is not performant.
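
(For reference, with SGX_NR_TO_SCAN = 16, the SGX_NR_TO_SCAN * 2 threshold
corresponds to 32 EPC pages, i.e. 128KiB of EPC.)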

>
>> +		 * This work func is scheduled by sgx_epc_cgroup_try_charge
>
> This has been mentioned in the function comment already.
>
>> +		 * when it cannot directly reclaim due to being in an atomic
>> +		 * context, e.g. EPC allocation in a fault handler.
>
> Why is a fault handler an "atomic context"?  Just say "when it cannot
> directly reclaim".
>

Sure.

>> Waiting
>> +		 * to reclaim until the cgroup is actually at its limit is less
>> +		 * performant as it means the faulting task is effectively
>> +		 * blocked until a worker makes its way through the global work
>> +		 * queue.
>> +		 */
>> +		if (max > SGX_NR_TO_SCAN * 2)
>> +			max -= (SGX_NR_TO_SCAN / 2);
>> +
>> +		cur = sgx_epc_cgroup_page_counter_read(epc_cg);
>> +
>> +		if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
>> +			break;
>> +
>> +		/* Keep reclaiming until above condition is met. */
>> +		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
>
> Also, each loop here calls sgx_epc_cgroup_max_pages_to_root() and
> sgx_epc_cgroup_lru_empty(), both loop the given EPC cgroup and  
> descendants.  If
> we still make sgx_reclaim_pages() always scan SGX_NR_TO_SCAN pages,  
> seems we can
> reduce the number of loops here?
>

[We already scan SGX_NR_TO_SCAN pages for the cgroup at the level of
sgx_epc_cgroup_reclaim_pages().]

I think you mean that we keep scanning and reclaiming until at least
SGX_NR_TO_SCAN pages are reclaimed, as your code suggested above. We can
probably make that a version for this background thread as an optimization.
But sgx_epc_cgroup_max_pages_to_root() and sgx_epc_cgroup_lru_empty() are
not that bad unless we have very deep and wide cgroup trees. So would you
agree we defer this optimization for later?


>> +	}
>> +}
>> +
>> +/**
>> + * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC  
>> page
>>   * @epc_cg:	The EPC cgroup to be charged for the page.
>>   * Return:
>>   * * %0 - If successfully charged.
>> @@ -38,6 +209,7 @@ static void sgx_epc_cgroup_free(struct misc_cg *cg)
>>  	if (!epc_cg)
>>  		return;
>>
>> +	cancel_work_sync(&epc_cg->reclaim_work);
>>  	kfree(epc_cg);
>>  }
>>
>> @@ -50,6 +222,8 @@ const struct misc_res_ops sgx_epc_cgroup_ops = {
>>
>>  static void sgx_epc_misc_init(struct misc_cg *cg, struct  
>> sgx_epc_cgroup *epc_cg)
>>  {
>> +	sgx_lru_init(&epc_cg->lru);
>> +	INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
>>  	cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
>>  	epc_cg->cg = cg;
>>  }
>> @@ -69,6 +243,11 @@ static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
>>
>>  void sgx_epc_cgroup_init(void)
>>  {
>> +	sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
>> +					WQ_UNBOUND | WQ_FREEZABLE,
>> +					WQ_UNBOUND_MAX_ACTIVE);
>> +	BUG_ON(!sgx_epc_cg_wq);
>
> You cannot BUG_ON() simply because you are unable to allocate a workqueue.
> You can use some way to mark the EPC cgroup as disabled but keep going.  A
> static key is one way, although we cannot re-enable it at runtime.
>
>
Okay, I'll disable and print a log.

[...]
[workqueue related discussion in separate email]

Thanks
Haitao
  
Haitao Huang Feb. 22, 2024, 8:12 p.m. UTC | #11
On Wed, 21 Feb 2024 04:48:58 -0600, Huang, Kai <kai.huang@intel.com> wrote:

> On Wed, 2024-02-21 at 00:23 -0600, Haitao Huang wrote:
>> Hi Kai
>> On Tue, 20 Feb 2024 03:52:39 -0600, Huang, Kai <kai.huang@intel.com>  
>> wrote:
>> [...]
>> >
>> > So you introduced the work/workqueue here but there's no place which
>> > actually
>> > queues the work.  IMHO you can either:
>> >
>> > 1) move relevant code change here; or
>> > 2) focus on introducing core functions to reclaim certain pages from a
>> > given EPC
>> > cgroup w/o workqueue and introduce the work/workqueue in later patch.
>> >
>> > Makes sense?
>> >
>>
>> Starting in v7, I was trying to split the big patch, #10 in v6 as you  
>> and
>> others suggested. My thought process was to put infrastructure needed  
>> for
>> per-cgroup reclaim in the front, then turn on per-cgroup reclaim in [v9
>> 13/15] in the end.
>
> That's reasonable for sure.
>

Thanks for the confirmation :-)

>>
>> Before that, all reclaimables are tracked in the global LRU so really
>> there is no "reclaim certain pages from a  given EPC cgroup w/o  
>> workqueue"
>> or reclaim through workqueue before that point, as suggested in #2. This
>> patch puts down the implementation for both flows but neither used yet,  
>> as
>> stated in the commit message.
>
> I know it's not used yet.  The point is how to split patches to make
> them more self-contained and easy to review.

I would think this patch is already self-contained in that it is all
implementation of cgroup-reclamation building blocks utilized later. But
I'll try to follow your suggestions below to split it further (I would
prefer not to merge patches back in general unless there is a strong
reason).

>
> For #2, sorry for not being explicit -- I meant it seems it's more  
> reasonable to
> split in this way:
>
> Patch 1)
>   a). change to sgx_reclaim_pages();

I'll still prefer this to be a separate patch. It is self-contained IMHO.
We were splitting the original patch because it was too big. I don't want  
to merge back unless there is a strong reason.

>   b). introduce sgx_epc_cgroup_reclaim_pages();

Ok.

>   c). introduce sgx_epc_cgroup_reclaim_work_func() (use a better name), 
>      which just takes an EPC cgroup as input w/o involving any  
> work/workqueue.

This is for the workqueue use only. So I think it'd better be with
patch #2 below?

>
> These functions are all related to how to implement reclaiming pages  
> from a
> given EPC cgroup, and they are logically related in terms of  
> implementation thus
> it's easier to be reviewed together.
>

This is pretty much the current patch + sgx_reclaim_pages() - workqueue.

> Then you just need to justify the design/implementation in  
> changelog/comments.
>

How about just doing b) in patch #1, and stating that the new function is
the building block and will be used for both direct and indirect
reclamation?

> Patch 2)
>   - Introduce work/workqueue, and implement the logic to queue the work.
>
> Now we all know there's a function to reclaim pages for a given EPC  
> cgroup, then
> we can focus on when that is called, either directly or indirectly.
> 	

The try_charge() function will do both, actually.
For indirect, it queues the work to the wq. For direct, it just calls
sgx_epc_cgroup_reclaim_pages().
That part is already in a separate (I think self-contained) patch [v9,
10/15].
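
Roughly, as a simplified sketch of what [v9, 10/15] does (not the exact
code from that patch; it also adds a "reclaim" flag to indicate whether
direct reclamation is allowed):

int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg, bool reclaim)
{
	for (;;) {
		/* misc_cg_try_charge() returns 0 on success. */
		if (!misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg,
					PAGE_SIZE))
			return 0;

		/* Nothing reclaimable anywhere in the subtree: give up. */
		if (sgx_epc_cgroup_lru_empty(epc_cg->cg))
			return -ENOMEM;

		if (!reclaim) {
			/* Cannot reclaim here: defer to the workqueue. */
			queue_work(sgx_epc_cg_wq, &epc_cg->reclaim_work);
			return -EBUSY;
		}

		/* Direct reclamation, then retry the charge. */
		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
		cond_resched();
	}
}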

So for this patch, I'll add sgx_epc_cgroup_reclaim_work_func() and
introduce the work/workqueue so the work can be queued later?

>>
>> #1 would force me go back and merge the patches again.
>
> I don't think so.  I am not asking to put all things together, but only  
> asking
> to split in better way (that I see).
>

Okay.

> You mentioned some function is "Scheduled by sgx_epc_cgroup_try_charge()  
> to
> reclaim pages", but I am not seeing any code doing that in this patch.   
> This
> needs fixing, either by moving relevant code here, or removing these  
> not-done-
> yet comments.
>

Yes. The comments will be fixed.

> For instance (I am just giving an example), if after review we found the
> queue_work() shouldn't be done in try_charge(), you will need to go back  
> to this
> patch and remove these comments.
>
> That's not the best way.  Each patch needs to be self-contained.
>
>>
>> Sorry I feel kind of lost on this whole thing by now. It seems so random
>> to me. Is there hard rules on this?
>
> See above.  I am only offering my opinion on how to split patches in  
> better way.
>

To be honest, the part I find most confusing is this self-contained-ness.
It seems to depend on how you look at things.

>>
>> I was hoping these statements would help reviewers on the flow of the
>> patches.
>>
>> At the end of [v9 04/15]:
>>
>> For now, the EPC cgroup simply blocks additional EPC allocation in
>> sgx_alloc_epc_page() when the limit is reached. Reclaimable pages are
>> still tracked in the global active list, only reclaimed by the global
>> reclaimer when the total free page count is lower than a threshold.
>>
>> Later patches will reorganize the tracking and reclamation code in the
>> global reclaimer and implement per-cgroup tracking and reclaiming.
>>
>> At the end of [v9 06/15]:
>>
>> Next patches will first get the cgroup reclamation flow ready while
>> keeping pages tracked in the global LRU and reclaimed by ksgxd before we
>> make the switch in the end for sgx_lru_list() to return per-cgroup
>> LRU.
>>
>> At the end of [v9 08/15]:
>>
>> Both synchronous and asynchronous flows invoke the same top level  
>> reclaim
>> function, and will be triggered later by sgx_epc_cgroup_try_charge()
>> when usage of the cgroup is at or near its limit.
>>
>> At the end of [v9 10/15]:
>> Note at this point, all reclaimable EPC pages are still tracked in the
>> global LRU and per-cgroup LRUs are empty. So no per-cgroup reclamation
>> is activated yet.
>
> They are useful in the changelog in each patch I suppose, but to me we  
> are
> discussing different things.
>
> I found one pain in the review is I have to jump back and forth many  
> times among
> multiple patches to see whether one patch is reasonable.  That's why I  
> am asking
> whether there's better way to split patches so that each patch can be  
> self-
> contained logically in someway and easier to review.
>

I very much appreciate your time and effort in providing detailed review.
You have been very helpful.
If you think it makes sense, I'll split this patch into two with the
modifications stated above.

Thanks
Haitao
  
Kai Huang Feb. 22, 2024, 10:24 p.m. UTC | #12
On 23/02/2024 9:12 am, Haitao Huang wrote:
> On Wed, 21 Feb 2024 04:48:58 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
>> On Wed, 2024-02-21 at 00:23 -0600, Haitao Huang wrote:
>>> Hi Kai
>>> On Tue, 20 Feb 2024 03:52:39 -0600, Huang, Kai <kai.huang@intel.com> 
>>> wrote:
>>> [...]
>>> >
>>> > So you introduced the work/workqueue here but there's no place which
>>> > actually
>>> > queues the work.  IMHO you can either:
>>> >
>>> > 1) move relevant code change here; or
>>> > 2) focus on introducing core functions to reclaim certain pages from a
>>> > given EPC
>>> > cgroup w/o workqueue and introduce the work/workqueue in later patch.
>>> >
>>> > Makes sense?
>>> >
>>>
>>> Starting in v7, I was trying to split the big patch, #10 in v6 as you 
>>> and
>>> others suggested. My thought process was to put infrastructure needed 
>>> for
>>> per-cgroup reclaim in the front, then turn on per-cgroup reclaim in [v9
>>> 13/15] in the end.
>>
>> That's reasonable for sure.
>>
> 
> Thanks for the confirmation :-)
> 
>>>
>>> Before that, all reclaimables are tracked in the global LRU so really
>>> there is no "reclaim certain pages from a  given EPC cgroup w/o 
>>> workqueue"
>>> or reclaim through workqueue before that point, as suggested in #2. This
>>> patch puts down the implementation for both flows but neither used 
>>> yet, as
>>> stated in the commit message.
>>
>> I know it's not used yet.  The point is how to split patches to make 
>> them more
>> self-contain and easy to review.
> 
> I would think this patch already self-contained in that all are 
> implementation of cgroup reclamation building blocks utilized later. But 
> I'll try to follow your suggestions below to split further (would prefer 
> not to merge in general unless there is strong reasons).
> 
>>
>> For #2, sorry for not being explicit -- I meant it seems it's more 
>> reasonable to
>> split in this way:
>>
>> Patch 1)
>>   a). change to sgx_reclaim_pages();
> 
> I'll still prefer this to be a separate patch. It is self-contained IMHO.
> We were splitting the original patch because it was too big. I don't 
> want to merge back unless there is a strong reason.
> 
>>   b). introduce sgx_epc_cgroup_reclaim_pages();
> 
> Ok.

If I got you right, I believe you want to have a cgroup variant function
following the same behaviour as the one for global reclaim, i.e., the
_current_ sgx_reclaim_pages(), which always tries to scan and reclaim
SGX_NR_TO_SCAN pages each time.

And this cgroup variant function, sgx_epc_cgroup_reclaim_pages(), tries 
to scan and reclaim SGX_NR_TO_SCAN pages each time "_across_ the cgroup 
and all the descendants".

And you want to implement sgx_epc_cgroup_reclaim_pages() in this way due 
to WHATEVER reasons.

In that case, the change to sgx_reclaim_pages() and the introduction of
sgx_epc_cgroup_reclaim_pages() should really be together because they
are completely tied together in terms of implementation.

In this way you can just explain clearly in _ONE_ patch why you chose
this implementation, and for the reviewer it's also easier because we
can just discuss it in one patch.

Makes sense?

> 
>>   c). introduce sgx_epc_cgroup_reclaim_work_func() (use a better 
>> name),     which just takes an EPC cgroup as input w/o involving any 
>> work/workqueue.
> 
> This is for the workqueue use only. So I think it'd be better be with 
> patch #2 below?

There are multiple levels of logic here IMHO:

   1. a) and b) above focus on "each reclaim" of a given EPC cgroup
   2. c) is about a loop of the above to bring a given cgroup's usage to its limit
   3. workqueue is one (probably the best) way to do c) in an async way
   4. the logic where 1) (direct reclaim) and 3) (indirect) are triggered

To me, it's clear 1) should be in one patch as stated above.

Also, to me 3) and 4) are better together since they give you a clear view
on how the direct/indirect reclaims are triggered.

2) could be flexible depending on how you see it.  If you prefer viewing
it from the low-level implementation of reclaiming pages from a cgroup,
then it's also OK for it to be together with 1).  If you want to treat it
as a part of the _async_ way of bringing usage down to the limit, then
_MAYBE_ it's also OK for it to be with 3) and 4).

But to me 2) can be together with 1) or even a separate patch because
it's still kind of a low-level reclaiming detail.  3) and 4) shouldn't
contain such detail but should focus on how direct/indirect reclaim is
done.

[...]

> 
> To be honest, the part I find most confusing is this
> self-contained-ness. It seems to depend on how you look at things.

Completely understand.  But I think our discussion should be helpful to 
both of us and others.
  
Kai Huang Feb. 22, 2024, 10:31 p.m. UTC | #13
On 23/02/2024 6:20 am, Haitao Huang wrote:
> On Wed, 21 Feb 2024 05:00:27 -0600, Huang, Kai <kai.huang@intel.com> wrote:
> 
>> On Wed, 2024-02-21 at 00:44 -0600, Haitao Huang wrote:
>>> [...]
>>> >
>>> > Here the @nr_to_scan is reduced by the number of pages that are
>>> > isolated, but
>>> > not actually reclaimed (which is reflected by @cnt).
>>> >
>>> > IIUC, looks you want to make this function do "each cycle" as what you
>>> > mentioned
>>> > in the v8 [1]:
>>> >
>>> >     I tested with that approach and found we can only target number
>>> >     of pages attempted to reclaim not pages actually reclaimed due to
>>> >     the uncertainty of how long it takes to reclaim pages. Besides
>>> >     targeting number of scanned pages for each cycle is also what the
>>> >     ksgxd does.
>>> >
>>> >     If we target actual number of pages, sometimes it just takes too
>>> >     long. I saw more timeouts with the default time limit when
>>> >     running parallel selftests.
>>> >
>>> > I am not sure what does "sometimes it just takes too long" mean, but
>>> > what I am
>>> > thinking is you are trying to do some perfect but yet complicated code
>>> > here.
>>>
>>> I think what I observed was that the try_charge() would block too long
>>> before getting chance of schedule() to yield, causing more timeouts than
>>> necessary.
>>> I'll do some re-test to be sure.
>>
>> Looks this is a valid information that can be used to justify whatever 
>> you are
>> implementing in the EPC cgroup reclaiming function(s).
>>
> I'll add some comments. Was assuming this is just following the old 
> design as ksgxd.
> There were some comments at the beginning of 
> sgx_epc_cgroup_reclaim_pages():
>          /*
>           * Attempting to reclaim only a few pages will often fail and is
>           * inefficient, while reclaiming a huge number of pages can 
> result in
>           * soft lockups due to holding various locks for an extended 
> duration.
>           */
>          unsigned int nr_to_scan = SGX_NR_TO_SCAN;
> 
> I think it can be improved to emphasize we only "attempt" to finish 
> scanning fixed number of pages for reclamation, not enforce number of 
> pages successfully reclaimed.

Not sure it needs to be this comment, but somewhere just state that you are
trying to follow the ksgxd() (the current sgx_reclaim_pages()), but to do
it "_across_ a given cgroup and all the descendants".

That's the reason you made @nr_to_scan a pointer.
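
I.e., as I read the earlier patch in this series, the signature is roughly:

	unsigned int sgx_reclaim_pages(struct sgx_epc_lru_list *lru,
				       unsigned int *nr_to_scan);

and each call decrements *nr_to_scan by the number of pages it scans, so
the single SGX_NR_TO_SCAN budget is shared across the whole subtree walk.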

And also add some text to explain why to follow ksgxd() -- not wanting to
block for longer due to looping over descendants, etc. -- so we can focus
on discussing whether such justification is reasonable.
  

Patch

diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.c b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
index f4a37ace67d7..16b6d9f909eb 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.c
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.c
@@ -8,9 +8,180 @@ 
 /* The root EPC cgroup */
 static struct sgx_epc_cgroup epc_cg_root;
 
+/*
+ * The work queue that reclaims EPC pages in the background for cgroups.
+ *
+ * A cgroup schedules a work item into this queue to reclaim pages within the
+ * same cgroup when its usage limit is reached and synchronous reclamation is not
+ * an option, e.g., in a fault handler.
+ */
+static struct workqueue_struct *sgx_epc_cg_wq;
+
+static inline u64 sgx_epc_cgroup_page_counter_read(struct sgx_epc_cgroup *epc_cg)
+{
+	return atomic64_read(&epc_cg->cg->res[MISC_CG_RES_SGX_EPC].usage) / PAGE_SIZE;
+}
+
+static inline u64 sgx_epc_cgroup_max_pages(struct sgx_epc_cgroup *epc_cg)
+{
+	return READ_ONCE(epc_cg->cg->res[MISC_CG_RES_SGX_EPC].max) / PAGE_SIZE;
+}
+
+/*
+ * Get the lower bound of limits of a cgroup and its ancestors.  Used in
+ * sgx_epc_cgroup_reclaim_work_func() to determine if EPC usage of a cgroup is
+ * over its limit or its ancestors' hence reclamation is needed.
+ */
+static inline u64 sgx_epc_cgroup_max_pages_to_root(struct sgx_epc_cgroup *epc_cg)
+{
+	struct misc_cg *i = epc_cg->cg;
+	u64 m = U64_MAX;
+
+	while (i) {
+		m = min(m, READ_ONCE(i->res[MISC_CG_RES_SGX_EPC].max));
+		i = misc_cg_parent(i);
+	}
+
+	return m / PAGE_SIZE;
+}
+
 /**
- * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
+ * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its LRUs
+ * @root:	Root of the tree to check
  *
+ * Return: %true if all cgroups under the specified root have empty LRU lists.
+ * Used to avoid livelocks due to a cgroup having a non-zero charge count but
+ * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
+ * because all pages in the cgroup are unreclaimable.
+ */
+bool sgx_epc_cgroup_lru_empty(struct misc_cg *root)
+{
+	struct cgroup_subsys_state *css_root;
+	struct cgroup_subsys_state *pos;
+	struct sgx_epc_cgroup *epc_cg;
+	bool ret = true;
+
+	/*
+	 * Caller ensure css_root ref acquired
+	 */
+	css_root = &root->css;
+
+	rcu_read_lock();
+	css_for_each_descendant_pre(pos, css_root) {
+		if (!css_tryget(pos))
+			break;
+
+		rcu_read_unlock();
+
+		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
+
+		spin_lock(&epc_cg->lru.lock);
+		ret = list_empty(&epc_cg->lru.reclaimable);
+		spin_unlock(&epc_cg->lru.lock);
+
+		rcu_read_lock();
+		css_put(pos);
+		if (!ret)
+			break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/**
+ * sgx_epc_cgroup_reclaim_pages() - walk a cgroup tree and scan LRUs to reclaim pages
+ * @root:	Root of the tree to start walking from.
+ * Return:	Number of pages reclaimed.
+ */
+unsigned int sgx_epc_cgroup_reclaim_pages(struct misc_cg *root)
+{
+	/*
+	 * Attempting to reclaim only a few pages will often fail and is
+	 * inefficient, while reclaiming a huge number of pages can result in
+	 * soft lockups due to holding various locks for an extended duration.
+	 */
+	unsigned int nr_to_scan = SGX_NR_TO_SCAN;
+	struct cgroup_subsys_state *css_root;
+	struct cgroup_subsys_state *pos;
+	struct sgx_epc_cgroup *epc_cg;
+	unsigned int cnt;
+
+	 /* Caller ensure css_root ref acquired */
+	css_root = &root->css;
+
+	cnt = 0;
+	rcu_read_lock();
+	css_for_each_descendant_pre(pos, css_root) {
+		if (!css_tryget(pos))
+			break;
+		rcu_read_unlock();
+
+		epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
+		cnt += sgx_reclaim_pages(&epc_cg->lru, &nr_to_scan);
+
+		rcu_read_lock();
+		css_put(pos);
+		if (!nr_to_scan)
+			break;
+	}
+
+	rcu_read_unlock();
+	return cnt;
+}
+
+/*
+ * Scheduled by sgx_epc_cgroup_try_charge() to reclaim pages from the cgroup
+ * when the cgroup is at/near its maximum capacity
+ */
+static void sgx_epc_cgroup_reclaim_work_func(struct work_struct *work)
+{
+	struct sgx_epc_cgroup *epc_cg;
+	u64 cur, max;
+
+	epc_cg = container_of(work, struct sgx_epc_cgroup, reclaim_work);
+
+	for (;;) {
+		max = sgx_epc_cgroup_max_pages_to_root(epc_cg);
+
+		/*
+		 * Adjust the limit down by one page, the goal is to free up
+		 * pages for fault allocations, not to simply obey the limit.
+		 * Conditionally decrementing max also means the cur vs. max
+		 * check will correctly handle the case where both are zero.
+		 */
+		if (max)
+			max--;
+
+		/*
+		 * Unless the limit is extremely low, in which case forcing
+		 * reclaim will likely cause thrashing, force the cgroup to
+		 * reclaim at least once if it's operating *near* its maximum
+		 * limit by adjusting @max down by half the min reclaim size.
+		 * This work func is scheduled by sgx_epc_cgroup_try_charge
+		 * when it cannot directly reclaim due to being in an atomic
+		 * context, e.g. EPC allocation in a fault handler.  Waiting
+		 * to reclaim until the cgroup is actually at its limit is less
+		 * performant as it means the faulting task is effectively
+		 * blocked until a worker makes its way through the global work
+		 * queue.
+		 */
+		if (max > SGX_NR_TO_SCAN * 2)
+			max -= (SGX_NR_TO_SCAN / 2);
+
+		cur = sgx_epc_cgroup_page_counter_read(epc_cg);
+
+		if (cur <= max || sgx_epc_cgroup_lru_empty(epc_cg->cg))
+			break;
+
+		/* Keep reclaiming until above condition is met. */
+		sgx_epc_cgroup_reclaim_pages(epc_cg->cg);
+	}
+}
+
+/**
+ * sgx_epc_cgroup_try_charge() - try to charge cgroup for a single EPC page
  * @epc_cg:	The EPC cgroup to be charged for the page.
  * Return:
  * * %0 - If successfully charged.
@@ -38,6 +209,7 @@  static void sgx_epc_cgroup_free(struct misc_cg *cg)
 	if (!epc_cg)
 		return;
 
+	cancel_work_sync(&epc_cg->reclaim_work);
 	kfree(epc_cg);
 }
 
@@ -50,6 +222,8 @@  const struct misc_res_ops sgx_epc_cgroup_ops = {
 
 static void sgx_epc_misc_init(struct misc_cg *cg, struct sgx_epc_cgroup *epc_cg)
 {
+	sgx_lru_init(&epc_cg->lru);
+	INIT_WORK(&epc_cg->reclaim_work, sgx_epc_cgroup_reclaim_work_func);
 	cg->res[MISC_CG_RES_SGX_EPC].priv = epc_cg;
 	epc_cg->cg = cg;
 }
@@ -69,6 +243,11 @@  static int sgx_epc_cgroup_alloc(struct misc_cg *cg)
 
 void sgx_epc_cgroup_init(void)
 {
+	sgx_epc_cg_wq = alloc_workqueue("sgx_epc_cg_wq",
+					WQ_UNBOUND | WQ_FREEZABLE,
+					WQ_UNBOUND_MAX_ACTIVE);
+	BUG_ON(!sgx_epc_cg_wq);
+
 	misc_cg_set_ops(MISC_CG_RES_SGX_EPC, &sgx_epc_cgroup_ops);
 	sgx_epc_misc_init(misc_cg_root(), &epc_cg_root);
 }
diff --git a/arch/x86/kernel/cpu/sgx/epc_cgroup.h b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
index 6b664b4c321f..e3c6a08f0ee8 100644
--- a/arch/x86/kernel/cpu/sgx/epc_cgroup.h
+++ b/arch/x86/kernel/cpu/sgx/epc_cgroup.h
@@ -34,6 +34,8 @@  static inline void sgx_epc_cgroup_init(void) { }
 #else
 struct sgx_epc_cgroup {
 	struct misc_cg *cg;
+	struct sgx_epc_lru_list lru;
+	struct work_struct reclaim_work;
 };
 
 static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct misc_cg *cg)
@@ -66,6 +68,7 @@  static inline void sgx_put_epc_cg(struct sgx_epc_cgroup *epc_cg)
 
 int sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg);
 void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg);
+bool sgx_epc_cgroup_lru_empty(struct misc_cg *root);
 void sgx_epc_cgroup_init(void);
 
 #endif