[v3,2/5] zswap: make shrinking memcg-aware

Message ID 20231017232152.2605440-3-nphamcs@gmail.com
State New
Headers
Series workload-specific and memory pressure-driven zswap writeback |

Commit Message

Nhat Pham Oct. 17, 2023, 11:21 p.m. UTC
  From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>

Currently, we only have a single global LRU for zswap. This makes it
impossible to perform worload-specific shrinking - an memcg cannot
determine which pages in the pool it owns, and often ends up writing
pages from other memcgs. This issue has been previously observed in
practice and mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

This patch fully resolves the issue by replacing the global zswap LRU
with memcg- and NUMA-specific LRUs, and modify the reclaim logic:

a) When a store attempt hits an memcg limit, it now triggers a
   synchronous reclaim attempt that, if successful, allows the new
   hotter page to be accepted by zswap.
b) If the store attempt instead hits the global zswap limit, it will
   trigger an asynchronous reclaim attempt, in which an memcg is
   selected for reclaim in a round-robin-like fashion.

Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Co-developed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
 include/linux/memcontrol.h |   5 ++
 mm/swap.h                  |   3 +-
 mm/swap_state.c            |  17 +++-
 mm/zswap.c                 | 179 ++++++++++++++++++++++++++-----------
 4 files changed, 147 insertions(+), 57 deletions(-)
  

Comments

Yosry Ahmed Oct. 18, 2023, 11:20 p.m. UTC | #1
On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
>
> Currently, we only have a single global LRU for zswap. This makes it
> impossible to perform worload-specific shrinking - an memcg cannot
> determine which pages in the pool it owns, and often ends up writing
> pages from other memcgs. This issue has been previously observed in
> practice and mitigated by simply disabling memcg-initiated shrinking:
>
> https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
>
> This patch fully resolves the issue by replacing the global zswap LRU
> with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
>
> a) When a store attempt hits an memcg limit, it now triggers a
>    synchronous reclaim attempt that, if successful, allows the new
>    hotter page to be accepted by zswap.
> b) If the store attempt instead hits the global zswap limit, it will
>    trigger an asynchronous reclaim attempt, in which an memcg is
>    selected for reclaim in a round-robin-like fashion.

Could you explain the rationale behind the difference in behavior here
between the global limit and the memcg limit?

>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> Co-developed-by: Nhat Pham <nphamcs@gmail.com>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
>  include/linux/memcontrol.h |   5 ++
>  mm/swap.h                  |   3 +-
>  mm/swap_state.c            |  17 +++-
>  mm/zswap.c                 | 179 ++++++++++++++++++++++++++-----------
>  4 files changed, 147 insertions(+), 57 deletions(-)

This is a dense patch, I haven't absorbed all of it yet, but the first
round of comments below.

>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 031102ac9311..3de10fabea0f 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1179,6 +1179,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
>         return NULL;
>  }
>
> +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
> +{
> +       return NULL;
> +}
> +
>  static inline bool folio_memcg_kmem(struct folio *folio)
>  {
>         return false;
> diff --git a/mm/swap.h b/mm/swap.h
> index 8a3c7a0ace4f..bbd6ce661a20 100644
> --- a/mm/swap.h
> +++ b/mm/swap.h
> @@ -50,7 +50,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>                                      struct vm_area_struct *vma,
>                                      unsigned long addr,
> -                                    bool *new_page_allocated);
> +                                    bool *new_page_allocated,
> +                                    bool fail_if_exists);
>  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
>                                     struct vm_fault *vmf);
>  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index b3b14bd0dd64..0356df52b06a 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -411,7 +411,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
>
>  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>                         struct vm_area_struct *vma, unsigned long addr,
> -                       bool *new_page_allocated)
> +                       bool *new_page_allocated, bool fail_if_exists)

nit: I don't feel like "fail" is the correct word here. Perhaps "skip"?

>  {
>         struct swap_info_struct *si;
>         struct folio *folio;
> @@ -468,6 +468,15 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>                 if (err != -EEXIST)
>                         goto fail_put_swap;
>
> +               /*
> +                * This check guards against a state that happens if a call
> +                * to __read_swap_cache_async triggers a reclaim, if the
> +                * reclaimer (zswap's writeback as of now) then decides to
> +                * reclaim that same entry, then the subsequent call to
> +                * __read_swap_cache_async would get stuck in this loop.

I think this comment needs to first state that it is protecting
against a recursive call in general, not necessarily in reclaim, as
__read_swap_cache_async() is not usually called in the context of
reclaim so this can be confusing. Then it can give the exact example
we have today. Perhaps something like:

Protect against a recursive call to __read_swap_cache_async() on the
same entry waiting forever here because SWAP_HAS_CACHE is set but the
folio is not the swap cache yet. This can happen today if
mem_cgroup_swapin_charge_folio() below triggers reclaim through zswap,
which may call __read_swap_cache_async() in the writeback path.

> +                */
> +               if (fail_if_exists && err == -EEXIST)

We already made sure  in the preceding condition that err is -EEXIST.

> +                       goto fail_put_swap;
>                 /*
>                  * We might race against __delete_from_swap_cache(), and
>                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
> @@ -530,7 +539,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>  {
>         bool page_was_allocated;
>         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
> -                       vma, addr, &page_was_allocated);
> +                       vma, addr, &page_was_allocated, false);
>
>         if (page_was_allocated)
>                 swap_readpage(retpage, false, plug);
> @@ -649,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
>                 /* Ok, do the async read-ahead now */
>                 page = __read_swap_cache_async(
>                         swp_entry(swp_type(entry), offset),
> -                       gfp_mask, vma, addr, &page_allocated);
> +                       gfp_mask, vma, addr, &page_allocated, false);
>                 if (!page)
>                         continue;
>                 if (page_allocated) {
> @@ -815,7 +824,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
>                 pte_unmap(pte);
>                 pte = NULL;
>                 page = __read_swap_cache_async(entry, gfp_mask, vma,
> -                                              addr, &page_allocated);
> +                                              addr, &page_allocated, false);
>                 if (!page)
>                         continue;
>                 if (page_allocated) {
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 083c693602b8..d2989ad11814 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -34,6 +34,7 @@
>  #include <linux/writeback.h>
>  #include <linux/pagemap.h>
>  #include <linux/workqueue.h>
> +#include <linux/list_lru.h>
>
>  #include "swap.h"
>  #include "internal.h"
> @@ -171,8 +172,8 @@ struct zswap_pool {
>         struct work_struct shrink_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> -       struct list_head lru;
> -       spinlock_t lru_lock;
> +       struct list_lru list_lru;
> +       struct mem_cgroup *next_shrink;
>  };
>
>  /*
> @@ -288,15 +289,25 @@ static void zswap_update_total_size(void)
>         zswap_pool_total_size = total;
>  }
>
> +static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
> +{
> +       return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL;
> +}
> +
> +static inline int entry_to_nid(struct zswap_entry *entry)
> +{
> +       return page_to_nid(virt_to_page(entry));
> +}
> +
>  /*********************************
>  * zswap entry functions
>  **********************************/
>  static struct kmem_cache *zswap_entry_cache;
>
> -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
> +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
>  {
>         struct zswap_entry *entry;
> -       entry = kmem_cache_alloc(zswap_entry_cache, gfp);
> +       entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
>         if (!entry)
>                 return NULL;
>         entry->refcount = 1;
> @@ -309,6 +320,27 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
>         kmem_cache_free(zswap_entry_cache, entry);
>  }
>
> +/*********************************
> +* lru functions
> +**********************************/
> +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> +{
> +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);

Could we avoid the need for get/put with an rcu_read_lock() instead?

> +       bool added = __list_lru_add(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> +
> +       mem_cgroup_put(memcg);
> +       return added;
> +}
> +
> +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> +{
> +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> +       bool removed = __list_lru_del(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> +
> +       mem_cgroup_put(memcg);
> +       return removed;
> +}
> +
>  /*********************************
>  * rbtree functions
>  **********************************/
> @@ -393,9 +425,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
>         if (!entry->length)
>                 atomic_dec(&zswap_same_filled_pages);
>         else {
> -               spin_lock(&entry->pool->lru_lock);
> -               list_del(&entry->lru);
> -               spin_unlock(&entry->pool->lru_lock);
> +               zswap_lru_del(&entry->pool->list_lru, entry);
>                 zpool_free(zswap_find_zpool(entry), entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
> @@ -629,21 +659,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
>                 zswap_entry_put(tree, entry);
>  }
>
> -static int zswap_reclaim_entry(struct zswap_pool *pool)
> +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
> +                                      spinlock_t *lock, void *arg)
>  {
> -       struct zswap_entry *entry;
> +       struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> +       struct mem_cgroup *memcg;
>         struct zswap_tree *tree;
>         pgoff_t swpoffset;
> -       int ret;
> +       enum lru_status ret = LRU_REMOVED_RETRY;
> +       int writeback_result;
>
> -       /* Get an entry off the LRU */
> -       spin_lock(&pool->lru_lock);
> -       if (list_empty(&pool->lru)) {
> -               spin_unlock(&pool->lru_lock);
> -               return -EINVAL;
> -       }
> -       entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> -       list_del_init(&entry->lru);
>         /*
>          * Once the lru lock is dropped, the entry might get freed. The
>          * swpoffset is copied to the stack, and entry isn't deref'd again
> @@ -651,28 +676,33 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
>          */
>         swpoffset = swp_offset(entry->swpentry);
>         tree = zswap_trees[swp_type(entry->swpentry)];
> -       spin_unlock(&pool->lru_lock);
> +       list_lru_isolate(l, item);
> +       spin_unlock(lock);

Perhaps a comment somewhere stating that we only return either
LRU_REMOVED_RETRY or LRU_RETRY, so it's fine to drop and reacquire the
lock.

>
>         /* Check for invalidate() race */
>         spin_lock(&tree->lock);
>         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> -               ret = -EAGAIN;
>                 goto unlock;
>         }

nit: braces no longer needed?

>         /* Hold a reference to prevent a free during writeback */
>         zswap_entry_get(entry);
>         spin_unlock(&tree->lock);
>
> -       ret = zswap_writeback_entry(entry, tree);
> +       writeback_result = zswap_writeback_entry(entry, tree);
>
>         spin_lock(&tree->lock);
> -       if (ret) {
> -               /* Writeback failed, put entry back on LRU */
> -               spin_lock(&pool->lru_lock);
> -               list_move(&entry->lru, &pool->lru);
> -               spin_unlock(&pool->lru_lock);
> +       if (writeback_result) {
> +               zswap_reject_reclaim_fail++;
> +               memcg = get_mem_cgroup_from_entry(entry);
> +               spin_lock(lock);
> +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> +               spin_unlock(lock);
> +               mem_cgroup_put(memcg);
> +               ret = LRU_RETRY;
>                 goto put_unlock;
>         }
> +       zswap_written_back_pages++;

Why is this moved here from zswap_writeback_entry()? Also why is
zswap_reject_reclaim_fail incremented here instead of inside
zswap_writeback_entry()?

>
>         /*
>          * Writeback started successfully, the page now belongs to the
> @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
>         zswap_entry_put(tree, entry);
>  unlock:
>         spin_unlock(&tree->lock);
> -       return ret ? -EAGAIN : 0;
> +       spin_lock(lock);
> +       return ret;
> +}
> +
> +static int shrink_memcg(struct mem_cgroup *memcg)
> +{
> +       struct zswap_pool *pool;
> +       int nid, shrunk = 0;
> +
> +       pool = zswap_pool_current_get();
> +       if (!pool)
> +               return -EINVAL;
> +
> +       /*
> +        * Skip zombies because their LRUs are reparented and we would be
> +        * reclaiming from the parent instead of the dead memcgroup.

nit: s/memcgroup/memcg.

> +        */
> +       if (memcg && !mem_cgroup_online(memcg))
> +               goto out;

If we move this above zswap_pool_current_get(), we can return directly
and remove the label. I noticed we will return -EAGAIN if memcg is
offline. IIUC -EAGAIN for the caller will move on to the next memcg,
but I am wondering if a different errno would be clearer here.

> +
> +       for_each_node_state(nid, N_NORMAL_MEMORY) {
> +               unsigned long nr_to_walk = 1;
> +
> +               if (list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb,
> +                                     NULL, &nr_to_walk))
> +                       shrunk++;

nit:
shrunk += list_lru_walk_one(..);

> +       }
> +out:
> +       zswap_pool_put(pool);
> +       return shrunk ? 0 : -EAGAIN;
>  }
>
>  static void shrink_worker(struct work_struct *w)
> @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
>                                                 shrink_work);
>         int ret, failures = 0;
>
> +       /* global reclaim will select cgroup in a round-robin fashion. */
>         do {
> -               ret = zswap_reclaim_entry(pool);
> +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);

Perhaps next_shrink_memcg is a better name here?

> +
> +               ret = shrink_memcg(pool->next_shrink);
> +
>                 if (ret) {
> -                       zswap_reject_reclaim_fail++;
>                         if (ret != -EAGAIN)
>                                 break;
>                         if (++failures == MAX_RECLAIM_RETRIES)
> @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>          */
>         kref_init(&pool->kref);
>         INIT_LIST_HEAD(&pool->list);
> -       INIT_LIST_HEAD(&pool->lru);
> -       spin_lock_init(&pool->lru_lock);
> +       list_lru_init_memcg(&pool->list_lru, NULL);
>         INIT_WORK(&pool->shrink_work, shrink_worker);
>
>         zswap_pool_debug("created", pool);
> @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
>
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>         free_percpu(pool->acomp_ctx);
> +       list_lru_destroy(&pool->list_lru);
> +       if (pool->next_shrink)
> +               mem_cgroup_put(pool->next_shrink);
>         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
>                 zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
> @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>
>         /* try to allocate swap cache page */
>         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> -                                      &page_was_allocated);
> +                                      &page_was_allocated, true);
>         if (!page) {
>                 ret = -ENOMEM;
>                 goto fail;
> @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>         /* start writeback */
>         __swap_writepage(page, &wbc);
>         put_page(page);
> -       zswap_written_back_pages++;
>
>         return ret;
>
> @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
>         struct obj_cgroup *objcg = NULL;
> +       struct mem_cgroup *memcg = NULL;
>         struct zswap_pool *pool;
>         struct zpool *zpool;
> +       int lru_alloc_ret;
>         unsigned int dlen = PAGE_SIZE;
>         unsigned long handle, value;
>         char *buf;
> @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
>                 zswap_invalidate_entry(tree, dupentry);
>         }
>         spin_unlock(&tree->lock);
> -
> -       /*
> -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> -        * local cgroup limits.
> -        */
>         objcg = get_obj_cgroup_from_folio(folio);
> -       if (objcg && !obj_cgroup_may_zswap(objcg))
> -               goto reject;
> +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> +               memcg = get_mem_cgroup_from_objcg(objcg);
> +               if (shrink_memcg(memcg)) {
> +                       mem_cgroup_put(memcg);
> +                       goto reject;
> +               }
> +               mem_cgroup_put(memcg);
> +       }
>
>         /* reclaim space if needed */
>         if (zswap_is_full()) {
> @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
>                         zswap_pool_reached_full = false;
>         }
>
> +       pool = zswap_pool_current_get();
> +       if (!pool)
> +               goto reject;
> +

Why do we need to move zswap_pool_current_get() up here?

>         /* allocate entry */
> -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
>         if (!entry) {
>                 zswap_reject_kmemcache_fail++;
> +               zswap_pool_put(pool);
>                 goto reject;
>         }
>
> @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
>                         entry->length = 0;
>                         entry->value = value;
>                         atomic_inc(&zswap_same_filled_pages);
> +                       zswap_pool_put(pool);
>                         goto insert_entry;
>                 }
>                 kunmap_atomic(src);
> @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
>                 goto freepage;
>
>         /* if entry is successfully added, it keeps the reference */
> -       entry->pool = zswap_pool_current_get();
> -       if (!entry->pool)
> -               goto freepage;
> +       entry->pool = pool;
> +       if (objcg) {
> +               memcg = get_mem_cgroup_from_objcg(objcg);
> +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> +               mem_cgroup_put(memcg);
> +
> +               if (lru_alloc_ret)
> +                       goto freepage;
> +       }
>
>         /* compress */
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
>                 zswap_invalidate_entry(tree, dupentry);
>         }
>         if (entry->length) {
> -               spin_lock(&entry->pool->lru_lock);
> -               list_add(&entry->lru, &entry->pool->lru);
> -               spin_unlock(&entry->pool->lru_lock);
> +               INIT_LIST_HEAD(&entry->lru);
> +               zswap_lru_add(&pool->list_lru, entry);
>         }
>         spin_unlock(&tree->lock);
>
> @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
>
>  put_dstmem:
>         mutex_unlock(acomp_ctx->mutex);
> -       zswap_pool_put(entry->pool);
>  freepage:
> +       zswap_pool_put(entry->pool);
>         zswap_entry_cache_free(entry);
>  reject:
>         if (objcg)
> @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
>                 zswap_invalidate_entry(tree, entry);
>                 folio_mark_dirty(folio);
>         } else if (entry->length) {
> -               spin_lock(&entry->pool->lru_lock);
> -               list_move(&entry->lru, &entry->pool->lru);
> -               spin_unlock(&entry->pool->lru_lock);
> +               zswap_lru_del(&entry->pool->list_lru, entry);
> +               zswap_lru_add(&entry->pool->list_lru, entry);
>         }
>         zswap_entry_put(tree, entry);
>         spin_unlock(&tree->lock);
> --
> 2.34.1
  
Nhat Pham Oct. 18, 2023, 11:46 p.m. UTC | #2
On Wed, Oct 18, 2023 at 4:20 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >
> > Currently, we only have a single global LRU for zswap. This makes it
> > impossible to perform worload-specific shrinking - an memcg cannot
> > determine which pages in the pool it owns, and often ends up writing
> > pages from other memcgs. This issue has been previously observed in
> > practice and mitigated by simply disabling memcg-initiated shrinking:
> >
> > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> >
> > This patch fully resolves the issue by replacing the global zswap LRU
> > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> >
> > a) When a store attempt hits an memcg limit, it now triggers a
> >    synchronous reclaim attempt that, if successful, allows the new
> >    hotter page to be accepted by zswap.
> > b) If the store attempt instead hits the global zswap limit, it will
> >    trigger an asynchronous reclaim attempt, in which an memcg is
> >    selected for reclaim in a round-robin-like fashion.
>
> Could you explain the rationale behind the difference in behavior here
> between the global limit and the memcg limit?

The global limit hit reclaim behavior was previously asynchronous too.
We just added the round-robin part because now the zswap LRU is
cgroup-aware :)

For the cgroup limit hit, however, we cannot make it asynchronous,
as it is a bit hairy to add a per-cgroup shrink_work. So, we just
perform the reclaim synchronously.

The question is whether it makes sense to make the global limit
reclaim synchronous too. That is a task of its own IMO.

(FWIW, this somewhat mirrors the direct reclaimer v.s kswapd
story to me, but don't quote me too hard on this).

>
> >
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > Co-developed-by: Nhat Pham <nphamcs@gmail.com>
> > Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> > ---
> >  include/linux/memcontrol.h |   5 ++
> >  mm/swap.h                  |   3 +-
> >  mm/swap_state.c            |  17 +++-
> >  mm/zswap.c                 | 179 ++++++++++++++++++++++++++-----------
> >  4 files changed, 147 insertions(+), 57 deletions(-)
>
> This is a dense patch, I haven't absorbed all of it yet, but the first
> round of comments below.

Regardless, thanks for the feedback, Yosry! Domenico definitely
knows more than me about this, but I'll respond with what I know,
and he can expand and/or fact-check me :)

>
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 031102ac9311..3de10fabea0f 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -1179,6 +1179,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> >         return NULL;
> >  }
> >
> > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
> > +{
> > +       return NULL;
> > +}
> > +
> >  static inline bool folio_memcg_kmem(struct folio *folio)
> >  {
> >         return false;
> > diff --git a/mm/swap.h b/mm/swap.h
> > index 8a3c7a0ace4f..bbd6ce661a20 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -50,7 +50,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                                      struct vm_area_struct *vma,
> >                                      unsigned long addr,
> > -                                    bool *new_page_allocated);
> > +                                    bool *new_page_allocated,
> > +                                    bool fail_if_exists);
> >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
> >                                     struct vm_fault *vmf);
> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b3b14bd0dd64..0356df52b06a 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -411,7 +411,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
> >
> >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                         struct vm_area_struct *vma, unsigned long addr,
> > -                       bool *new_page_allocated)
> > +                       bool *new_page_allocated, bool fail_if_exists)
>
> nit: I don't feel like "fail" is the correct word here. Perhaps "skip"?
>
> >  {
> >         struct swap_info_struct *si;
> >         struct folio *folio;
> > @@ -468,6 +468,15 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                 if (err != -EEXIST)
> >                         goto fail_put_swap;
> >
> > +               /*
> > +                * This check guards against a state that happens if a call
> > +                * to __read_swap_cache_async triggers a reclaim, if the
> > +                * reclaimer (zswap's writeback as of now) then decides to
> > +                * reclaim that same entry, then the subsequent call to
> > +                * __read_swap_cache_async would get stuck in this loop.
>
> I think this comment needs to first state that it is protecting
> against a recursive call in general, not necessarily in reclaim, as
> __read_swap_cache_async() is not usually called in the context of
> reclaim so this can be confusing. Then it can give the exact example
> we have today. Perhaps something like:
>
> Protect against a recursive call to __read_swap_cache_async() on the
> same entry waiting forever here because SWAP_HAS_CACHE is set but the
> folio is not the swap cache yet. This can happen today if
> mem_cgroup_swapin_charge_folio() below triggers reclaim through zswap,
> which may call __read_swap_cache_async() in the writeback path.
>
> > +                */
> > +               if (fail_if_exists && err == -EEXIST)
>
> We already made sure  in the preceding condition that err is -EEXIST.
>
> > +                       goto fail_put_swap;
> >                 /*
> >                  * We might race against __delete_from_swap_cache(), and
> >                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
> > @@ -530,7 +539,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >  {
> >         bool page_was_allocated;
> >         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
> > -                       vma, addr, &page_was_allocated);
> > +                       vma, addr, &page_was_allocated, false);
> >
> >         if (page_was_allocated)
> >                 swap_readpage(retpage, false, plug);
> > @@ -649,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >                 /* Ok, do the async read-ahead now */
> >                 page = __read_swap_cache_async(
> >                         swp_entry(swp_type(entry), offset),
> > -                       gfp_mask, vma, addr, &page_allocated);
> > +                       gfp_mask, vma, addr, &page_allocated, false);
> >                 if (!page)
> >                         continue;
> >                 if (page_allocated) {
> > @@ -815,7 +824,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
> >                 pte_unmap(pte);
> >                 pte = NULL;
> >                 page = __read_swap_cache_async(entry, gfp_mask, vma,
> > -                                              addr, &page_allocated);
> > +                                              addr, &page_allocated, false);
> >                 if (!page)
> >                         continue;
> >                 if (page_allocated) {
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 083c693602b8..d2989ad11814 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -34,6 +34,7 @@
> >  #include <linux/writeback.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/workqueue.h>
> > +#include <linux/list_lru.h>
> >
> >  #include "swap.h"
> >  #include "internal.h"
> > @@ -171,8 +172,8 @@ struct zswap_pool {
> >         struct work_struct shrink_work;
> >         struct hlist_node node;
> >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > -       struct list_head lru;
> > -       spinlock_t lru_lock;
> > +       struct list_lru list_lru;
> > +       struct mem_cgroup *next_shrink;
> >  };
> >
> >  /*
> > @@ -288,15 +289,25 @@ static void zswap_update_total_size(void)
> >         zswap_pool_total_size = total;
> >  }
> >
> > +static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
> > +{
> > +       return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL;
> > +}
> > +
> > +static inline int entry_to_nid(struct zswap_entry *entry)
> > +{
> > +       return page_to_nid(virt_to_page(entry));
> > +}
> > +
> >  /*********************************
> >  * zswap entry functions
> >  **********************************/
> >  static struct kmem_cache *zswap_entry_cache;
> >
> > -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
> > +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
> >  {
> >         struct zswap_entry *entry;
> > -       entry = kmem_cache_alloc(zswap_entry_cache, gfp);
> > +       entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
> >         if (!entry)
> >                 return NULL;
> >         entry->refcount = 1;
> > @@ -309,6 +320,27 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> >         kmem_cache_free(zswap_entry_cache, entry);
> >  }
> >
> > +/*********************************
> > +* lru functions
> > +**********************************/
> > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > +{
> > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
>
> Could we avoid the need for get/put with an rcu_read_lock() instead?
>
> > +       bool added = __list_lru_add(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > +
> > +       mem_cgroup_put(memcg);
> > +       return added;
> > +}
> > +
> > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> > +{
> > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> > +       bool removed = __list_lru_del(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > +
> > +       mem_cgroup_put(memcg);
> > +       return removed;
> > +}
> > +
> >  /*********************************
> >  * rbtree functions
> >  **********************************/
> > @@ -393,9 +425,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >         if (!entry->length)
> >                 atomic_dec(&zswap_same_filled_pages);
> >         else {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_del(&entry->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               zswap_lru_del(&entry->pool->list_lru, entry);
> >                 zpool_free(zswap_find_zpool(entry), entry->handle);
> >                 zswap_pool_put(entry->pool);
> >         }
> > @@ -629,21 +659,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
> >                 zswap_entry_put(tree, entry);
> >  }
> >
> > -static int zswap_reclaim_entry(struct zswap_pool *pool)
> > +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
> > +                                      spinlock_t *lock, void *arg)
> >  {
> > -       struct zswap_entry *entry;
> > +       struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> > +       struct mem_cgroup *memcg;
> >         struct zswap_tree *tree;
> >         pgoff_t swpoffset;
> > -       int ret;
> > +       enum lru_status ret = LRU_REMOVED_RETRY;
> > +       int writeback_result;
> >
> > -       /* Get an entry off the LRU */
> > -       spin_lock(&pool->lru_lock);
> > -       if (list_empty(&pool->lru)) {
> > -               spin_unlock(&pool->lru_lock);
> > -               return -EINVAL;
> > -       }
> > -       entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > -       list_del_init(&entry->lru);
> >         /*
> >          * Once the lru lock is dropped, the entry might get freed. The
> >          * swpoffset is copied to the stack, and entry isn't deref'd again
> > @@ -651,28 +676,33 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> >          */
> >         swpoffset = swp_offset(entry->swpentry);
> >         tree = zswap_trees[swp_type(entry->swpentry)];
> > -       spin_unlock(&pool->lru_lock);
> > +       list_lru_isolate(l, item);
> > +       spin_unlock(lock);
>
> Perhaps a comment somewhere stating that we only return either
> LRU_REMOVED_RETRY or LRU_RETRY, so it's fine to drop and reacquire the
> lock.
>
> >
> >         /* Check for invalidate() race */
> >         spin_lock(&tree->lock);
> >         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> > -               ret = -EAGAIN;
> >                 goto unlock;
> >         }
>
> nit: braces no longer needed?

Ah, for some reason checkpatch did not pick up on this.
Weird.

>
> >         /* Hold a reference to prevent a free during writeback */
> >         zswap_entry_get(entry);
> >         spin_unlock(&tree->lock);
> >
> > -       ret = zswap_writeback_entry(entry, tree);
> > +       writeback_result = zswap_writeback_entry(entry, tree);
> >
> >         spin_lock(&tree->lock);
> > -       if (ret) {
> > -               /* Writeback failed, put entry back on LRU */
> > -               spin_lock(&pool->lru_lock);
> > -               list_move(&entry->lru, &pool->lru);
> > -               spin_unlock(&pool->lru_lock);
> > +       if (writeback_result) {
> > +               zswap_reject_reclaim_fail++;
> > +               memcg = get_mem_cgroup_from_entry(entry);
> > +               spin_lock(lock);
> > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > +               spin_unlock(lock);
> > +               mem_cgroup_put(memcg);
> > +               ret = LRU_RETRY;
> >                 goto put_unlock;
> >         }
> > +       zswap_written_back_pages++;
>
> Why is this moved here from zswap_writeback_entry()? Also why is
> zswap_reject_reclaim_fail incremented here instead of inside
> zswap_writeback_entry()?

Domenico should know this better than me, but my understanding
is that moving it here protects concurrent modifications of
zswap_written_back_pages with the tree lock.

Is writeback single-threaded in the past? This counter is non-atomic,
and doesn't seem to be protected by any locks...

There definitely can be concurrent stores now though - with
a synchronous reclaim from cgroup-limit hit and another
from the old shrink worker.

(and with the new zswap shrinker, concurrent reclaim is
the expectation!)

zswap_reject_reclaim_fail was previously incremented in
shrink_worker I think. We need it to be incremented
for the shrinker as well, so might as well move it here.

>
> >
> >         /*
> >          * Writeback started successfully, the page now belongs to the
> > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> >         zswap_entry_put(tree, entry);
> >  unlock:
> >         spin_unlock(&tree->lock);
> > -       return ret ? -EAGAIN : 0;
> > +       spin_lock(lock);
> > +       return ret;
> > +}
> > +
> > +static int shrink_memcg(struct mem_cgroup *memcg)
> > +{
> > +       struct zswap_pool *pool;
> > +       int nid, shrunk = 0;
> > +
> > +       pool = zswap_pool_current_get();
> > +       if (!pool)
> > +               return -EINVAL;
> > +
> > +       /*
> > +        * Skip zombies because their LRUs are reparented and we would be
> > +        * reclaiming from the parent instead of the dead memcgroup.
>
> nit: s/memcgroup/memcg.
>
> > +        */
> > +       if (memcg && !mem_cgroup_online(memcg))
> > +               goto out;
>
> If we move this above zswap_pool_current_get(), we can return directly
> and remove the label. I noticed we will return -EAGAIN if memcg is
> offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> but I am wondering if a different errno would be clearer here.
>
> > +
> > +       for_each_node_state(nid, N_NORMAL_MEMORY) {
> > +               unsigned long nr_to_walk = 1;
> > +
> > +               if (list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb,
> > +                                     NULL, &nr_to_walk))
> > +                       shrunk++;
>
> nit:
> shrunk += list_lru_walk_one(..);

yeah might be a tad cleaner.

>
> > +       }
> > +out:
> > +       zswap_pool_put(pool);
> > +       return shrunk ? 0 : -EAGAIN;
> >  }
> >
> >  static void shrink_worker(struct work_struct *w)
> > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> >                                                 shrink_work);
> >         int ret, failures = 0;
> >
> > +       /* global reclaim will select cgroup in a round-robin fashion. */
> >         do {
> > -               ret = zswap_reclaim_entry(pool);
> > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
>
> Perhaps next_shrink_memcg is a better name here?
>
> > +
> > +               ret = shrink_memcg(pool->next_shrink);
> > +
> >                 if (ret) {
> > -                       zswap_reject_reclaim_fail++;
> >                         if (ret != -EAGAIN)
> >                                 break;
> >                         if (++failures == MAX_RECLAIM_RETRIES)
> > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >          */
> >         kref_init(&pool->kref);
> >         INIT_LIST_HEAD(&pool->list);
> > -       INIT_LIST_HEAD(&pool->lru);
> > -       spin_lock_init(&pool->lru_lock);
> > +       list_lru_init_memcg(&pool->list_lru, NULL);
> >         INIT_WORK(&pool->shrink_work, shrink_worker);
> >
> >         zswap_pool_debug("created", pool);
> > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> >
> >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> >         free_percpu(pool->acomp_ctx);
> > +       list_lru_destroy(&pool->list_lru);
> > +       if (pool->next_shrink)
> > +               mem_cgroup_put(pool->next_shrink);
> >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> >                 zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >
> >         /* try to allocate swap cache page */
> >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > -                                      &page_was_allocated);
> > +                                      &page_was_allocated, true);
> >         if (!page) {
> >                 ret = -ENOMEM;
> >                 goto fail;
> > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >         /* start writeback */
> >         __swap_writepage(page, &wbc);
> >         put_page(page);
> > -       zswap_written_back_pages++;
> >
> >         return ret;
> >
> > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         struct obj_cgroup *objcg = NULL;
> > +       struct mem_cgroup *memcg = NULL;
> >         struct zswap_pool *pool;
> >         struct zpool *zpool;
> > +       int lru_alloc_ret;
> >         unsigned int dlen = PAGE_SIZE;
> >         unsigned long handle, value;
> >         char *buf;
> > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> >                 zswap_invalidate_entry(tree, dupentry);
> >         }
> >         spin_unlock(&tree->lock);
> > -
> > -       /*
> > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > -        * local cgroup limits.
> > -        */
> >         objcg = get_obj_cgroup_from_folio(folio);
> > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > -               goto reject;
> > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > +               if (shrink_memcg(memcg)) {
> > +                       mem_cgroup_put(memcg);
> > +                       goto reject;
> > +               }
> > +               mem_cgroup_put(memcg);
> > +       }
> >
> >         /* reclaim space if needed */
> >         if (zswap_is_full()) {
> > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> >                         zswap_pool_reached_full = false;
> >         }
> >
> > +       pool = zswap_pool_current_get();
> > +       if (!pool)
> > +               goto reject;
> > +
>
> Why do we need to move zswap_pool_current_get() up here?
>
> >         /* allocate entry */
> > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> >         if (!entry) {
> >                 zswap_reject_kmemcache_fail++;
> > +               zswap_pool_put(pool);
> >                 goto reject;
> >         }
> >
> > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> >                         entry->length = 0;
> >                         entry->value = value;
> >                         atomic_inc(&zswap_same_filled_pages);
> > +                       zswap_pool_put(pool);
> >                         goto insert_entry;
> >                 }
> >                 kunmap_atomic(src);
> > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> >                 goto freepage;
> >
> >         /* if entry is successfully added, it keeps the reference */
> > -       entry->pool = zswap_pool_current_get();
> > -       if (!entry->pool)
> > -               goto freepage;
> > +       entry->pool = pool;
> > +       if (objcg) {
> > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > +               mem_cgroup_put(memcg);
> > +
> > +               if (lru_alloc_ret)
> > +                       goto freepage;
> > +       }
> >
> >         /* compress */
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> >                 zswap_invalidate_entry(tree, dupentry);
> >         }
> >         if (entry->length) {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_add(&entry->lru, &entry->pool->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               INIT_LIST_HEAD(&entry->lru);
> > +               zswap_lru_add(&pool->list_lru, entry);
> >         }
> >         spin_unlock(&tree->lock);
> >
> > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> >
> >  put_dstmem:
> >         mutex_unlock(acomp_ctx->mutex);
> > -       zswap_pool_put(entry->pool);
> >  freepage:
> > +       zswap_pool_put(entry->pool);
> >         zswap_entry_cache_free(entry);
> >  reject:
> >         if (objcg)
> > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> >                 zswap_invalidate_entry(tree, entry);
> >                 folio_mark_dirty(folio);
> >         } else if (entry->length) {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_move(&entry->lru, &entry->pool->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > +               zswap_lru_add(&entry->pool->list_lru, entry);
> >         }
> >         zswap_entry_put(tree, entry);
> >         spin_unlock(&tree->lock);
> > --
> > 2.34.1

I don't have (strong) opinions or (educated) guesses
on the rest.
  
Nhat Pham Oct. 18, 2023, 11:48 p.m. UTC | #3
On Wed, Oct 18, 2023 at 4:46 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Wed, Oct 18, 2023 at 4:20 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > >
> > > Currently, we only have a single global LRU for zswap. This makes it
> > > impossible to perform worload-specific shrinking - an memcg cannot
> > > determine which pages in the pool it owns, and often ends up writing
> > > pages from other memcgs. This issue has been previously observed in
> > > practice and mitigated by simply disabling memcg-initiated shrinking:
> > >
> > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> > >
> > > This patch fully resolves the issue by replacing the global zswap LRU
> > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> > >
> > > a) When a store attempt hits an memcg limit, it now triggers a
> > >    synchronous reclaim attempt that, if successful, allows the new
> > >    hotter page to be accepted by zswap.
> > > b) If the store attempt instead hits the global zswap limit, it will
> > >    trigger an asynchronous reclaim attempt, in which an memcg is
> > >    selected for reclaim in a round-robin-like fashion.
> >
> > Could you explain the rationale behind the difference in behavior here
> > between the global limit and the memcg limit?
>
> The global limit hit reclaim behavior was previously asynchronous too.
> We just added the round-robin part because now the zswap LRU is
> cgroup-aware :)
>
> For the cgroup limit hit, however, we cannot make it asynchronous,
> as it is a bit hairy to add a per-cgroup shrink_work. So, we just
> perform the reclaim synchronously.
>
> The question is whether it makes sense to make the global limit
> reclaim synchronous too. That is a task of its own IMO.
>
> (FWIW, this somewhat mirrors the direct reclaimer v.s kswapd
> story to me, but don't quote me too hard on this).
>
> >
> > >
> > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > > Co-developed-by: Nhat Pham <nphamcs@gmail.com>
> > > Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> > > ---
> > >  include/linux/memcontrol.h |   5 ++
> > >  mm/swap.h                  |   3 +-
> > >  mm/swap_state.c            |  17 +++-
> > >  mm/zswap.c                 | 179 ++++++++++++++++++++++++++-----------
> > >  4 files changed, 147 insertions(+), 57 deletions(-)
> >
> > This is a dense patch, I haven't absorbed all of it yet, but the first
> > round of comments below.
>
> Regardless, thanks for the feedback, Yosry! Domenico definitely
> knows more than me about this, but I'll respond with what I know,
> and he can expand and/or fact-check me :)
>
> >
> > >
> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > > index 031102ac9311..3de10fabea0f 100644
> > > --- a/include/linux/memcontrol.h
> > > +++ b/include/linux/memcontrol.h
> > > @@ -1179,6 +1179,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> > >         return NULL;
> > >  }
> > >
> > > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
> > > +{
> > > +       return NULL;
> > > +}
> > > +
> > >  static inline bool folio_memcg_kmem(struct folio *folio)
> > >  {
> > >         return false;
> > > diff --git a/mm/swap.h b/mm/swap.h
> > > index 8a3c7a0ace4f..bbd6ce661a20 100644
> > > --- a/mm/swap.h
> > > +++ b/mm/swap.h
> > > @@ -50,7 +50,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >                                      struct vm_area_struct *vma,
> > >                                      unsigned long addr,
> > > -                                    bool *new_page_allocated);
> > > +                                    bool *new_page_allocated,
> > > +                                    bool fail_if_exists);
> > >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
> > >                                     struct vm_fault *vmf);
> > >  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> > > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > > index b3b14bd0dd64..0356df52b06a 100644
> > > --- a/mm/swap_state.c
> > > +++ b/mm/swap_state.c
> > > @@ -411,7 +411,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
> > >
> > >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >                         struct vm_area_struct *vma, unsigned long addr,
> > > -                       bool *new_page_allocated)
> > > +                       bool *new_page_allocated, bool fail_if_exists)
> >
> > nit: I don't feel like "fail" is the correct word here. Perhaps "skip"?
> >
> > >  {
> > >         struct swap_info_struct *si;
> > >         struct folio *folio;
> > > @@ -468,6 +468,15 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >                 if (err != -EEXIST)
> > >                         goto fail_put_swap;
> > >
> > > +               /*
> > > +                * This check guards against a state that happens if a call
> > > +                * to __read_swap_cache_async triggers a reclaim, if the
> > > +                * reclaimer (zswap's writeback as of now) then decides to
> > > +                * reclaim that same entry, then the subsequent call to
> > > +                * __read_swap_cache_async would get stuck in this loop.
> >
> > I think this comment needs to first state that it is protecting
> > against a recursive call in general, not necessarily in reclaim, as
> > __read_swap_cache_async() is not usually called in the context of
> > reclaim so this can be confusing. Then it can give the exact example
> > we have today. Perhaps something like:
> >
> > Protect against a recursive call to __read_swap_cache_async() on the
> > same entry waiting forever here because SWAP_HAS_CACHE is set but the
> > folio is not the swap cache yet. This can happen today if
> > mem_cgroup_swapin_charge_folio() below triggers reclaim through zswap,
> > which may call __read_swap_cache_async() in the writeback path.
> >
> > > +                */
> > > +               if (fail_if_exists && err == -EEXIST)
> >
> > We already made sure  in the preceding condition that err is -EEXIST.
> >
> > > +                       goto fail_put_swap;
> > >                 /*
> > >                  * We might race against __delete_from_swap_cache(), and
> > >                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
> > > @@ -530,7 +539,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> > >  {
> > >         bool page_was_allocated;
> > >         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
> > > -                       vma, addr, &page_was_allocated);
> > > +                       vma, addr, &page_was_allocated, false);
> > >
> > >         if (page_was_allocated)
> > >                 swap_readpage(retpage, false, plug);
> > > @@ -649,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> > >                 /* Ok, do the async read-ahead now */
> > >                 page = __read_swap_cache_async(
> > >                         swp_entry(swp_type(entry), offset),
> > > -                       gfp_mask, vma, addr, &page_allocated);
> > > +                       gfp_mask, vma, addr, &page_allocated, false);
> > >                 if (!page)
> > >                         continue;
> > >                 if (page_allocated) {
> > > @@ -815,7 +824,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
> > >                 pte_unmap(pte);
> > >                 pte = NULL;
> > >                 page = __read_swap_cache_async(entry, gfp_mask, vma,
> > > -                                              addr, &page_allocated);
> > > +                                              addr, &page_allocated, false);
> > >                 if (!page)
> > >                         continue;
> > >                 if (page_allocated) {
> > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > index 083c693602b8..d2989ad11814 100644
> > > --- a/mm/zswap.c
> > > +++ b/mm/zswap.c
> > > @@ -34,6 +34,7 @@
> > >  #include <linux/writeback.h>
> > >  #include <linux/pagemap.h>
> > >  #include <linux/workqueue.h>
> > > +#include <linux/list_lru.h>
> > >
> > >  #include "swap.h"
> > >  #include "internal.h"
> > > @@ -171,8 +172,8 @@ struct zswap_pool {
> > >         struct work_struct shrink_work;
> > >         struct hlist_node node;
> > >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > > -       struct list_head lru;
> > > -       spinlock_t lru_lock;
> > > +       struct list_lru list_lru;
> > > +       struct mem_cgroup *next_shrink;
> > >  };
> > >
> > >  /*
> > > @@ -288,15 +289,25 @@ static void zswap_update_total_size(void)
> > >         zswap_pool_total_size = total;
> > >  }
> > >
> > > +static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
> > > +{
> > > +       return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL;
> > > +}
> > > +
> > > +static inline int entry_to_nid(struct zswap_entry *entry)
> > > +{
> > > +       return page_to_nid(virt_to_page(entry));
> > > +}
> > > +
> > >  /*********************************
> > >  * zswap entry functions
> > >  **********************************/
> > >  static struct kmem_cache *zswap_entry_cache;
> > >
> > > -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
> > > +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
> > >  {
> > >         struct zswap_entry *entry;
> > > -       entry = kmem_cache_alloc(zswap_entry_cache, gfp);
> > > +       entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
> > >         if (!entry)
> > >                 return NULL;
> > >         entry->refcount = 1;
> > > @@ -309,6 +320,27 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> > >         kmem_cache_free(zswap_entry_cache, entry);
> > >  }
> > >
> > > +/*********************************
> > > +* lru functions
> > > +**********************************/
> > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > +{
> > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> >
> > Could we avoid the need for get/put with an rcu_read_lock() instead?
> >
> > > +       bool added = __list_lru_add(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > > +
> > > +       mem_cgroup_put(memcg);
> > > +       return added;
> > > +}
> > > +
> > > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> > > +{
> > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> > > +       bool removed = __list_lru_del(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > > +
> > > +       mem_cgroup_put(memcg);
> > > +       return removed;
> > > +}
> > > +
> > >  /*********************************
> > >  * rbtree functions
> > >  **********************************/
> > > @@ -393,9 +425,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> > >         if (!entry->length)
> > >                 atomic_dec(&zswap_same_filled_pages);
> > >         else {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_del(&entry->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > >                 zpool_free(zswap_find_zpool(entry), entry->handle);
> > >                 zswap_pool_put(entry->pool);
> > >         }
> > > @@ -629,21 +659,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
> > >                 zswap_entry_put(tree, entry);
> > >  }
> > >
> > > -static int zswap_reclaim_entry(struct zswap_pool *pool)
> > > +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
> > > +                                      spinlock_t *lock, void *arg)
> > >  {
> > > -       struct zswap_entry *entry;
> > > +       struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> > > +       struct mem_cgroup *memcg;
> > >         struct zswap_tree *tree;
> > >         pgoff_t swpoffset;
> > > -       int ret;
> > > +       enum lru_status ret = LRU_REMOVED_RETRY;
> > > +       int writeback_result;
> > >
> > > -       /* Get an entry off the LRU */
> > > -       spin_lock(&pool->lru_lock);
> > > -       if (list_empty(&pool->lru)) {
> > > -               spin_unlock(&pool->lru_lock);
> > > -               return -EINVAL;
> > > -       }
> > > -       entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > > -       list_del_init(&entry->lru);
> > >         /*
> > >          * Once the lru lock is dropped, the entry might get freed. The
> > >          * swpoffset is copied to the stack, and entry isn't deref'd again
> > > @@ -651,28 +676,33 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > >          */
> > >         swpoffset = swp_offset(entry->swpentry);
> > >         tree = zswap_trees[swp_type(entry->swpentry)];
> > > -       spin_unlock(&pool->lru_lock);
> > > +       list_lru_isolate(l, item);
> > > +       spin_unlock(lock);
> >
> > Perhaps a comment somewhere stating that we only return either
> > LRU_REMOVED_RETRY or LRU_RETRY, so it's fine to drop and reacquire the
> > lock.
> >
> > >
> > >         /* Check for invalidate() race */
> > >         spin_lock(&tree->lock);
> > >         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> > > -               ret = -EAGAIN;
> > >                 goto unlock;
> > >         }
> >
> > nit: braces no longer needed?
>
> Ah, for some reason checkpatch did not pick up on this.
> Weird.
>
> >
> > >         /* Hold a reference to prevent a free during writeback */
> > >         zswap_entry_get(entry);
> > >         spin_unlock(&tree->lock);
> > >
> > > -       ret = zswap_writeback_entry(entry, tree);
> > > +       writeback_result = zswap_writeback_entry(entry, tree);
> > >
> > >         spin_lock(&tree->lock);
> > > -       if (ret) {
> > > -               /* Writeback failed, put entry back on LRU */
> > > -               spin_lock(&pool->lru_lock);
> > > -               list_move(&entry->lru, &pool->lru);
> > > -               spin_unlock(&pool->lru_lock);
> > > +       if (writeback_result) {
> > > +               zswap_reject_reclaim_fail++;
> > > +               memcg = get_mem_cgroup_from_entry(entry);
> > > +               spin_lock(lock);
> > > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > > +               spin_unlock(lock);
> > > +               mem_cgroup_put(memcg);
> > > +               ret = LRU_RETRY;
> > >                 goto put_unlock;
> > >         }
> > > +       zswap_written_back_pages++;
> >
> > Why is this moved here from zswap_writeback_entry()? Also why is
> > zswap_reject_reclaim_fail incremented here instead of inside
> > zswap_writeback_entry()?
>
> Domenico should know this better than me, but my understanding
> is that moving it here protects concurrent modifications of
> zswap_written_back_pages with the tree lock.
>
> Is writeback single-threaded in the past? This counter is non-atomic,
> and doesn't seem to be protected by any locks...
>
> There definitely can be concurrent stores now though - with
/s/stores/writebacks
> a synchronous reclaim from cgroup-limit hit and another
> from the old shrink worker.
>
> (and with the new zswap shrinker, concurrent reclaim is
> the expectation!)
>
> zswap_reject_reclaim_fail was previously incremented in
> shrink_worker I think. We need it to be incremented
> for the shrinker as well, so might as well move it here.
>
> >
> > >
> > >         /*
> > >          * Writeback started successfully, the page now belongs to the
> > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > >         zswap_entry_put(tree, entry);
> > >  unlock:
> > >         spin_unlock(&tree->lock);
> > > -       return ret ? -EAGAIN : 0;
> > > +       spin_lock(lock);
> > > +       return ret;
> > > +}
> > > +
> > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > +{
> > > +       struct zswap_pool *pool;
> > > +       int nid, shrunk = 0;
> > > +
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               return -EINVAL;
> > > +
> > > +       /*
> > > +        * Skip zombies because their LRUs are reparented and we would be
> > > +        * reclaiming from the parent instead of the dead memcgroup.
> >
> > nit: s/memcgroup/memcg.
> >
> > > +        */
> > > +       if (memcg && !mem_cgroup_online(memcg))
> > > +               goto out;
> >
> > If we move this above zswap_pool_current_get(), we can return directly
> > and remove the label. I noticed we will return -EAGAIN if memcg is
> > offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> > but I am wondering if a different errno would be clearer here.
> >
> > > +
> > > +       for_each_node_state(nid, N_NORMAL_MEMORY) {
> > > +               unsigned long nr_to_walk = 1;
> > > +
> > > +               if (list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb,
> > > +                                     NULL, &nr_to_walk))
> > > +                       shrunk++;
> >
> > nit:
> > shrunk += list_lru_walk_one(..);
>
> yeah might be a tad cleaner.
>
> >
> > > +       }
> > > +out:
> > > +       zswap_pool_put(pool);
> > > +       return shrunk ? 0 : -EAGAIN;
> > >  }
> > >
> > >  static void shrink_worker(struct work_struct *w)
> > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > >                                                 shrink_work);
> > >         int ret, failures = 0;
> > >
> > > +       /* global reclaim will select cgroup in a round-robin fashion. */
> > >         do {
> > > -               ret = zswap_reclaim_entry(pool);
> > > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> >
> > Perhaps next_shrink_memcg is a better name here?
> >
> > > +
> > > +               ret = shrink_memcg(pool->next_shrink);
> > > +
> > >                 if (ret) {
> > > -                       zswap_reject_reclaim_fail++;
> > >                         if (ret != -EAGAIN)
> > >                                 break;
> > >                         if (++failures == MAX_RECLAIM_RETRIES)
> > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > >          */
> > >         kref_init(&pool->kref);
> > >         INIT_LIST_HEAD(&pool->list);
> > > -       INIT_LIST_HEAD(&pool->lru);
> > > -       spin_lock_init(&pool->lru_lock);
> > > +       list_lru_init_memcg(&pool->list_lru, NULL);
> > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > >
> > >         zswap_pool_debug("created", pool);
> > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > >
> > >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > >         free_percpu(pool->acomp_ctx);
> > > +       list_lru_destroy(&pool->list_lru);
> > > +       if (pool->next_shrink)
> > > +               mem_cgroup_put(pool->next_shrink);
> > >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > >                 zpool_destroy_pool(pool->zpools[i]);
> > >         kfree(pool);
> > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >
> > >         /* try to allocate swap cache page */
> > >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > -                                      &page_was_allocated);
> > > +                                      &page_was_allocated, true);
> > >         if (!page) {
> > >                 ret = -ENOMEM;
> > >                 goto fail;
> > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >         /* start writeback */
> > >         __swap_writepage(page, &wbc);
> > >         put_page(page);
> > > -       zswap_written_back_pages++;
> > >
> > >         return ret;
> > >
> > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > >         struct scatterlist input, output;
> > >         struct crypto_acomp_ctx *acomp_ctx;
> > >         struct obj_cgroup *objcg = NULL;
> > > +       struct mem_cgroup *memcg = NULL;
> > >         struct zswap_pool *pool;
> > >         struct zpool *zpool;
> > > +       int lru_alloc_ret;
> > >         unsigned int dlen = PAGE_SIZE;
> > >         unsigned long handle, value;
> > >         char *buf;
> > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > > -
> > > -       /*
> > > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > -        * local cgroup limits.
> > > -        */
> > >         objcg = get_obj_cgroup_from_folio(folio);
> > > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > > -               goto reject;
> > > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               if (shrink_memcg(memcg)) {
> > > +                       mem_cgroup_put(memcg);
> > > +                       goto reject;
> > > +               }
> > > +               mem_cgroup_put(memcg);
> > > +       }
> > >
> > >         /* reclaim space if needed */
> > >         if (zswap_is_full()) {
> > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > >                         zswap_pool_reached_full = false;
> > >         }
> > >
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               goto reject;
> > > +
> >
> > Why do we need to move zswap_pool_current_get() up here?
> >
> > >         /* allocate entry */
> > > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > >         if (!entry) {
> > >                 zswap_reject_kmemcache_fail++;
> > > +               zswap_pool_put(pool);
> > >                 goto reject;
> > >         }
> > >
> > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > >                         entry->length = 0;
> > >                         entry->value = value;
> > >                         atomic_inc(&zswap_same_filled_pages);
> > > +                       zswap_pool_put(pool);
> > >                         goto insert_entry;
> > >                 }
> > >                 kunmap_atomic(src);
> > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > >                 goto freepage;
> > >
> > >         /* if entry is successfully added, it keeps the reference */
> > > -       entry->pool = zswap_pool_current_get();
> > > -       if (!entry->pool)
> > > -               goto freepage;
> > > +       entry->pool = pool;
> > > +       if (objcg) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > +               mem_cgroup_put(memcg);
> > > +
> > > +               if (lru_alloc_ret)
> > > +                       goto freepage;
> > > +       }
> > >
> > >         /* compress */
> > >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_add(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               INIT_LIST_HEAD(&entry->lru);
> > > +               zswap_lru_add(&pool->list_lru, entry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > >
> > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > >
> > >  put_dstmem:
> > >         mutex_unlock(acomp_ctx->mutex);
> > > -       zswap_pool_put(entry->pool);
> > >  freepage:
> > > +       zswap_pool_put(entry->pool);
> > >         zswap_entry_cache_free(entry);
> > >  reject:
> > >         if (objcg)
> > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, entry);
> > >                 folio_mark_dirty(folio);
> > >         } else if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_move(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > > +               zswap_lru_add(&entry->pool->list_lru, entry);
> > >         }
> > >         zswap_entry_put(tree, entry);
> > >         spin_unlock(&tree->lock);
> > > --
> > > 2.34.1
>
> I don't have (strong) opinions or (educated) guesses
> on the rest.
  
Yosry Ahmed Oct. 19, 2023, 1:11 a.m. UTC | #4
On Wed, Oct 18, 2023 at 4:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Wed, Oct 18, 2023 at 4:20 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > >
> > > Currently, we only have a single global LRU for zswap. This makes it
> > > impossible to perform worload-specific shrinking - an memcg cannot
> > > determine which pages in the pool it owns, and often ends up writing
> > > pages from other memcgs. This issue has been previously observed in
> > > practice and mitigated by simply disabling memcg-initiated shrinking:
> > >
> > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> > >
> > > This patch fully resolves the issue by replacing the global zswap LRU
> > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> > >
> > > a) When a store attempt hits an memcg limit, it now triggers a
> > >    synchronous reclaim attempt that, if successful, allows the new
> > >    hotter page to be accepted by zswap.
> > > b) If the store attempt instead hits the global zswap limit, it will
> > >    trigger an asynchronous reclaim attempt, in which an memcg is
> > >    selected for reclaim in a round-robin-like fashion.
> >
> > Could you explain the rationale behind the difference in behavior here
> > between the global limit and the memcg limit?
>
> The global limit hit reclaim behavior was previously asynchronous too.
> We just added the round-robin part because now the zswap LRU is
> cgroup-aware :)
>
> For the cgroup limit hit, however, we cannot make it asynchronous,
> as it is a bit hairy to add a per-cgroup shrink_work. So, we just
> perform the reclaim synchronously.
>
> The question is whether it makes sense to make the global limit
> reclaim synchronous too. That is a task of its own IMO.

Let's add such context to the commit log, and perhaps an XXX comment
in the code asking whether we should consider doing the reclaim
synchronously for the global limit too.

>
> (FWIW, this somewhat mirrors the direct reclaimer v.s kswapd
> story to me, but don't quote me too hard on this).
>
[..]
>
> >
> > >         /* Hold a reference to prevent a free during writeback */
> > >         zswap_entry_get(entry);
> > >         spin_unlock(&tree->lock);
> > >
> > > -       ret = zswap_writeback_entry(entry, tree);
> > > +       writeback_result = zswap_writeback_entry(entry, tree);
> > >
> > >         spin_lock(&tree->lock);
> > > -       if (ret) {
> > > -               /* Writeback failed, put entry back on LRU */
> > > -               spin_lock(&pool->lru_lock);
> > > -               list_move(&entry->lru, &pool->lru);
> > > -               spin_unlock(&pool->lru_lock);
> > > +       if (writeback_result) {
> > > +               zswap_reject_reclaim_fail++;
> > > +               memcg = get_mem_cgroup_from_entry(entry);
> > > +               spin_lock(lock);
> > > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > > +               spin_unlock(lock);
> > > +               mem_cgroup_put(memcg);
> > > +               ret = LRU_RETRY;
> > >                 goto put_unlock;
> > >         }
> > > +       zswap_written_back_pages++;
> >
> > Why is this moved here from zswap_writeback_entry()? Also why is
> > zswap_reject_reclaim_fail incremented here instead of inside
> > zswap_writeback_entry()?
>
> Domenico should know this better than me, but my understanding
> is that moving it here protects concurrent modifications of
> zswap_written_back_pages with the tree lock.
>
> Is writeback single-threaded in the past? This counter is non-atomic,
> and doesn't seem to be protected by any locks...
>
> There definitely can be concurrent stores now though - with
> a synchronous reclaim from cgroup-limit hit and another
> from the old shrink worker.
>
> (and with the new zswap shrinker, concurrent reclaim is
> the expectation!)

The comment above the stats definition stats that they are left
unprotected purposefully. If we want to fix that let's do it
separately. If this patch makes it significantly worse such that it
would cause a regression, let's at least do it in a separate patch.
The diff here is too large already.

>
> zswap_reject_reclaim_fail was previously incremented in
> shrink_worker I think. We need it to be incremented
> for the shrinker as well, so might as well move it here.

Wouldn't moving it inside zswap_writeback_entry() near incrementing
zswap_written_back_pages make it easier to follow?
  
Domenico Cerasuolo Oct. 19, 2023, 12:29 p.m. UTC | #5
On Thu, Oct 19, 2023 at 1:20 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >
> > Currently, we only have a single global LRU for zswap. This makes it
> > impossible to perform worload-specific shrinking - an memcg cannot
> > determine which pages in the pool it owns, and often ends up writing
> > pages from other memcgs. This issue has been previously observed in
> > practice and mitigated by simply disabling memcg-initiated shrinking:
> >
> > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> >
> > This patch fully resolves the issue by replacing the global zswap LRU
> > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> >
> > a) When a store attempt hits an memcg limit, it now triggers a
> >    synchronous reclaim attempt that, if successful, allows the new
> >    hotter page to be accepted by zswap.
> > b) If the store attempt instead hits the global zswap limit, it will
> >    trigger an asynchronous reclaim attempt, in which an memcg is
> >    selected for reclaim in a round-robin-like fashion.
>
> Could you explain the rationale behind the difference in behavior here
> between the global limit and the memcg limit?
>
> >
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > Co-developed-by: Nhat Pham <nphamcs@gmail.com>
> > Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> > ---
> >  include/linux/memcontrol.h |   5 ++
> >  mm/swap.h                  |   3 +-
> >  mm/swap_state.c            |  17 +++-
> >  mm/zswap.c                 | 179 ++++++++++++++++++++++++++-----------
> >  4 files changed, 147 insertions(+), 57 deletions(-)
>
> This is a dense patch, I haven't absorbed all of it yet, but the first
> round of comments below.
>
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 031102ac9311..3de10fabea0f 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -1179,6 +1179,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> >         return NULL;
> >  }
> >
> > +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
> > +{
> > +       return NULL;
> > +}
> > +
> >  static inline bool folio_memcg_kmem(struct folio *folio)
> >  {
> >         return false;
> > diff --git a/mm/swap.h b/mm/swap.h
> > index 8a3c7a0ace4f..bbd6ce661a20 100644
> > --- a/mm/swap.h
> > +++ b/mm/swap.h
> > @@ -50,7 +50,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                                      struct vm_area_struct *vma,
> >                                      unsigned long addr,
> > -                                    bool *new_page_allocated);
> > +                                    bool *new_page_allocated,
> > +                                    bool fail_if_exists);
> >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
> >                                     struct vm_fault *vmf);
> >  struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index b3b14bd0dd64..0356df52b06a 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -411,7 +411,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
> >
> >  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                         struct vm_area_struct *vma, unsigned long addr,
> > -                       bool *new_page_allocated)
> > +                       bool *new_page_allocated, bool fail_if_exists)
>
> nit: I don't feel like "fail" is the correct word here. Perhaps "skip"?

I really don't have a preference, we can go with "skip".

>
> >  {
> >         struct swap_info_struct *si;
> >         struct folio *folio;
> > @@ -468,6 +468,15 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >                 if (err != -EEXIST)
> >                         goto fail_put_swap;
> >
> > +               /*
> > +                * This check guards against a state that happens if a call
> > +                * to __read_swap_cache_async triggers a reclaim, if the
> > +                * reclaimer (zswap's writeback as of now) then decides to
> > +                * reclaim that same entry, then the subsequent call to
> > +                * __read_swap_cache_async would get stuck in this loop.
>
> I think this comment needs to first state that it is protecting
> against a recursive call in general, not necessarily in reclaim, as
> __read_swap_cache_async() is not usually called in the context of
> reclaim so this can be confusing. Then it can give the exact example
> we have today. Perhaps something like:
>
> Protect against a recursive call to __read_swap_cache_async() on the
> same entry waiting forever here because SWAP_HAS_CACHE is set but the
> folio is not the swap cache yet. This can happen today if
> mem_cgroup_swapin_charge_folio() below triggers reclaim through zswap,
> which may call __read_swap_cache_async() in the writeback path.
>

Sold.

> > +                */
> > +               if (fail_if_exists && err == -EEXIST)
>
> We already made sure  in the preceding condition that err is -EEXIST.
>
> > +                       goto fail_put_swap;
> >                 /*
> >                  * We might race against __delete_from_swap_cache(), and
> >                  * stumble across a swap_map entry whose SWAP_HAS_CACHE
> > @@ -530,7 +539,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
> >  {
> >         bool page_was_allocated;
> >         struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
> > -                       vma, addr, &page_was_allocated);
> > +                       vma, addr, &page_was_allocated, false);
> >
> >         if (page_was_allocated)
> >                 swap_readpage(retpage, false, plug);
> > @@ -649,7 +658,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
> >                 /* Ok, do the async read-ahead now */
> >                 page = __read_swap_cache_async(
> >                         swp_entry(swp_type(entry), offset),
> > -                       gfp_mask, vma, addr, &page_allocated);
> > +                       gfp_mask, vma, addr, &page_allocated, false);
> >                 if (!page)
> >                         continue;
> >                 if (page_allocated) {
> > @@ -815,7 +824,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
> >                 pte_unmap(pte);
> >                 pte = NULL;
> >                 page = __read_swap_cache_async(entry, gfp_mask, vma,
> > -                                              addr, &page_allocated);
> > +                                              addr, &page_allocated, false);
> >                 if (!page)
> >                         continue;
> >                 if (page_allocated) {
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 083c693602b8..d2989ad11814 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -34,6 +34,7 @@
> >  #include <linux/writeback.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/workqueue.h>
> > +#include <linux/list_lru.h>
> >
> >  #include "swap.h"
> >  #include "internal.h"
> > @@ -171,8 +172,8 @@ struct zswap_pool {
> >         struct work_struct shrink_work;
> >         struct hlist_node node;
> >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > -       struct list_head lru;
> > -       spinlock_t lru_lock;
> > +       struct list_lru list_lru;
> > +       struct mem_cgroup *next_shrink;
> >  };
> >
> >  /*
> > @@ -288,15 +289,25 @@ static void zswap_update_total_size(void)
> >         zswap_pool_total_size = total;
> >  }
> >
> > +static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
> > +{
> > +       return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL;
> > +}
> > +
> > +static inline int entry_to_nid(struct zswap_entry *entry)
> > +{
> > +       return page_to_nid(virt_to_page(entry));
> > +}
> > +
> >  /*********************************
> >  * zswap entry functions
> >  **********************************/
> >  static struct kmem_cache *zswap_entry_cache;
> >
> > -static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
> > +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
> >  {
> >         struct zswap_entry *entry;
> > -       entry = kmem_cache_alloc(zswap_entry_cache, gfp);
> > +       entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
> >         if (!entry)
> >                 return NULL;
> >         entry->refcount = 1;
> > @@ -309,6 +320,27 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> >         kmem_cache_free(zswap_entry_cache, entry);
> >  }
> >
> > +/*********************************
> > +* lru functions
> > +**********************************/
> > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > +{
> > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
>
> Could we avoid the need for get/put with an rcu_read_lock() instead?

I think we can, I'm not entirely sure of the consequences though. By the
look of it I'd say it's safe but I wouldn't trust my judgement on this.

>
> > +       bool added = __list_lru_add(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > +
> > +       mem_cgroup_put(memcg);
> > +       return added;
> > +}
> > +
> > +static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> > +{
> > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> > +       bool removed = __list_lru_del(list_lru, &entry->lru, entry_to_nid(entry), memcg);
> > +
> > +       mem_cgroup_put(memcg);
> > +       return removed;
> > +}
> > +
> >  /*********************************
> >  * rbtree functions
> >  **********************************/
> > @@ -393,9 +425,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >         if (!entry->length)
> >                 atomic_dec(&zswap_same_filled_pages);
> >         else {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_del(&entry->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               zswap_lru_del(&entry->pool->list_lru, entry);
> >                 zpool_free(zswap_find_zpool(entry), entry->handle);
> >                 zswap_pool_put(entry->pool);
> >         }
> > @@ -629,21 +659,16 @@ static void zswap_invalidate_entry(struct zswap_tree *tree,
> >                 zswap_entry_put(tree, entry);
> >  }
> >
> > -static int zswap_reclaim_entry(struct zswap_pool *pool)
> > +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
> > +                                      spinlock_t *lock, void *arg)
> >  {
> > -       struct zswap_entry *entry;
> > +       struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
> > +       struct mem_cgroup *memcg;
> >         struct zswap_tree *tree;
> >         pgoff_t swpoffset;
> > -       int ret;
> > +       enum lru_status ret = LRU_REMOVED_RETRY;
> > +       int writeback_result;
> >
> > -       /* Get an entry off the LRU */
> > -       spin_lock(&pool->lru_lock);
> > -       if (list_empty(&pool->lru)) {
> > -               spin_unlock(&pool->lru_lock);
> > -               return -EINVAL;
> > -       }
> > -       entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > -       list_del_init(&entry->lru);
> >         /*
> >          * Once the lru lock is dropped, the entry might get freed. The
> >          * swpoffset is copied to the stack, and entry isn't deref'd again
> > @@ -651,28 +676,33 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> >          */
> >         swpoffset = swp_offset(entry->swpentry);
> >         tree = zswap_trees[swp_type(entry->swpentry)];
> > -       spin_unlock(&pool->lru_lock);
> > +       list_lru_isolate(l, item);
> > +       spin_unlock(lock);
>
> Perhaps a comment somewhere stating that we only return either
> LRU_REMOVED_RETRY or LRU_RETRY, so it's fine to drop and reacquire the
> lock.
>
> >
> >         /* Check for invalidate() race */
> >         spin_lock(&tree->lock);
> >         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> > -               ret = -EAGAIN;
> >                 goto unlock;
> >         }
>
> nit: braces no longer needed?
>
> >         /* Hold a reference to prevent a free during writeback */
> >         zswap_entry_get(entry);
> >         spin_unlock(&tree->lock);
> >
> > -       ret = zswap_writeback_entry(entry, tree);
> > +       writeback_result = zswap_writeback_entry(entry, tree);
> >
> >         spin_lock(&tree->lock);
> > -       if (ret) {
> > -               /* Writeback failed, put entry back on LRU */
> > -               spin_lock(&pool->lru_lock);
> > -               list_move(&entry->lru, &pool->lru);
> > -               spin_unlock(&pool->lru_lock);
> > +       if (writeback_result) {
> > +               zswap_reject_reclaim_fail++;
> > +               memcg = get_mem_cgroup_from_entry(entry);
> > +               spin_lock(lock);
> > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > +               spin_unlock(lock);
> > +               mem_cgroup_put(memcg);
> > +               ret = LRU_RETRY;
> >                 goto put_unlock;
> >         }
> > +       zswap_written_back_pages++;
>
> Why is this moved here from zswap_writeback_entry()? Also why is
> zswap_reject_reclaim_fail incremented here instead of inside
> zswap_writeback_entry()?
>
> >
> >         /*
> >          * Writeback started successfully, the page now belongs to the
> > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> >         zswap_entry_put(tree, entry);
> >  unlock:
> >         spin_unlock(&tree->lock);
> > -       return ret ? -EAGAIN : 0;
> > +       spin_lock(lock);
> > +       return ret;
> > +}
> > +
> > +static int shrink_memcg(struct mem_cgroup *memcg)
> > +{
> > +       struct zswap_pool *pool;
> > +       int nid, shrunk = 0;
> > +
> > +       pool = zswap_pool_current_get();
> > +       if (!pool)
> > +               return -EINVAL;
> > +
> > +       /*
> > +        * Skip zombies because their LRUs are reparented and we would be
> > +        * reclaiming from the parent instead of the dead memcgroup.
>
> nit: s/memcgroup/memcg.
>
> > +        */
> > +       if (memcg && !mem_cgroup_online(memcg))
> > +               goto out;
>
> If we move this above zswap_pool_current_get(), we can return directly
> and remove the label. I noticed we will return -EAGAIN if memcg is
> offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> but I am wondering if a different errno would be clearer here.

True, I remember spending some time staring at error codes but couldn't find a
better one. What if we use -EINVAL for retryable errors, and use something else
for the one where there is no pool? -ENODEV?

>
> > +
> > +       for_each_node_state(nid, N_NORMAL_MEMORY) {
> > +               unsigned long nr_to_walk = 1;
> > +
> > +               if (list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb,
> > +                                     NULL, &nr_to_walk))
> > +                       shrunk++;
>
> nit:
> shrunk += list_lru_walk_one(..);
>
> > +       }
> > +out:
> > +       zswap_pool_put(pool);
> > +       return shrunk ? 0 : -EAGAIN;
> >  }
> >
> >  static void shrink_worker(struct work_struct *w)
> > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> >                                                 shrink_work);
> >         int ret, failures = 0;
> >
> > +       /* global reclaim will select cgroup in a round-robin fashion. */
> >         do {
> > -               ret = zswap_reclaim_entry(pool);
> > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
>
> Perhaps next_shrink_memcg is a better name here?

Will change if you have a strong preference, I'd keep it shorter because it's
always used in conjunction with a memcg type or function.

>
> > +
> > +               ret = shrink_memcg(pool->next_shrink);
> > +
> >                 if (ret) {
> > -                       zswap_reject_reclaim_fail++;
> >                         if (ret != -EAGAIN)
> >                                 break;
> >                         if (++failures == MAX_RECLAIM_RETRIES)
> > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >          */
> >         kref_init(&pool->kref);
> >         INIT_LIST_HEAD(&pool->list);
> > -       INIT_LIST_HEAD(&pool->lru);
> > -       spin_lock_init(&pool->lru_lock);
> > +       list_lru_init_memcg(&pool->list_lru, NULL);
> >         INIT_WORK(&pool->shrink_work, shrink_worker);
> >
> >         zswap_pool_debug("created", pool);
> > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> >
> >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> >         free_percpu(pool->acomp_ctx);
> > +       list_lru_destroy(&pool->list_lru);
> > +       if (pool->next_shrink)
> > +               mem_cgroup_put(pool->next_shrink);
> >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> >                 zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >
> >         /* try to allocate swap cache page */
> >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > -                                      &page_was_allocated);
> > +                                      &page_was_allocated, true);
> >         if (!page) {
> >                 ret = -ENOMEM;
> >                 goto fail;
> > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >         /* start writeback */
> >         __swap_writepage(page, &wbc);
> >         put_page(page);
> > -       zswap_written_back_pages++;
> >
> >         return ret;
> >
> > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         struct obj_cgroup *objcg = NULL;
> > +       struct mem_cgroup *memcg = NULL;
> >         struct zswap_pool *pool;
> >         struct zpool *zpool;
> > +       int lru_alloc_ret;
> >         unsigned int dlen = PAGE_SIZE;
> >         unsigned long handle, value;
> >         char *buf;
> > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> >                 zswap_invalidate_entry(tree, dupentry);
> >         }
> >         spin_unlock(&tree->lock);
> > -
> > -       /*
> > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > -        * local cgroup limits.
> > -        */
> >         objcg = get_obj_cgroup_from_folio(folio);
> > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > -               goto reject;
> > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > +               if (shrink_memcg(memcg)) {
> > +                       mem_cgroup_put(memcg);
> > +                       goto reject;
> > +               }
> > +               mem_cgroup_put(memcg);
> > +       }
> >
> >         /* reclaim space if needed */
> >         if (zswap_is_full()) {
> > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> >                         zswap_pool_reached_full = false;
> >         }
> >
> > +       pool = zswap_pool_current_get();
> > +       if (!pool)
> > +               goto reject;
> > +
>
> Why do we need to move zswap_pool_current_get() up here?

Ah, thanks. This is a leftover from a previous version where the pool was needed
to allocate the entry.


>
> >         /* allocate entry */
> > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> >         if (!entry) {
> >                 zswap_reject_kmemcache_fail++;
> > +               zswap_pool_put(pool);
> >                 goto reject;
> >         }
> >
> > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> >                         entry->length = 0;
> >                         entry->value = value;
> >                         atomic_inc(&zswap_same_filled_pages);
> > +                       zswap_pool_put(pool);
> >                         goto insert_entry;
> >                 }
> >                 kunmap_atomic(src);
> > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> >                 goto freepage;
> >
> >         /* if entry is successfully added, it keeps the reference */
> > -       entry->pool = zswap_pool_current_get();
> > -       if (!entry->pool)
> > -               goto freepage;
> > +       entry->pool = pool;
> > +       if (objcg) {
> > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > +               mem_cgroup_put(memcg);
> > +
> > +               if (lru_alloc_ret)
> > +                       goto freepage;
> > +       }
> >
> >         /* compress */
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> >                 zswap_invalidate_entry(tree, dupentry);
> >         }
> >         if (entry->length) {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_add(&entry->lru, &entry->pool->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               INIT_LIST_HEAD(&entry->lru);
> > +               zswap_lru_add(&pool->list_lru, entry);
> >         }
> >         spin_unlock(&tree->lock);
> >
> > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> >
> >  put_dstmem:
> >         mutex_unlock(acomp_ctx->mutex);
> > -       zswap_pool_put(entry->pool);
> >  freepage:
> > +       zswap_pool_put(entry->pool);
> >         zswap_entry_cache_free(entry);
> >  reject:
> >         if (objcg)
> > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> >                 zswap_invalidate_entry(tree, entry);
> >                 folio_mark_dirty(folio);
> >         } else if (entry->length) {
> > -               spin_lock(&entry->pool->lru_lock);
> > -               list_move(&entry->lru, &entry->pool->lru);
> > -               spin_unlock(&entry->pool->lru_lock);
> > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > +               zswap_lru_add(&entry->pool->list_lru, entry);
> >         }
> >         zswap_entry_put(tree, entry);
> >         spin_unlock(&tree->lock);
> > --
> > 2.34.1
  
Domenico Cerasuolo Oct. 19, 2023, 12:47 p.m. UTC | #6
On Thu, Oct 19, 2023 at 3:12 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Wed, Oct 18, 2023 at 4:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Wed, Oct 18, 2023 at 4:20 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > >
> > > On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > >
> > > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > > >
> > > > Currently, we only have a single global LRU for zswap. This makes it
> > > > impossible to perform worload-specific shrinking - an memcg cannot
> > > > determine which pages in the pool it owns, and often ends up writing
> > > > pages from other memcgs. This issue has been previously observed in
> > > > practice and mitigated by simply disabling memcg-initiated shrinking:
> > > >
> > > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> > > >
> > > > This patch fully resolves the issue by replacing the global zswap LRU
> > > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> > > >
> > > > a) When a store attempt hits an memcg limit, it now triggers a
> > > >    synchronous reclaim attempt that, if successful, allows the new
> > > >    hotter page to be accepted by zswap.
> > > > b) If the store attempt instead hits the global zswap limit, it will
> > > >    trigger an asynchronous reclaim attempt, in which an memcg is
> > > >    selected for reclaim in a round-robin-like fashion.
> > >
> > > Could you explain the rationale behind the difference in behavior here
> > > between the global limit and the memcg limit?
> >
> > The global limit hit reclaim behavior was previously asynchronous too.
> > We just added the round-robin part because now the zswap LRU is
> > cgroup-aware :)
> >
> > For the cgroup limit hit, however, we cannot make it asynchronous,
> > as it is a bit hairy to add a per-cgroup shrink_work. So, we just
> > perform the reclaim synchronously.
> >
> > The question is whether it makes sense to make the global limit
> > reclaim synchronous too. That is a task of its own IMO.
>
> Let's add such context to the commit log, and perhaps an XXX comment
> in the code asking whether we should consider doing the reclaim
> synchronously for the global limit too.

Makes sense, I wonder if the original reason for switching from a synchronous
to asynchronous reclaim will still be valid with the shrinker in place.

>
> >
> > (FWIW, this somewhat mirrors the direct reclaimer v.s kswapd
> > story to me, but don't quote me too hard on this).
> >
> [..]
> >
> > >
> > > >         /* Hold a reference to prevent a free during writeback */
> > > >         zswap_entry_get(entry);
> > > >         spin_unlock(&tree->lock);
> > > >
> > > > -       ret = zswap_writeback_entry(entry, tree);
> > > > +       writeback_result = zswap_writeback_entry(entry, tree);
> > > >
> > > >         spin_lock(&tree->lock);
> > > > -       if (ret) {
> > > > -               /* Writeback failed, put entry back on LRU */
> > > > -               spin_lock(&pool->lru_lock);
> > > > -               list_move(&entry->lru, &pool->lru);
> > > > -               spin_unlock(&pool->lru_lock);
> > > > +       if (writeback_result) {
> > > > +               zswap_reject_reclaim_fail++;
> > > > +               memcg = get_mem_cgroup_from_entry(entry);
> > > > +               spin_lock(lock);
> > > > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > > > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > > > +               spin_unlock(lock);
> > > > +               mem_cgroup_put(memcg);
> > > > +               ret = LRU_RETRY;
> > > >                 goto put_unlock;
> > > >         }
> > > > +       zswap_written_back_pages++;
> > >
> > > Why is this moved here from zswap_writeback_entry()? Also why is
> > > zswap_reject_reclaim_fail incremented here instead of inside
> > > zswap_writeback_entry()?
> >
> > Domenico should know this better than me, but my understanding
> > is that moving it here protects concurrent modifications of
> > zswap_written_back_pages with the tree lock.
> >
> > Is writeback single-threaded in the past? This counter is non-atomic,
> > and doesn't seem to be protected by any locks...
> >
> > There definitely can be concurrent stores now though - with
> > a synchronous reclaim from cgroup-limit hit and another
> > from the old shrink worker.
> >
> > (and with the new zswap shrinker, concurrent reclaim is
> > the expectation!)
>
> The comment above the stats definition stats that they are left
> unprotected purposefully. If we want to fix that let's do it
> separately. If this patch makes it significantly worse such that it
> would cause a regression, let's at least do it in a separate patch.
> The diff here is too large already.
>
> >
> > zswap_reject_reclaim_fail was previously incremented in
> > shrink_worker I think. We need it to be incremented
> > for the shrinker as well, so might as well move it here.
>
> Wouldn't moving it inside zswap_writeback_entry() near incrementing
> zswap_written_back_pages make it easier to follow?

As Nhat said, zswap_reject_reclaim_fail++ had to be moved, I naturally moved it
here because it's where we act upon the result of the writeback. I then noticed
that zswap_written_back_pages++ was elsewhere and decided to move that as well
so that they're in the same place and at least they're under the tree lock.

It's not meant to fix the unprotected counters, it's just a mitigation since we
are forced to move at least one of them.
  
Yosry Ahmed Oct. 19, 2023, 4:14 p.m. UTC | #7
[..]
> > >
> > > +/*********************************
> > > +* lru functions
> > > +**********************************/
> > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > +{
> > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> >
> > Could we avoid the need for get/put with an rcu_read_lock() instead?
>
> I think we can, I'm not entirely sure of the consequences though. By the
> look of it I'd say it's safe but I wouldn't trust my judgement on this.

It just seems like we have a pattern of short-lived get/put. If RCU
gives enough protection it should be simpler. IIUC taking a reference
does not protect against offlining or reparenting, so I am not sure if
taking a reference here would provide any more protection than

>
> >
[..]
> > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > >         zswap_entry_put(tree, entry);
> > >  unlock:
> > >         spin_unlock(&tree->lock);
> > > -       return ret ? -EAGAIN : 0;
> > > +       spin_lock(lock);
> > > +       return ret;
> > > +}
> > > +
> > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > +{
> > > +       struct zswap_pool *pool;
> > > +       int nid, shrunk = 0;
> > > +
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               return -EINVAL;
> > > +
> > > +       /*
> > > +        * Skip zombies because their LRUs are reparented and we would be
> > > +        * reclaiming from the parent instead of the dead memcgroup.
> >
> > nit: s/memcgroup/memcg.
> >
> > > +        */
> > > +       if (memcg && !mem_cgroup_online(memcg))
> > > +               goto out;
> >
> > If we move this above zswap_pool_current_get(), we can return directly
> > and remove the label. I noticed we will return -EAGAIN if memcg is
> > offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> > but I am wondering if a different errno would be clearer here.
>
> True, I remember spending some time staring at error codes but couldn't find a
> better one. What if we use -EINVAL for retryable errors, and use something else
> for the one where there is no pool? -ENODEV?

Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more
appropriate as a return for offline memcgs?

>
> >
[..]
> > >  static void shrink_worker(struct work_struct *w)
> > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > >                                                 shrink_work);
> > >         int ret, failures = 0;
> > >
> > > +       /* global reclaim will select cgroup in a round-robin fashion. */
> > >         do {
> > > -               ret = zswap_reclaim_entry(pool);
> > > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> >
> > Perhaps next_shrink_memcg is a better name here?
>
> Will change if you have a strong preference, I'd keep it shorter because it's
> always used in conjunction with a memcg type or function.

I'd rather have the more explicit name unless it causes some annoying
line breaks or so.

>
> >
> > > +
> > > +               ret = shrink_memcg(pool->next_shrink);
> > > +
> > >                 if (ret) {
> > > -                       zswap_reject_reclaim_fail++;
> > >                         if (ret != -EAGAIN)
> > >                                 break;
> > >                         if (++failures == MAX_RECLAIM_RETRIES)
> > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > >          */
> > >         kref_init(&pool->kref);
> > >         INIT_LIST_HEAD(&pool->list);
> > > -       INIT_LIST_HEAD(&pool->lru);
> > > -       spin_lock_init(&pool->lru_lock);
> > > +       list_lru_init_memcg(&pool->list_lru, NULL);
> > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > >
> > >         zswap_pool_debug("created", pool);
> > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > >
> > >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > >         free_percpu(pool->acomp_ctx);
> > > +       list_lru_destroy(&pool->list_lru);
> > > +       if (pool->next_shrink)
> > > +               mem_cgroup_put(pool->next_shrink);
> > >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > >                 zpool_destroy_pool(pool->zpools[i]);
> > >         kfree(pool);
> > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >
> > >         /* try to allocate swap cache page */
> > >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > -                                      &page_was_allocated);
> > > +                                      &page_was_allocated, true);
> > >         if (!page) {
> > >                 ret = -ENOMEM;
> > >                 goto fail;
> > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > >         /* start writeback */
> > >         __swap_writepage(page, &wbc);
> > >         put_page(page);
> > > -       zswap_written_back_pages++;
> > >
> > >         return ret;
> > >
> > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > >         struct scatterlist input, output;
> > >         struct crypto_acomp_ctx *acomp_ctx;
> > >         struct obj_cgroup *objcg = NULL;
> > > +       struct mem_cgroup *memcg = NULL;
> > >         struct zswap_pool *pool;
> > >         struct zpool *zpool;
> > > +       int lru_alloc_ret;
> > >         unsigned int dlen = PAGE_SIZE;
> > >         unsigned long handle, value;
> > >         char *buf;
> > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > > -
> > > -       /*
> > > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > -        * local cgroup limits.
> > > -        */
> > >         objcg = get_obj_cgroup_from_folio(folio);
> > > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > > -               goto reject;
> > > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               if (shrink_memcg(memcg)) {
> > > +                       mem_cgroup_put(memcg);
> > > +                       goto reject;
> > > +               }
> > > +               mem_cgroup_put(memcg);
> > > +       }
> > >
> > >         /* reclaim space if needed */
> > >         if (zswap_is_full()) {
> > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > >                         zswap_pool_reached_full = false;
> > >         }
> > >
> > > +       pool = zswap_pool_current_get();
> > > +       if (!pool)
> > > +               goto reject;
> > > +
> >
> > Why do we need to move zswap_pool_current_get() up here?
>
> Ah, thanks. This is a leftover from a previous version where the pool was needed
> to allocate the entry.
>
>
> >
> > >         /* allocate entry */
> > > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > >         if (!entry) {
> > >                 zswap_reject_kmemcache_fail++;
> > > +               zswap_pool_put(pool);
> > >                 goto reject;
> > >         }
> > >
> > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > >                         entry->length = 0;
> > >                         entry->value = value;
> > >                         atomic_inc(&zswap_same_filled_pages);
> > > +                       zswap_pool_put(pool);
> > >                         goto insert_entry;
> > >                 }
> > >                 kunmap_atomic(src);
> > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > >                 goto freepage;
> > >
> > >         /* if entry is successfully added, it keeps the reference */
> > > -       entry->pool = zswap_pool_current_get();
> > > -       if (!entry->pool)
> > > -               goto freepage;
> > > +       entry->pool = pool;
> > > +       if (objcg) {
> > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > +               mem_cgroup_put(memcg);
> > > +
> > > +               if (lru_alloc_ret)
> > > +                       goto freepage;
> > > +       }
> > >
> > >         /* compress */
> > >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, dupentry);
> > >         }
> > >         if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_add(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               INIT_LIST_HEAD(&entry->lru);
> > > +               zswap_lru_add(&pool->list_lru, entry);
> > >         }
> > >         spin_unlock(&tree->lock);
> > >
> > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > >
> > >  put_dstmem:
> > >         mutex_unlock(acomp_ctx->mutex);
> > > -       zswap_pool_put(entry->pool);
> > >  freepage:
> > > +       zswap_pool_put(entry->pool);
> > >         zswap_entry_cache_free(entry);
> > >  reject:
> > >         if (objcg)
> > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > >                 zswap_invalidate_entry(tree, entry);
> > >                 folio_mark_dirty(folio);
> > >         } else if (entry->length) {
> > > -               spin_lock(&entry->pool->lru_lock);
> > > -               list_move(&entry->lru, &entry->pool->lru);
> > > -               spin_unlock(&entry->pool->lru_lock);
> > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > > +               zswap_lru_add(&entry->pool->list_lru, entry);
> > >         }
> > >         zswap_entry_put(tree, entry);
> > >         spin_unlock(&tree->lock);
> > > --
> > > 2.34.1
  
Yosry Ahmed Oct. 19, 2023, 4:28 p.m. UTC | #8
On Thu, Oct 19, 2023 at 5:47 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> On Thu, Oct 19, 2023 at 3:12 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Wed, Oct 18, 2023 at 4:47 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Wed, Oct 18, 2023 at 4:20 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> > > >
> > > > On Tue, Oct 17, 2023 at 4:21 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > > > >
> > > > > From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > > > >
> > > > > Currently, we only have a single global LRU for zswap. This makes it
> > > > > impossible to perform worload-specific shrinking - an memcg cannot
> > > > > determine which pages in the pool it owns, and often ends up writing
> > > > > pages from other memcgs. This issue has been previously observed in
> > > > > practice and mitigated by simply disabling memcg-initiated shrinking:
> > > > >
> > > > > https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
> > > > >
> > > > > This patch fully resolves the issue by replacing the global zswap LRU
> > > > > with memcg- and NUMA-specific LRUs, and modify the reclaim logic:
> > > > >
> > > > > a) When a store attempt hits an memcg limit, it now triggers a
> > > > >    synchronous reclaim attempt that, if successful, allows the new
> > > > >    hotter page to be accepted by zswap.
> > > > > b) If the store attempt instead hits the global zswap limit, it will
> > > > >    trigger an asynchronous reclaim attempt, in which an memcg is
> > > > >    selected for reclaim in a round-robin-like fashion.
> > > >
> > > > Could you explain the rationale behind the difference in behavior here
> > > > between the global limit and the memcg limit?
> > >
> > > The global limit hit reclaim behavior was previously asynchronous too.
> > > We just added the round-robin part because now the zswap LRU is
> > > cgroup-aware :)
> > >
> > > For the cgroup limit hit, however, we cannot make it asynchronous,
> > > as it is a bit hairy to add a per-cgroup shrink_work. So, we just
> > > perform the reclaim synchronously.
> > >
> > > The question is whether it makes sense to make the global limit
> > > reclaim synchronous too. That is a task of its own IMO.
> >
> > Let's add such context to the commit log, and perhaps an XXX comment
> > in the code asking whether we should consider doing the reclaim
> > synchronously for the global limit too.
>
> Makes sense, I wonder if the original reason for switching from a synchronous
> to asynchronous reclaim will still be valid with the shrinker in place.
>

Seems like it was done as part of the hysteresis that stops accepting
pages into zswap once full until it reaches a certain threshold,
commit 45190f01dd40 ("mm/zswap.c: add allocation hysteresis if pool
limit is hit"). I guess the point is that zswap will stop accepting
pages when the limit is hit anyway, so no need to synchronously shrink
it since we can't store soon anyway. More useful context for the
commit log.

> >
> > >
> > > (FWIW, this somewhat mirrors the direct reclaimer v.s kswapd
> > > story to me, but don't quote me too hard on this).
> > >
> > [..]
> > >
> > > >
> > > > >         /* Hold a reference to prevent a free during writeback */
> > > > >         zswap_entry_get(entry);
> > > > >         spin_unlock(&tree->lock);
> > > > >
> > > > > -       ret = zswap_writeback_entry(entry, tree);
> > > > > +       writeback_result = zswap_writeback_entry(entry, tree);
> > > > >
> > > > >         spin_lock(&tree->lock);
> > > > > -       if (ret) {
> > > > > -               /* Writeback failed, put entry back on LRU */
> > > > > -               spin_lock(&pool->lru_lock);
> > > > > -               list_move(&entry->lru, &pool->lru);
> > > > > -               spin_unlock(&pool->lru_lock);
> > > > > +       if (writeback_result) {
> > > > > +               zswap_reject_reclaim_fail++;
> > > > > +               memcg = get_mem_cgroup_from_entry(entry);
> > > > > +               spin_lock(lock);
> > > > > +               /* we cannot use zswap_lru_add here, because it increments node's lru count */
> > > > > +               list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
> > > > > +               spin_unlock(lock);
> > > > > +               mem_cgroup_put(memcg);
> > > > > +               ret = LRU_RETRY;
> > > > >                 goto put_unlock;
> > > > >         }
> > > > > +       zswap_written_back_pages++;
> > > >
> > > > Why is this moved here from zswap_writeback_entry()? Also why is
> > > > zswap_reject_reclaim_fail incremented here instead of inside
> > > > zswap_writeback_entry()?
> > >
> > > Domenico should know this better than me, but my understanding
> > > is that moving it here protects concurrent modifications of
> > > zswap_written_back_pages with the tree lock.
> > >
> > > Is writeback single-threaded in the past? This counter is non-atomic,
> > > and doesn't seem to be protected by any locks...
> > >
> > > There definitely can be concurrent stores now though - with
> > > a synchronous reclaim from cgroup-limit hit and another
> > > from the old shrink worker.
> > >
> > > (and with the new zswap shrinker, concurrent reclaim is
> > > the expectation!)
> >
> > The comment above the stats definition stats that they are left
> > unprotected purposefully. If we want to fix that let's do it
> > separately. If this patch makes it significantly worse such that it
> > would cause a regression, let's at least do it in a separate patch.
> > The diff here is too large already.
> >
> > >
> > > zswap_reject_reclaim_fail was previously incremented in
> > > shrink_worker I think. We need it to be incremented
> > > for the shrinker as well, so might as well move it here.
> >
> > Wouldn't moving it inside zswap_writeback_entry() near incrementing
> > zswap_written_back_pages make it easier to follow?
>
> As Nhat said, zswap_reject_reclaim_fail++ had to be moved, I naturally moved it
> here because it's where we act upon the result of the writeback. I then noticed
> that zswap_written_back_pages++ was elsewhere and decided to move that as well
> so that they're in the same place and at least they're under the tree lock.
>
> It's not meant to fix the unprotected counters, it's just a mitigation since we
> are forced to move at least one of them.

I see. Just for my own understanding, it would be correct to update
them in zswap_writeback_entry(), but we choose to do it here as we
happen to hold the lock so we get the free synchronization? I think it
could be beneficial as we may see increased concurrent writeback with
this series. Probably something to call out in the commit log as well.
  
Nhat Pham Oct. 20, 2023, 7:58 p.m. UTC | #9
On Thu, Oct 19, 2023 at 9:15 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> [..]
> > > >
> > > > +/*********************************
> > > > +* lru functions
> > > > +**********************************/
> > > > +static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> > > > +{
> > > > +       struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
> > >
> > > Could we avoid the need for get/put with an rcu_read_lock() instead?
> >
> > I think we can, I'm not entirely sure of the consequences though. By the
> > look of it I'd say it's safe but I wouldn't trust my judgement on this.
>
> It just seems like we have a pattern of short-lived get/put. If RCU
> gives enough protection it should be simpler. IIUC taking a reference
> does not protect against offlining or reparenting, so I am not sure if
> taking a reference here would provide any more protection than
>

I'd keep it for now. Sounds like an optimization to me, which could
always be done as a follow-up :)

Unless of course, if somebody has a really strong opinion regarding
this.

> >
> > >
> [..]
> > > > @@ -686,7 +716,36 @@ static int zswap_reclaim_entry(struct zswap_pool *pool)
> > > >         zswap_entry_put(tree, entry);
> > > >  unlock:
> > > >         spin_unlock(&tree->lock);
> > > > -       return ret ? -EAGAIN : 0;
> > > > +       spin_lock(lock);
> > > > +       return ret;
> > > > +}
> > > > +
> > > > +static int shrink_memcg(struct mem_cgroup *memcg)
> > > > +{
> > > > +       struct zswap_pool *pool;
> > > > +       int nid, shrunk = 0;
> > > > +
> > > > +       pool = zswap_pool_current_get();
> > > > +       if (!pool)
> > > > +               return -EINVAL;
> > > > +
> > > > +       /*
> > > > +        * Skip zombies because their LRUs are reparented and we would be
> > > > +        * reclaiming from the parent instead of the dead memcgroup.
> > >
> > > nit: s/memcgroup/memcg.
> > >
> > > > +        */
> > > > +       if (memcg && !mem_cgroup_online(memcg))
> > > > +               goto out;
> > >
> > > If we move this above zswap_pool_current_get(), we can return directly
> > > and remove the label. I noticed we will return -EAGAIN if memcg is
> > > offline. IIUC -EAGAIN for the caller will move on to the next memcg,
> > > but I am wondering if a different errno would be clearer here.
> >
> > True, I remember spending some time staring at error codes but couldn't find a
> > better one. What if we use -EINVAL for retryable errors, and use something else
> > for the one where there is no pool? -ENODEV?
>
> Do you mean -EINVAL for non-retryable errors? Perhaps -ENOENT is more
> appropriate as a return for offline memcgs?
>
> >
> > >
> [..]
> > > >  static void shrink_worker(struct work_struct *w)
> > > > @@ -695,10 +754,13 @@ static void shrink_worker(struct work_struct *w)
> > > >                                                 shrink_work);
> > > >         int ret, failures = 0;
> > > >
> > > > +       /* global reclaim will select cgroup in a round-robin fashion. */
> > > >         do {
> > > > -               ret = zswap_reclaim_entry(pool);
> > > > +               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> > >
> > > Perhaps next_shrink_memcg is a better name here?
> >
> > Will change if you have a strong preference, I'd keep it shorter because it's
> > always used in conjunction with a memcg type or function.
>
> I'd rather have the more explicit name unless it causes some annoying
> line breaks or so.
>
> >
> > >
> > > > +
> > > > +               ret = shrink_memcg(pool->next_shrink);
> > > > +
> > > >                 if (ret) {
> > > > -                       zswap_reject_reclaim_fail++;
> > > >                         if (ret != -EAGAIN)
> > > >                                 break;
> > > >                         if (++failures == MAX_RECLAIM_RETRIES)
> > > > @@ -764,8 +826,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > > >          */
> > > >         kref_init(&pool->kref);
> > > >         INIT_LIST_HEAD(&pool->list);
> > > > -       INIT_LIST_HEAD(&pool->lru);
> > > > -       spin_lock_init(&pool->lru_lock);
> > > > +       list_lru_init_memcg(&pool->list_lru, NULL);
> > > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > > >
> > > >         zswap_pool_debug("created", pool);
> > > > @@ -831,6 +892,9 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> > > >
> > > >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> > > >         free_percpu(pool->acomp_ctx);
> > > > +       list_lru_destroy(&pool->list_lru);
> > > > +       if (pool->next_shrink)
> > > > +               mem_cgroup_put(pool->next_shrink);
> > > >         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > > >                 zpool_destroy_pool(pool->zpools[i]);
> > > >         kfree(pool);
> > > > @@ -1076,7 +1140,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > > >
> > > >         /* try to allocate swap cache page */
> > > >         page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
> > > > -                                      &page_was_allocated);
> > > > +                                      &page_was_allocated, true);
> > > >         if (!page) {
> > > >                 ret = -ENOMEM;
> > > >                 goto fail;
> > > > @@ -1142,7 +1206,6 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> > > >         /* start writeback */
> > > >         __swap_writepage(page, &wbc);
> > > >         put_page(page);
> > > > -       zswap_written_back_pages++;
> > > >
> > > >         return ret;
> > > >
> > > > @@ -1199,8 +1262,10 @@ bool zswap_store(struct folio *folio)
> > > >         struct scatterlist input, output;
> > > >         struct crypto_acomp_ctx *acomp_ctx;
> > > >         struct obj_cgroup *objcg = NULL;
> > > > +       struct mem_cgroup *memcg = NULL;
> > > >         struct zswap_pool *pool;
> > > >         struct zpool *zpool;
> > > > +       int lru_alloc_ret;
> > > >         unsigned int dlen = PAGE_SIZE;
> > > >         unsigned long handle, value;
> > > >         char *buf;
> > > > @@ -1230,15 +1295,15 @@ bool zswap_store(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, dupentry);
> > > >         }
> > > >         spin_unlock(&tree->lock);
> > > > -
> > > > -       /*
> > > > -        * XXX: zswap reclaim does not work with cgroups yet. Without a
> > > > -        * cgroup-aware entry LRU, we will push out entries system-wide based on
> > > > -        * local cgroup limits.
> > > > -        */
> > > >         objcg = get_obj_cgroup_from_folio(folio);
> > > > -       if (objcg && !obj_cgroup_may_zswap(objcg))
> > > > -               goto reject;
> > > > +       if (objcg && !obj_cgroup_may_zswap(objcg)) {
> > > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > > +               if (shrink_memcg(memcg)) {
> > > > +                       mem_cgroup_put(memcg);
> > > > +                       goto reject;
> > > > +               }
> > > > +               mem_cgroup_put(memcg);
> > > > +       }
> > > >
> > > >         /* reclaim space if needed */
> > > >         if (zswap_is_full()) {
> > > > @@ -1254,10 +1319,15 @@ bool zswap_store(struct folio *folio)
> > > >                         zswap_pool_reached_full = false;
> > > >         }
> > > >
> > > > +       pool = zswap_pool_current_get();
> > > > +       if (!pool)
> > > > +               goto reject;
> > > > +
> > >
> > > Why do we need to move zswap_pool_current_get() up here?
> >
> > Ah, thanks. This is a leftover from a previous version where the pool was needed
> > to allocate the entry.
> >
> >
> > >
> > > >         /* allocate entry */
> > > > -       entry = zswap_entry_cache_alloc(GFP_KERNEL);
> > > > +       entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > > >         if (!entry) {
> > > >                 zswap_reject_kmemcache_fail++;
> > > > +               zswap_pool_put(pool);
> > > >                 goto reject;
> > > >         }
> > > >
> > > > @@ -1269,6 +1339,7 @@ bool zswap_store(struct folio *folio)
> > > >                         entry->length = 0;
> > > >                         entry->value = value;
> > > >                         atomic_inc(&zswap_same_filled_pages);
> > > > +                       zswap_pool_put(pool);
> > > >                         goto insert_entry;
> > > >                 }
> > > >                 kunmap_atomic(src);
> > > > @@ -1278,9 +1349,15 @@ bool zswap_store(struct folio *folio)
> > > >                 goto freepage;
> > > >
> > > >         /* if entry is successfully added, it keeps the reference */
> > > > -       entry->pool = zswap_pool_current_get();
> > > > -       if (!entry->pool)
> > > > -               goto freepage;
> > > > +       entry->pool = pool;
> > > > +       if (objcg) {
> > > > +               memcg = get_mem_cgroup_from_objcg(objcg);
> > > > +               lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
> > > > +               mem_cgroup_put(memcg);
> > > > +
> > > > +               if (lru_alloc_ret)
> > > > +                       goto freepage;
> > > > +       }
> > > >
> > > >         /* compress */
> > > >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > > > @@ -1358,9 +1435,8 @@ bool zswap_store(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, dupentry);
> > > >         }
> > > >         if (entry->length) {
> > > > -               spin_lock(&entry->pool->lru_lock);
> > > > -               list_add(&entry->lru, &entry->pool->lru);
> > > > -               spin_unlock(&entry->pool->lru_lock);
> > > > +               INIT_LIST_HEAD(&entry->lru);
> > > > +               zswap_lru_add(&pool->list_lru, entry);
> > > >         }
> > > >         spin_unlock(&tree->lock);
> > > >
> > > > @@ -1373,8 +1449,8 @@ bool zswap_store(struct folio *folio)
> > > >
> > > >  put_dstmem:
> > > >         mutex_unlock(acomp_ctx->mutex);
> > > > -       zswap_pool_put(entry->pool);
> > > >  freepage:
> > > > +       zswap_pool_put(entry->pool);
> > > >         zswap_entry_cache_free(entry);
> > > >  reject:
> > > >         if (objcg)
> > > > @@ -1467,9 +1543,8 @@ bool zswap_load(struct folio *folio)
> > > >                 zswap_invalidate_entry(tree, entry);
> > > >                 folio_mark_dirty(folio);
> > > >         } else if (entry->length) {
> > > > -               spin_lock(&entry->pool->lru_lock);
> > > > -               list_move(&entry->lru, &entry->pool->lru);
> > > > -               spin_unlock(&entry->pool->lru_lock);
> > > > +               zswap_lru_del(&entry->pool->list_lru, entry);
> > > > +               zswap_lru_add(&entry->pool->list_lru, entry);
> > > >         }
> > > >         zswap_entry_put(tree, entry);
> > > >         spin_unlock(&tree->lock);
> > > > --
> > > > 2.34.1
  

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 031102ac9311..3de10fabea0f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1179,6 +1179,11 @@  static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;
diff --git a/mm/swap.h b/mm/swap.h
index 8a3c7a0ace4f..bbd6ce661a20 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -50,7 +50,8 @@  struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct vm_area_struct *vma,
 				     unsigned long addr,
-				     bool *new_page_allocated);
+				     bool *new_page_allocated,
+				     bool fail_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct vm_fault *vmf);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b3b14bd0dd64..0356df52b06a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -411,7 +411,7 @@  struct folio *filemap_get_incore_folio(struct address_space *mapping,
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr,
-			bool *new_page_allocated)
+			bool *new_page_allocated, bool fail_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -468,6 +468,15 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (err != -EEXIST)
 			goto fail_put_swap;
 
+		/*
+		 * This check guards against a state that happens if a call
+		 * to __read_swap_cache_async triggers a reclaim, if the
+		 * reclaimer (zswap's writeback as of now) then decides to
+		 * reclaim that same entry, then the subsequent call to
+		 * __read_swap_cache_async would get stuck in this loop.
+		 */
+		if (fail_if_exists && err == -EEXIST)
+			goto fail_put_swap;
 		/*
 		 * We might race against __delete_from_swap_cache(), and
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
@@ -530,7 +539,7 @@  struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 {
 	bool page_was_allocated;
 	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
-			vma, addr, &page_was_allocated);
+			vma, addr, &page_was_allocated, false);
 
 	if (page_was_allocated)
 		swap_readpage(retpage, false, plug);
@@ -649,7 +658,7 @@  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 			swp_entry(swp_type(entry), offset),
-			gfp_mask, vma, addr, &page_allocated);
+			gfp_mask, vma, addr, &page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -815,7 +824,7 @@  static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		page = __read_swap_cache_async(entry, gfp_mask, vma,
-					       addr, &page_allocated);
+					       addr, &page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 083c693602b8..d2989ad11814 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -34,6 +34,7 @@ 
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
 #include <linux/workqueue.h>
+#include <linux/list_lru.h>
 
 #include "swap.h"
 #include "internal.h"
@@ -171,8 +172,8 @@  struct zswap_pool {
 	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
-	struct list_head lru;
-	spinlock_t lru_lock;
+	struct list_lru list_lru;
+	struct mem_cgroup *next_shrink;
 };
 
 /*
@@ -288,15 +289,25 @@  static void zswap_update_total_size(void)
 	zswap_pool_total_size = total;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
+{
+	return entry->objcg ? get_mem_cgroup_from_objcg(entry->objcg) : NULL;
+}
+
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+	return page_to_nid(virt_to_page(entry));
+}
+
 /*********************************
 * zswap entry functions
 **********************************/
 static struct kmem_cache *zswap_entry_cache;
 
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
 {
 	struct zswap_entry *entry;
-	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
@@ -309,6 +320,27 @@  static void zswap_entry_cache_free(struct zswap_entry *entry)
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
+/*********************************
+* lru functions
+**********************************/
+static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
+	bool added = __list_lru_add(list_lru, &entry->lru, entry_to_nid(entry), memcg);
+
+	mem_cgroup_put(memcg);
+	return added;
+}
+
+static bool zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
+{
+	struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry);
+	bool removed = __list_lru_del(list_lru, &entry->lru, entry_to_nid(entry), memcg);
+
+	mem_cgroup_put(memcg);
+	return removed;
+}
+
 /*********************************
 * rbtree functions
 **********************************/
@@ -393,9 +425,7 @@  static void zswap_free_entry(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
-		spin_lock(&entry->pool->lru_lock);
-		list_del(&entry->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
 		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
@@ -629,21 +659,16 @@  static void zswap_invalidate_entry(struct zswap_tree *tree,
 		zswap_entry_put(tree, entry);
 }
 
-static int zswap_reclaim_entry(struct zswap_pool *pool)
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+				       spinlock_t *lock, void *arg)
 {
-	struct zswap_entry *entry;
+	struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+	struct mem_cgroup *memcg;
 	struct zswap_tree *tree;
 	pgoff_t swpoffset;
-	int ret;
+	enum lru_status ret = LRU_REMOVED_RETRY;
+	int writeback_result;
 
-	/* Get an entry off the LRU */
-	spin_lock(&pool->lru_lock);
-	if (list_empty(&pool->lru)) {
-		spin_unlock(&pool->lru_lock);
-		return -EINVAL;
-	}
-	entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
-	list_del_init(&entry->lru);
 	/*
 	 * Once the lru lock is dropped, the entry might get freed. The
 	 * swpoffset is copied to the stack, and entry isn't deref'd again
@@ -651,28 +676,33 @@  static int zswap_reclaim_entry(struct zswap_pool *pool)
 	 */
 	swpoffset = swp_offset(entry->swpentry);
 	tree = zswap_trees[swp_type(entry->swpentry)];
-	spin_unlock(&pool->lru_lock);
+	list_lru_isolate(l, item);
+	spin_unlock(lock);
 
 	/* Check for invalidate() race */
 	spin_lock(&tree->lock);
 	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
-		ret = -EAGAIN;
 		goto unlock;
 	}
 	/* Hold a reference to prevent a free during writeback */
 	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
 
-	ret = zswap_writeback_entry(entry, tree);
+	writeback_result = zswap_writeback_entry(entry, tree);
 
 	spin_lock(&tree->lock);
-	if (ret) {
-		/* Writeback failed, put entry back on LRU */
-		spin_lock(&pool->lru_lock);
-		list_move(&entry->lru, &pool->lru);
-		spin_unlock(&pool->lru_lock);
+	if (writeback_result) {
+		zswap_reject_reclaim_fail++;
+		memcg = get_mem_cgroup_from_entry(entry);
+		spin_lock(lock);
+		/* we cannot use zswap_lru_add here, because it increments node's lru count */
+		list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg);
+		spin_unlock(lock);
+		mem_cgroup_put(memcg);
+		ret = LRU_RETRY;
 		goto put_unlock;
 	}
+	zswap_written_back_pages++;
 
 	/*
 	 * Writeback started successfully, the page now belongs to the
@@ -686,7 +716,36 @@  static int zswap_reclaim_entry(struct zswap_pool *pool)
 	zswap_entry_put(tree, entry);
 unlock:
 	spin_unlock(&tree->lock);
-	return ret ? -EAGAIN : 0;
+	spin_lock(lock);
+	return ret;
+}
+
+static int shrink_memcg(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+	int nid, shrunk = 0;
+
+	pool = zswap_pool_current_get();
+	if (!pool)
+		return -EINVAL;
+
+	/*
+	 * Skip zombies because their LRUs are reparented and we would be
+	 * reclaiming from the parent instead of the dead memcgroup.
+	 */
+	if (memcg && !mem_cgroup_online(memcg))
+		goto out;
+
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
+		unsigned long nr_to_walk = 1;
+
+		if (list_lru_walk_one(&pool->list_lru, nid, memcg, &shrink_memcg_cb,
+				      NULL, &nr_to_walk))
+			shrunk++;
+	}
+out:
+	zswap_pool_put(pool);
+	return shrunk ? 0 : -EAGAIN;
 }
 
 static void shrink_worker(struct work_struct *w)
@@ -695,10 +754,13 @@  static void shrink_worker(struct work_struct *w)
 						shrink_work);
 	int ret, failures = 0;
 
+	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
-		ret = zswap_reclaim_entry(pool);
+		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
+
+		ret = shrink_memcg(pool->next_shrink);
+
 		if (ret) {
-			zswap_reject_reclaim_fail++;
 			if (ret != -EAGAIN)
 				break;
 			if (++failures == MAX_RECLAIM_RETRIES)
@@ -764,8 +826,7 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	INIT_LIST_HEAD(&pool->lru);
-	spin_lock_init(&pool->lru_lock);
+	list_lru_init_memcg(&pool->list_lru, NULL);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
 
 	zswap_pool_debug("created", pool);
@@ -831,6 +892,9 @@  static void zswap_pool_destroy(struct zswap_pool *pool)
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
+	list_lru_destroy(&pool->list_lru);
+	if (pool->next_shrink)
+		mem_cgroup_put(pool->next_shrink);
 	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
 		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
@@ -1076,7 +1140,7 @@  static int zswap_writeback_entry(struct zswap_entry *entry,
 
 	/* try to allocate swap cache page */
 	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+				       &page_was_allocated, true);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
@@ -1142,7 +1206,6 @@  static int zswap_writeback_entry(struct zswap_entry *entry,
 	/* start writeback */
 	__swap_writepage(page, &wbc);
 	put_page(page);
-	zswap_written_back_pages++;
 
 	return ret;
 
@@ -1199,8 +1262,10 @@  bool zswap_store(struct folio *folio)
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
+	struct mem_cgroup *memcg = NULL;
 	struct zswap_pool *pool;
 	struct zpool *zpool;
+	int lru_alloc_ret;
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle, value;
 	char *buf;
@@ -1230,15 +1295,15 @@  bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
-
-	/*
-	 * XXX: zswap reclaim does not work with cgroups yet. Without a
-	 * cgroup-aware entry LRU, we will push out entries system-wide based on
-	 * local cgroup limits.
-	 */
 	objcg = get_obj_cgroup_from_folio(folio);
-	if (objcg && !obj_cgroup_may_zswap(objcg))
-		goto reject;
+	if (objcg && !obj_cgroup_may_zswap(objcg)) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (shrink_memcg(memcg)) {
+			mem_cgroup_put(memcg);
+			goto reject;
+		}
+		mem_cgroup_put(memcg);
+	}
 
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
@@ -1254,10 +1319,15 @@  bool zswap_store(struct folio *folio)
 			zswap_pool_reached_full = false;
 	}
 
+	pool = zswap_pool_current_get();
+	if (!pool)
+		goto reject;
+
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
+		zswap_pool_put(pool);
 		goto reject;
 	}
 
@@ -1269,6 +1339,7 @@  bool zswap_store(struct folio *folio)
 			entry->length = 0;
 			entry->value = value;
 			atomic_inc(&zswap_same_filled_pages);
+			zswap_pool_put(pool);
 			goto insert_entry;
 		}
 		kunmap_atomic(src);
@@ -1278,9 +1349,15 @@  bool zswap_store(struct folio *folio)
 		goto freepage;
 
 	/* if entry is successfully added, it keeps the reference */
-	entry->pool = zswap_pool_current_get();
-	if (!entry->pool)
-		goto freepage;
+	entry->pool = pool;
+	if (objcg) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		lru_alloc_ret = memcg_list_lru_alloc(memcg, &pool->list_lru, GFP_KERNEL);
+		mem_cgroup_put(memcg);
+
+		if (lru_alloc_ret)
+			goto freepage;
+	}
 
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1358,9 +1435,8 @@  bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_add(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		INIT_LIST_HEAD(&entry->lru);
+		zswap_lru_add(&pool->list_lru, entry);
 	}
 	spin_unlock(&tree->lock);
 
@@ -1373,8 +1449,8 @@  bool zswap_store(struct folio *folio)
 
 put_dstmem:
 	mutex_unlock(acomp_ctx->mutex);
-	zswap_pool_put(entry->pool);
 freepage:
+	zswap_pool_put(entry->pool);
 	zswap_entry_cache_free(entry);
 reject:
 	if (objcg)
@@ -1467,9 +1543,8 @@  bool zswap_load(struct folio *folio)
 		zswap_invalidate_entry(tree, entry);
 		folio_mark_dirty(folio);
 	} else if (entry->length) {
-		spin_lock(&entry->pool->lru_lock);
-		list_move(&entry->lru, &entry->pool->lru);
-		spin_unlock(&entry->pool->lru_lock);
+		zswap_lru_del(&entry->pool->list_lru, entry);
+		zswap_lru_add(&entry->pool->list_lru, entry);
 	}
 	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);