[RFC,v2,1/7] mm: zswap: add pool shrinking mechanism

Message ID 20230606145611.704392-2-cerasuolodomenico@gmail.com
State New
Series mm: zswap: move writeback LRU from zpool to zswap

Commit Message

Domenico Cerasuolo June 6, 2023, 2:56 p.m. UTC
  Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
function, which is called from zpool_shrink. However, with this commit,
a unified shrink function is added to zswap. The ultimate goal is to
eliminate the need for zpool_shrink once all zpool implementations have
dropped their shrink code.

To ensure the functionality of each commit, this change focuses solely
on adding the mechanism itself. No modifications are made to
the backends, meaning that functionally, there are no immediate changes.
The zswap mechanism will only come into effect once the backends have
removed their shrink code. The subsequent commits will address the
modifications needed in the backends.

Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
 mm/zswap.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 5 deletions(-)
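
At a glance, the patch adds a per-pool LRU (pool->lru protected by pool->lru_lock, plus a list_head in each zswap_entry) and teaches shrink_worker() to fall back to zswap-side reclaim when the backend no longer advertises zpool_evictable(). The sketch below is condensed from the diff quoted in the discussion that follows; the retry and failure accounting is the pre-existing shrink_worker() body, reproduced only for context and unchanged by this patch.

static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool),
						shrink_work);
	int ret, failures = 0;

	/* zpool_evictable will be removed once all 3 backends have migrated */
	do {
		if (zpool_evictable(pool->zpool))
			ret = zpool_shrink(pool->zpool, 1, NULL);	/* backend-driven reclaim (old path) */
		else
			ret = zswap_shrink(pool);			/* new zswap-side LRU reclaim */
		if (ret) {
			zswap_reject_reclaim_fail++;
			if (ret != -EAGAIN)
				break;
			if (++failures == MAX_RECLAIM_RETRIES)
				break;
		}
		cond_resched();
	} while (!zswap_can_accept());
	zswap_pool_put(pool);
}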
  

Comments

Yosry Ahmed June 7, 2023, 8:14 a.m. UTC | #1
On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> function, which is called from zpool_shrink. However, with this commit,
> a unified shrink function is added to zswap. The ultimate goal is to
> eliminate the need for zpool_shrink once all zpool implementations have
> dropped their shrink code.
>
> To ensure the functionality of each commit, this change focuses solely
> on adding the mechanism itself. No modifications are made to
> the backends, meaning that functionally, there are no immediate changes.
> The zswap mechanism will only come into effect once the backends have
> removed their shrink code. The subsequent commits will address the
> modifications needed in the backends.
>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> ---
>  mm/zswap.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 91 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index bcb82e09eb64..c99bafcefecf 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -150,6 +150,12 @@ struct crypto_acomp_ctx {
>         struct mutex *mutex;
>  };
>
> +/*
> + * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
> + * The only case where lru_lock is not acquired while holding tree.lock is
> + * when a zswap_entry is taken off the lru for writeback, in that case it
> + * needs to be verified that it's still valid in the tree.
> + */
>  struct zswap_pool {
>         struct zpool *zpool;
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
> @@ -159,6 +165,8 @@ struct zswap_pool {
>         struct work_struct shrink_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +       struct list_head lru;
> +       spinlock_t lru_lock;
>  };
>
>  /*
> @@ -176,10 +184,12 @@ struct zswap_pool {
>   *            be held while changing the refcount.  Since the lock must
>   *            be held, there is no reason to also make refcount atomic.
>   * length - the length in bytes of the compressed page data.  Needed during
> - *          decompression. For a same value filled page length is 0.
> + *          decompression. For a same value filled page length is 0, and both
> + *          pool and lru are invalid and must be ignored.
>   * pool - the zswap_pool the entry's data is in
>   * handle - zpool allocation handle that stores the compressed page data
>   * value - value of the same-value filled pages which have same content
> + * lru - handle to the pool's lru used to evict pages.
>   */
>  struct zswap_entry {
>         struct rb_node rbnode;
> @@ -192,6 +202,7 @@ struct zswap_entry {
>                 unsigned long value;
>         };
>         struct obj_cgroup *objcg;
> +       struct list_head lru;
>  };
>
>  struct zswap_header {
> @@ -364,6 +375,12 @@ static void zswap_free_entry(struct zswap_entry *entry)
>         if (!entry->length)
>                 atomic_dec(&zswap_same_filled_pages);
>         else {
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +               if (!zpool_evictable(entry->pool->zpool)) {
> +                       spin_lock(&entry->pool->lru_lock);
> +                       list_del(&entry->lru);
> +                       spin_unlock(&entry->pool->lru_lock);
> +               }
>                 zpool_free(entry->pool->zpool, entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
> @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>         return NULL;
>  }
>
> +static int zswap_shrink(struct zswap_pool *pool)

Nit: rename to zswap_shrink_one() so that it's clear we always
writeback one entry per call?

> +{
> +       struct zswap_entry *lru_entry, *tree_entry = NULL;
> +       struct zswap_header *zhdr;
> +       struct zswap_tree *tree;
> +       int swpoffset;
> +       int ret;
> +
> +       /* get a reclaimable entry from LRU */
> +       spin_lock(&pool->lru_lock);
> +       if (list_empty(&pool->lru)) {
> +               spin_unlock(&pool->lru_lock);
> +               return -EINVAL;
> +       }
> +       lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> +       list_del_init(&lru_entry->lru);
> +       zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> +       tree = zswap_trees[swp_type(zhdr->swpentry)];
> +       zpool_unmap_handle(pool->zpool, lru_entry->handle);
> +       /*
> +        * Once the pool lock is dropped, the lru_entry might get freed. The

Nit: lru lock*

> +        * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> +        * until the entry is verified to still be alive in the tree.
> +        */
> +       swpoffset = swp_offset(zhdr->swpentry);
> +       spin_unlock(&pool->lru_lock);
> +
> +       /* hold a reference from tree so it won't be freed during writeback */
> +       spin_lock(&tree->lock);
> +       tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> +       if (tree_entry != lru_entry) {
> +               if (tree_entry)
> +                       zswap_entry_put(tree, tree_entry);
> +               spin_unlock(&tree->lock);
> +               return -EAGAIN;
> +       }
> +       spin_unlock(&tree->lock);
> +
> +       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> +       spin_lock(&tree->lock);
> +       if (ret) {
> +               spin_lock(&pool->lru_lock);
> +               list_move(&lru_entry->lru, &pool->lru);
> +               spin_unlock(&pool->lru_lock);
> +       }
> +       zswap_entry_put(tree, tree_entry);
> +       spin_unlock(&tree->lock);
> +
> +       return ret ? -EAGAIN : 0;
> +}
> +
>  static void shrink_worker(struct work_struct *w)
>  {
>         struct zswap_pool *pool = container_of(w, typeof(*pool),
>                                                 shrink_work);
>         int ret, failures = 0;
>
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
>         do {
> -               ret = zpool_shrink(pool->zpool, 1, NULL);
> +               if (zpool_evictable(pool->zpool))
> +                       ret = zpool_shrink(pool->zpool, 1, NULL);
> +               else
> +                       ret = zswap_shrink(pool);
>                 if (ret) {
>                         zswap_reject_reclaim_fail++;
>                         if (ret != -EAGAIN)
> @@ -655,6 +728,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>          */
>         kref_init(&pool->kref);
>         INIT_LIST_HEAD(&pool->list);
> +       INIT_LIST_HEAD(&pool->lru);
> +       spin_lock_init(&pool->lru_lock);
>         INIT_WORK(&pool->shrink_work, shrink_worker);
>
>         zswap_pool_debug("created", pool);
> @@ -1270,7 +1345,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* store */
> -       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> +       hlen = sizeof(zhdr);
>         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
>         if (zpool_malloc_support_movable(entry->pool->zpool))
>                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1388,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>                         zswap_entry_put(tree, dupentry);
>                 }
>         } while (ret == -EEXIST);
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> +               spin_lock(&entry->pool->lru_lock);
> +               list_add(&entry->lru, &entry->pool->lru);
> +               spin_unlock(&entry->pool->lru_lock);
> +       }
>         spin_unlock(&tree->lock);
>
>         /* update stats */
> @@ -1384,8 +1465,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         /* decompress */
>         dlen = PAGE_SIZE;
>         src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> -       if (zpool_evictable(entry->pool->zpool))
> -               src += sizeof(struct zswap_header);
> +       src += sizeof(struct zswap_header);
>
>         if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
>                 memcpy(tmp, src, entry->length);
> @@ -1415,6 +1495,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  freeentry:
>         spin_lock(&tree->lock);
>         zswap_entry_put(tree, entry);
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> +               spin_lock(&entry->pool->lru_lock);
> +               list_move(&entry->lru, &entry->pool->lru);
> +               spin_unlock(&entry->pool->lru_lock);
> +       }

It's not really this patch's fault, but when merged with commit
fe1d1f7d0fb5 ("mm: zswap: support exclusive loads") from mm-unstable
[1], and with CONFIG_ZSWAP_EXCLUSIVE_LOADS=y, this causes a crash.

This happens because fe1d1f7d0fb5 makes the loads exclusive, so
zswap_entry_put(tree, entry) above the added code causes the entry to
be freed, then we go ahead and dereference multiple fields within it in
the added chunk. Moving the chunk above zswap_entry_put() (and
consequently also above zswap_invalidate_entry() from fe1d1f7d0fb5)
makes this work correctly.

Perhaps it would be useful to rebase on top of fe1d1f7d0fb5 for your
next version(s), if any.

Maybe the outcome would be something like:

zswap_entry_put(tree, entry);
if (!ret && IS_ENABLED(CONFIG_ZSWAP_EXCLUSIVE_LOADS)) {
        zswap_invalidate_entry(tree, entry);
} else if (entry->length && !zpool_evictable(entry->pool->zpool)) {
        spin_lock(&entry->pool->lru_lock);
        list_move(&entry->lru, &entry->pool->lru);
        spin_unlock(&entry->pool->lru_lock);
}

I am assuming if we are going to invalidate the entry anyway there is
no need to move it to the front of the lru -- but I didn't really
think it through.

[1]https://lore.kernel.org/lkml/20230530210251.493194-1-yosryahmed@google.com/

>         spin_unlock(&tree->lock);
>
>         return ret;
> --
> 2.34.1
>
  
Domenico Cerasuolo June 7, 2023, 9:22 a.m. UTC | #2
On Wed, Jun 7, 2023 at 10:14 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
> <cerasuolodomenico@gmail.com> wrote:
> >
> > Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> > function, which is called from zpool_shrink. However, with this commit,
> > a unified shrink function is added to zswap. The ultimate goal is to
> > eliminate the need for zpool_shrink once all zpool implementations have
> > dropped their shrink code.
> >
> > To ensure the functionality of each commit, this change focuses solely
> > on adding the mechanism itself. No modifications are made to
> > the backends, meaning that functionally, there are no immediate changes.
> > The zswap mechanism will only come into effect once the backends have
> > removed their shrink code. The subsequent commits will address the
> > modifications needed in the backends.
> >
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > ---
> >  mm/zswap.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 91 insertions(+), 5 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index bcb82e09eb64..c99bafcefecf 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -150,6 +150,12 @@ struct crypto_acomp_ctx {
> >         struct mutex *mutex;
> >  };
> >
> > +/*
> > + * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
> > + * The only case where lru_lock is not acquired while holding tree.lock is
> > + * when a zswap_entry is taken off the lru for writeback, in that case it
> > + * needs to be verified that it's still valid in the tree.
> > + */
> >  struct zswap_pool {
> >         struct zpool *zpool;
> >         struct crypto_acomp_ctx __percpu *acomp_ctx;
> > @@ -159,6 +165,8 @@ struct zswap_pool {
> >         struct work_struct shrink_work;
> >         struct hlist_node node;
> >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > +       struct list_head lru;
> > +       spinlock_t lru_lock;
> >  };
> >
> >  /*
> > @@ -176,10 +184,12 @@ struct zswap_pool {
> >   *            be held while changing the refcount.  Since the lock must
> >   *            be held, there is no reason to also make refcount atomic.
> >   * length - the length in bytes of the compressed page data.  Needed during
> > - *          decompression. For a same value filled page length is 0.
> > + *          decompression. For a same value filled page length is 0, and both
> > + *          pool and lru are invalid and must be ignored.
> >   * pool - the zswap_pool the entry's data is in
> >   * handle - zpool allocation handle that stores the compressed page data
> >   * value - value of the same-value filled pages which have same content
> > + * lru - handle to the pool's lru used to evict pages.
> >   */
> >  struct zswap_entry {
> >         struct rb_node rbnode;
> > @@ -192,6 +202,7 @@ struct zswap_entry {
> >                 unsigned long value;
> >         };
> >         struct obj_cgroup *objcg;
> > +       struct list_head lru;
> >  };
> >
> >  struct zswap_header {
> > @@ -364,6 +375,12 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >         if (!entry->length)
> >                 atomic_dec(&zswap_same_filled_pages);
> >         else {
> > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > +               if (!zpool_evictable(entry->pool->zpool)) {
> > +                       spin_lock(&entry->pool->lru_lock);
> > +                       list_del(&entry->lru);
> > +                       spin_unlock(&entry->pool->lru_lock);
> > +               }
> >                 zpool_free(entry->pool->zpool, entry->handle);
> >                 zswap_pool_put(entry->pool);
> >         }
> > @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> >         return NULL;
> >  }
> >
> > +static int zswap_shrink(struct zswap_pool *pool)
>
> Nit: rename to zswap_shrink_one() so that it's clear we always
> writeback one entry per call?

I named it like that to mirror zpool_shrink but I think that you've got a point
in that it might not be very clear that it is shrinking by one page only.
What about zswap_reclaim_entry? I'm not a native speaker, but with
zswap_shrink_one it wouldn't be obvious to me that the "one" refers to an
entry.

>
> > +{
> > +       struct zswap_entry *lru_entry, *tree_entry = NULL;
> > +       struct zswap_header *zhdr;
> > +       struct zswap_tree *tree;
> > +       int swpoffset;
> > +       int ret;
> > +
> > +       /* get a reclaimable entry from LRU */
> > +       spin_lock(&pool->lru_lock);
> > +       if (list_empty(&pool->lru)) {
> > +               spin_unlock(&pool->lru_lock);
> > +               return -EINVAL;
> > +       }
> > +       lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > +       list_del_init(&lru_entry->lru);
> > +       zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > +       tree = zswap_trees[swp_type(zhdr->swpentry)];
> > +       zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > +       /*
> > +        * Once the pool lock is dropped, the lru_entry might get freed. The
>
> Nit: lru lock*
>
> > +        * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> > +        * until the entry is verified to still be alive in the tree.
> > +        */
> > +       swpoffset = swp_offset(zhdr->swpentry);
> > +       spin_unlock(&pool->lru_lock);
> > +
> > +       /* hold a reference from tree so it won't be freed during writeback */
> > +       spin_lock(&tree->lock);
> > +       tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> > +       if (tree_entry != lru_entry) {
> > +               if (tree_entry)
> > +                       zswap_entry_put(tree, tree_entry);
> > +               spin_unlock(&tree->lock);
> > +               return -EAGAIN;
> > +       }
> > +       spin_unlock(&tree->lock);
> > +
> > +       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > +
> > +       spin_lock(&tree->lock);
> > +       if (ret) {
> > +               spin_lock(&pool->lru_lock);
> > +               list_move(&lru_entry->lru, &pool->lru);
> > +               spin_unlock(&pool->lru_lock);
> > +       }
> > +       zswap_entry_put(tree, tree_entry);
> > +       spin_unlock(&tree->lock);
> > +
> > +       return ret ? -EAGAIN : 0;
> > +}
> > +
> >  static void shrink_worker(struct work_struct *w)
> >  {
> >         struct zswap_pool *pool = container_of(w, typeof(*pool),
> >                                                 shrink_work);
> >         int ret, failures = 0;
> >
> > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> >         do {
> > -               ret = zpool_shrink(pool->zpool, 1, NULL);
> > +               if (zpool_evictable(pool->zpool))
> > +                       ret = zpool_shrink(pool->zpool, 1, NULL);
> > +               else
> > +                       ret = zswap_shrink(pool);
> >                 if (ret) {
> >                         zswap_reject_reclaim_fail++;
> >                         if (ret != -EAGAIN)
> > @@ -655,6 +728,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >          */
> >         kref_init(&pool->kref);
> >         INIT_LIST_HEAD(&pool->list);
> > +       INIT_LIST_HEAD(&pool->lru);
> > +       spin_lock_init(&pool->lru_lock);
> >         INIT_WORK(&pool->shrink_work, shrink_worker);
> >
> >         zswap_pool_debug("created", pool);
> > @@ -1270,7 +1345,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         }
> >
> >         /* store */
> > -       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> > +       hlen = sizeof(zhdr);
> >         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> >         if (zpool_malloc_support_movable(entry->pool->zpool))
> >                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> > @@ -1313,6 +1388,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >                         zswap_entry_put(tree, dupentry);
> >                 }
> >         } while (ret == -EEXIST);
> > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> > +               spin_lock(&entry->pool->lru_lock);
> > +               list_add(&entry->lru, &entry->pool->lru);
> > +               spin_unlock(&entry->pool->lru_lock);
> > +       }
> >         spin_unlock(&tree->lock);
> >
> >         /* update stats */
> > @@ -1384,8 +1465,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >         /* decompress */
> >         dlen = PAGE_SIZE;
> >         src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> > -       if (zpool_evictable(entry->pool->zpool))
> > -               src += sizeof(struct zswap_header);
> > +       src += sizeof(struct zswap_header);
> >
> >         if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> >                 memcpy(tmp, src, entry->length);
> > @@ -1415,6 +1495,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >  freeentry:
> >         spin_lock(&tree->lock);
> >         zswap_entry_put(tree, entry);
> > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> > +               spin_lock(&entry->pool->lru_lock);
> > +               list_move(&entry->lru, &entry->pool->lru);
> > +               spin_unlock(&entry->pool->lru_lock);
> > +       }
>
> It's not really this patch's fault, but when merged with commit
> fe1d1f7d0fb5 ("mm: zswap: support exclusive loads") from mm-unstable
> [1], and with CONFIG_ZSWAP_EXCLUSIVE_LOADS=y, this causes a crash.
>
> This happens because fe1d1f7d0fb5 makes the loads exclusive, so
> zswap_entry_put(tree, entry) above the added code causes the entry to
> be freed, then we go ahead and dereference multiple fields within it in
> the added chunk. Moving the chunk above zswap_entry_put() (and
> consequently also above zswap_invalidate_entry() from fe1d1f7d0fb5)
> makes this work correctly.
>
> Perhaps it would be useful to rebase on top of fe1d1f7d0fb5 for your
> next version(s), if any.

Will definitely rebase. I just now saw that you tested the suggested resolution
below; thanks, it does make sense.

>
> Maybe the outcome would be something like:
>
> zswap_entry_put(tree, entry);
> if (!ret && IS_ENABLED(CONFIG_ZSWAP_EXCLUSIVE_LOADS)) {
>         zswap_invalidate_entry(tree, entry);
> } else if (entry->length && !zpool_evictable(entry->pool->zpool)) {
>         spin_lock(&entry->pool->lru_lock);
>         list_move(&entry->lru, &entry->pool->lru);
>         spin_unlock(&entry->pool->lru_lock);
> }
>
> I am assuming if we are going to invalidate the entry anyway there is
> no need to move it to the front of the lru -- but I didn't really
> think it through.
>
> [1]https://lore.kernel.org/lkml/20230530210251.493194-1-yosryahmed@google.com/
>
> >         spin_unlock(&tree->lock);
> >
> >         return ret;
> > --
> > 2.34.1
> >
  
Yosry Ahmed June 7, 2023, 9:31 a.m. UTC | #3
On Wed, Jun 7, 2023 at 2:22 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> On Wed, Jun 7, 2023 at 10:14 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
> > <cerasuolodomenico@gmail.com> wrote:
> > >
> > > Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> > > function, which is called from zpool_shrink. However, with this commit,
> > > a unified shrink function is added to zswap. The ultimate goal is to
> > > eliminate the need for zpool_shrink once all zpool implementations have
> > > dropped their shrink code.
> > >
> > > To ensure the functionality of each commit, this change focuses solely
> > > on adding the mechanism itself. No modifications are made to
> > > the backends, meaning that functionally, there are no immediate changes.
> > > The zswap mechanism will only come into effect once the backends have
> > > removed their shrink code. The subsequent commits will address the
> > > modifications needed in the backends.
> > >
> > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > > ---
> > >  mm/zswap.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 91 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > index bcb82e09eb64..c99bafcefecf 100644
> > > --- a/mm/zswap.c
> > > +++ b/mm/zswap.c
> > > @@ -150,6 +150,12 @@ struct crypto_acomp_ctx {
> > >         struct mutex *mutex;
> > >  };
> > >
> > > +/*
> > > + * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
> > > + * The only case where lru_lock is not acquired while holding tree.lock is
> > > + * when a zswap_entry is taken off the lru for writeback, in that case it
> > > + * needs to be verified that it's still valid in the tree.
> > > + */
> > >  struct zswap_pool {
> > >         struct zpool *zpool;
> > >         struct crypto_acomp_ctx __percpu *acomp_ctx;
> > > @@ -159,6 +165,8 @@ struct zswap_pool {
> > >         struct work_struct shrink_work;
> > >         struct hlist_node node;
> > >         char tfm_name[CRYPTO_MAX_ALG_NAME];
> > > +       struct list_head lru;
> > > +       spinlock_t lru_lock;
> > >  };
> > >
> > >  /*
> > > @@ -176,10 +184,12 @@ struct zswap_pool {
> > >   *            be held while changing the refcount.  Since the lock must
> > >   *            be held, there is no reason to also make refcount atomic.
> > >   * length - the length in bytes of the compressed page data.  Needed during
> > > - *          decompression. For a same value filled page length is 0.
> > > + *          decompression. For a same value filled page length is 0, and both
> > > + *          pool and lru are invalid and must be ignored.
> > >   * pool - the zswap_pool the entry's data is in
> > >   * handle - zpool allocation handle that stores the compressed page data
> > >   * value - value of the same-value filled pages which have same content
> > > + * lru - handle to the pool's lru used to evict pages.
> > >   */
> > >  struct zswap_entry {
> > >         struct rb_node rbnode;
> > > @@ -192,6 +202,7 @@ struct zswap_entry {
> > >                 unsigned long value;
> > >         };
> > >         struct obj_cgroup *objcg;
> > > +       struct list_head lru;
> > >  };
> > >
> > >  struct zswap_header {
> > > @@ -364,6 +375,12 @@ static void zswap_free_entry(struct zswap_entry *entry)
> > >         if (!entry->length)
> > >                 atomic_dec(&zswap_same_filled_pages);
> > >         else {
> > > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > > +               if (!zpool_evictable(entry->pool->zpool)) {
> > > +                       spin_lock(&entry->pool->lru_lock);
> > > +                       list_del(&entry->lru);
> > > +                       spin_unlock(&entry->pool->lru_lock);
> > > +               }
> > >                 zpool_free(entry->pool->zpool, entry->handle);
> > >                 zswap_pool_put(entry->pool);
> > >         }
> > > @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> > >         return NULL;
> > >  }
> > >
> > > +static int zswap_shrink(struct zswap_pool *pool)
> >
> > Nit: rename to zswap_shrink_one() so that it's clear we always
> > writeback one entry per call?
>
> I named it like that to mirror zpool_shrink but I think that you've got a point
> in that it might not be very clear that it is shrinking by one page only.
> What about zswap_reclaim_entry? I'm not a native speaker, but with
> zswap_shrink_one it wouldn't be obvious to me that the "one" refers to an
> entry.

I am not a native speaker either, but zswap_reclaim_entry() sounds
much better :)

>
> >
> > > +{
> > > +       struct zswap_entry *lru_entry, *tree_entry = NULL;
> > > +       struct zswap_header *zhdr;
> > > +       struct zswap_tree *tree;
> > > +       int swpoffset;
> > > +       int ret;
> > > +
> > > +       /* get a reclaimable entry from LRU */
> > > +       spin_lock(&pool->lru_lock);
> > > +       if (list_empty(&pool->lru)) {
> > > +               spin_unlock(&pool->lru_lock);
> > > +               return -EINVAL;
> > > +       }
> > > +       lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > > +       list_del_init(&lru_entry->lru);
> > > +       zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > > +       tree = zswap_trees[swp_type(zhdr->swpentry)];
> > > +       zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > > +       /*
> > > +        * Once the pool lock is dropped, the lru_entry might get freed. The
> >
> > Nit: lru lock*
> >
> > > +        * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> > > +        * until the entry is verified to still be alive in the tree.
> > > +        */
> > > +       swpoffset = swp_offset(zhdr->swpentry);
> > > +       spin_unlock(&pool->lru_lock);
> > > +
> > > +       /* hold a reference from tree so it won't be freed during writeback */
> > > +       spin_lock(&tree->lock);
> > > +       tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> > > +       if (tree_entry != lru_entry) {
> > > +               if (tree_entry)
> > > +                       zswap_entry_put(tree, tree_entry);
> > > +               spin_unlock(&tree->lock);
> > > +               return -EAGAIN;
> > > +       }
> > > +       spin_unlock(&tree->lock);
> > > +
> > > +       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > > +
> > > +       spin_lock(&tree->lock);
> > > +       if (ret) {
> > > +               spin_lock(&pool->lru_lock);
> > > +               list_move(&lru_entry->lru, &pool->lru);
> > > +               spin_unlock(&pool->lru_lock);
> > > +       }
> > > +       zswap_entry_put(tree, tree_entry);
> > > +       spin_unlock(&tree->lock);
> > > +
> > > +       return ret ? -EAGAIN : 0;
> > > +}
> > > +
> > >  static void shrink_worker(struct work_struct *w)
> > >  {
> > >         struct zswap_pool *pool = container_of(w, typeof(*pool),
> > >                                                 shrink_work);
> > >         int ret, failures = 0;
> > >
> > > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > >         do {
> > > -               ret = zpool_shrink(pool->zpool, 1, NULL);
> > > +               if (zpool_evictable(pool->zpool))
> > > +                       ret = zpool_shrink(pool->zpool, 1, NULL);
> > > +               else
> > > +                       ret = zswap_shrink(pool);
> > >                 if (ret) {
> > >                         zswap_reject_reclaim_fail++;
> > >                         if (ret != -EAGAIN)
> > > @@ -655,6 +728,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> > >          */
> > >         kref_init(&pool->kref);
> > >         INIT_LIST_HEAD(&pool->list);
> > > +       INIT_LIST_HEAD(&pool->lru);
> > > +       spin_lock_init(&pool->lru_lock);
> > >         INIT_WORK(&pool->shrink_work, shrink_worker);
> > >
> > >         zswap_pool_debug("created", pool);
> > > @@ -1270,7 +1345,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> > >         }
> > >
> > >         /* store */
> > > -       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> > > +       hlen = sizeof(zhdr);
> > >         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> > >         if (zpool_malloc_support_movable(entry->pool->zpool))
> > >                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> > > @@ -1313,6 +1388,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> > >                         zswap_entry_put(tree, dupentry);
> > >                 }
> > >         } while (ret == -EEXIST);
> > > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > > +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> > > +               spin_lock(&entry->pool->lru_lock);
> > > +               list_add(&entry->lru, &entry->pool->lru);
> > > +               spin_unlock(&entry->pool->lru_lock);
> > > +       }
> > >         spin_unlock(&tree->lock);
> > >
> > >         /* update stats */
> > > @@ -1384,8 +1465,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> > >         /* decompress */
> > >         dlen = PAGE_SIZE;
> > >         src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> > > -       if (zpool_evictable(entry->pool->zpool))
> > > -               src += sizeof(struct zswap_header);
> > > +       src += sizeof(struct zswap_header);
> > >
> > >         if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> > >                 memcpy(tmp, src, entry->length);
> > > @@ -1415,6 +1495,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> > >  freeentry:
> > >         spin_lock(&tree->lock);
> > >         zswap_entry_put(tree, entry);
> > > +       /* zpool_evictable will be removed once all 3 backends have migrated */
> > > +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> > > +               spin_lock(&entry->pool->lru_lock);
> > > +               list_move(&entry->lru, &entry->pool->lru);
> > > +               spin_unlock(&entry->pool->lru_lock);
> > > +       }
> >
> > It's not really this patch's fault, but when merged with commit
> > fe1d1f7d0fb5 ("mm: zswap: support exclusive loads") from mm-unstable
> > [1], and with CONFIG_ZSWAP_EXCLUSIVE_LOADS=y, this causes a crash.
> >
> > This happens because fe1d1f7d0fb5 makes the loads exclusive, so
> > zswap_entry_put(tree, entry) above the added code causes the entry to
> > be freed, then we go ahead and dereference multiple fields within it in
> > the added chunk. Moving the chunk above zswap_entry_put() (and
> > consequently also above zswap_invalidate_entry() from fe1d1f7d0fb5)
> > makes this work correctly.
> >
> > Perhaps it would be useful to rebase on top of fe1d1f7d0fb5 for your
> > next version(s), if any.
>
> Will definitely rebase. I just now saw that you tested the suggested resolution
> below; thanks, it does make sense.
>
> >
> > Maybe the outcome would be something like:
> >
> > zswap_entry_put(tree, entry);
> > if (!ret && IS_ENABLED(CONFIG_ZSWAP_EXCLUSIVE_LOADS)) {
> >         zswap_invalidate_entry(tree, entry);
> > } else if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> >         spin_lock(&entry->pool->lru_lock);
> >         list_move(&entry->lru, &entry->pool->lru);
> >         spin_unlock(&entry->pool->lru_lock);
> > }
> >
> > I am assuming if we are going to invalidate the entry anyway there is
> > no need to move it to the front of the lru -- but I didn't really
> > think it through.
> >
> > [1]https://lore.kernel.org/lkml/20230530210251.493194-1-yosryahmed@google.com/
> >
> > >         spin_unlock(&tree->lock);
> > >
> > >         return ret;
> > > --
> > > 2.34.1
> > >
  
Nhat Pham June 7, 2023, 9:39 p.m. UTC | #4
On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> function, which is called from zpool_shrink. However, with this commit,
> a unified shrink function is added to zswap. The ultimate goal is to
> eliminate the need for zpool_shrink once all zpool implementations have
> dropped their shrink code.
>
> To ensure the functionality of each commit, this change focuses solely
> on adding the mechanism itself. No modifications are made to
> the backends, meaning that functionally, there are no immediate changes.
> The zswap mechanism will only come into effect once the backends have
> removed their shrink code. The subsequent commits will address the
> modifications needed in the backends.
>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> ---
>  mm/zswap.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 91 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index bcb82e09eb64..c99bafcefecf 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -150,6 +150,12 @@ struct crypto_acomp_ctx {
>         struct mutex *mutex;
>  };
>
> +/*
> + * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
> + * The only case where lru_lock is not acquired while holding tree.lock is
> + * when a zswap_entry is taken off the lru for writeback, in that case it
> + * needs to be verified that it's still valid in the tree.
> + */
>  struct zswap_pool {
>         struct zpool *zpool;
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
> @@ -159,6 +165,8 @@ struct zswap_pool {
>         struct work_struct shrink_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +       struct list_head lru;
> +       spinlock_t lru_lock;
>  };
>
>  /*
> @@ -176,10 +184,12 @@ struct zswap_pool {
>   *            be held while changing the refcount.  Since the lock must
>   *            be held, there is no reason to also make refcount atomic.
>   * length - the length in bytes of the compressed page data.  Needed during
> - *          decompression. For a same value filled page length is 0.
> + *          decompression. For a same value filled page length is 0, and both
> + *          pool and lru are invalid and must be ignored.
>   * pool - the zswap_pool the entry's data is in
>   * handle - zpool allocation handle that stores the compressed page data
>   * value - value of the same-value filled pages which have same content
> + * lru - handle to the pool's lru used to evict pages.
>   */
>  struct zswap_entry {
>         struct rb_node rbnode;
> @@ -192,6 +202,7 @@ struct zswap_entry {
>                 unsigned long value;
>         };
>         struct obj_cgroup *objcg;
> +       struct list_head lru;
>  };
>
>  struct zswap_header {
> @@ -364,6 +375,12 @@ static void zswap_free_entry(struct zswap_entry *entry)
>         if (!entry->length)
>                 atomic_dec(&zswap_same_filled_pages);
>         else {
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +               if (!zpool_evictable(entry->pool->zpool)) {
> +                       spin_lock(&entry->pool->lru_lock);
> +                       list_del(&entry->lru);
> +                       spin_unlock(&entry->pool->lru_lock);
> +               }
>                 zpool_free(entry->pool->zpool, entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
> @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>         return NULL;
>  }
>
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> +       struct zswap_entry *lru_entry, *tree_entry = NULL;
> +       struct zswap_header *zhdr;
> +       struct zswap_tree *tree;
> +       int swpoffset;
> +       int ret;
> +
> +       /* get a reclaimable entry from LRU */
> +       spin_lock(&pool->lru_lock);
> +       if (list_empty(&pool->lru)) {
> +               spin_unlock(&pool->lru_lock);
> +               return -EINVAL;
> +       }
> +       lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> +       list_del_init(&lru_entry->lru);
> +       zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> +       tree = zswap_trees[swp_type(zhdr->swpentry)];
> +       zpool_unmap_handle(pool->zpool, lru_entry->handle);
> +       /*
> +        * Once the pool lock is dropped, the lru_entry might get freed. The
> +        * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> +        * until the entry is verified to still be alive in the tree.
> +        */
> +       swpoffset = swp_offset(zhdr->swpentry);
> +       spin_unlock(&pool->lru_lock);
> +
> +       /* hold a reference from tree so it won't be freed during writeback */
> +       spin_lock(&tree->lock);
> +       tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> +       if (tree_entry != lru_entry) {
> +               if (tree_entry)
> +                       zswap_entry_put(tree, tree_entry);
> +               spin_unlock(&tree->lock);
> +               return -EAGAIN;
> +       }
> +       spin_unlock(&tree->lock);
> +
> +       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> +       spin_lock(&tree->lock);
> +       if (ret) {
> +               spin_lock(&pool->lru_lock);
> +               list_move(&lru_entry->lru, &pool->lru);
> +               spin_unlock(&pool->lru_lock);
> +       }
> +       zswap_entry_put(tree, tree_entry);
> +       spin_unlock(&tree->lock);
> +
> +       return ret ? -EAGAIN : 0;
> +}
> +
>  static void shrink_worker(struct work_struct *w)
>  {
>         struct zswap_pool *pool = container_of(w, typeof(*pool),
>                                                 shrink_work);
>         int ret, failures = 0;
>
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
>         do {
> -               ret = zpool_shrink(pool->zpool, 1, NULL);
> +               if (zpool_evictable(pool->zpool))
> +                       ret = zpool_shrink(pool->zpool, 1, NULL);
> +               else
> +                       ret = zswap_shrink(pool);
>                 if (ret) {
>                         zswap_reject_reclaim_fail++;
>                         if (ret != -EAGAIN)
> @@ -655,6 +728,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>          */
>         kref_init(&pool->kref);
>         INIT_LIST_HEAD(&pool->list);
> +       INIT_LIST_HEAD(&pool->lru);
> +       spin_lock_init(&pool->lru_lock);
>         INIT_WORK(&pool->shrink_work, shrink_worker);
>
>         zswap_pool_debug("created", pool);
> @@ -1270,7 +1345,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* store */
> -       hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
> +       hlen = sizeof(zhdr);
>         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
>         if (zpool_malloc_support_movable(entry->pool->zpool))
>                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> @@ -1313,6 +1388,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>                         zswap_entry_put(tree, dupentry);
>                 }
>         } while (ret == -EEXIST);
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> +               spin_lock(&entry->pool->lru_lock);
> +               list_add(&entry->lru, &entry->pool->lru);
> +               spin_unlock(&entry->pool->lru_lock);
> +       }
>         spin_unlock(&tree->lock);
>
>         /* update stats */
> @@ -1384,8 +1465,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         /* decompress */
>         dlen = PAGE_SIZE;
>         src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> -       if (zpool_evictable(entry->pool->zpool))
> -               src += sizeof(struct zswap_header);
> +       src += sizeof(struct zswap_header);
>
>         if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
>                 memcpy(tmp, src, entry->length);
> @@ -1415,6 +1495,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>  freeentry:
>         spin_lock(&tree->lock);
>         zswap_entry_put(tree, entry);
> +       /* zpool_evictable will be removed once all 3 backends have migrated */
> +       if (entry->length && !zpool_evictable(entry->pool->zpool)) {
> +               spin_lock(&entry->pool->lru_lock);
> +               list_move(&entry->lru, &entry->pool->lru);
> +               spin_unlock(&entry->pool->lru_lock);
> +       }
>         spin_unlock(&tree->lock);
>
>         return ret;
> --
> 2.34.1
>

Looks real solid to me! Thanks, Domenico.
Acked-by: Nhat Pham <nphamcs@gmail.com>
  
Johannes Weiner June 8, 2023, 3:58 p.m. UTC | #5
Hi Domenico,

Thanks for incorporating the feedback. Just two more nits:

On Tue, Jun 06, 2023 at 04:56:05PM +0200, Domenico Cerasuolo wrote:
> Each zpool driver (zbud, z3fold and zsmalloc) implements its own shrink
> function, which is called from zpool_shrink. However, with this commit,
> a unified shrink function is added to zswap. The ultimate goal is to
> eliminate the need for zpool_shrink once all zpool implementations have
> dropped their shrink code.
> 
> To ensure the functionality of each commit, this change focuses solely
> on adding the mechanism itself. No modifications are made to
> the backends, meaning that functionally, there are no immediate changes.
> The zswap mechanism will only come into effect once the backends have
> removed their shrink code. The subsequent commits will address the
> modifications needed in the backends.
> 
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> @@ -364,6 +375,12 @@ static void zswap_free_entry(struct zswap_entry *entry)
>  	if (!entry->length)
>  		atomic_dec(&zswap_same_filled_pages);
>  	else {
> +	/* zpool_evictable will be removed once all 3 backends have migrated */
> +		if (!zpool_evictable(entry->pool->zpool)) {

Comment indentation is off.

> +			spin_lock(&entry->pool->lru_lock);
> +			list_del(&entry->lru);
> +			spin_unlock(&entry->pool->lru_lock);
> +		}
>  		zpool_free(entry->pool->zpool, entry->handle);
>  		zswap_pool_put(entry->pool);
>  	}
> @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>  	return NULL;
>  }
>  
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> +	struct zswap_entry *lru_entry, *tree_entry = NULL;
> +	struct zswap_header *zhdr;
> +	struct zswap_tree *tree;
> +	int swpoffset;

pgoff_t

With that and what Yosry pointed out fixed, please feel free to add

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

to the next version.
  
Johannes Weiner June 8, 2023, 4:52 p.m. UTC | #6
On Tue, Jun 06, 2023 at 04:56:05PM +0200, Domenico Cerasuolo wrote:
> @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>  	return NULL;
>  }
>  
> +static int zswap_shrink(struct zswap_pool *pool)
> +{
> +	struct zswap_entry *lru_entry, *tree_entry = NULL;
> +	struct zswap_header *zhdr;
> +	struct zswap_tree *tree;
> +	int swpoffset;
> +	int ret;
> +
> +	/* get a reclaimable entry from LRU */
> +	spin_lock(&pool->lru_lock);
> +	if (list_empty(&pool->lru)) {
> +		spin_unlock(&pool->lru_lock);
> +		return -EINVAL;
> +	}
> +	lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> +	list_del_init(&lru_entry->lru);
> +	zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> +	tree = zswap_trees[swp_type(zhdr->swpentry)];
> +	zpool_unmap_handle(pool->zpool, lru_entry->handle);
> +	/*
> +	 * Once the pool lock is dropped, the lru_entry might get freed. The
> +	 * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> +	 * until the entry is verified to still be alive in the tree.
> +	 */
> +	swpoffset = swp_offset(zhdr->swpentry);
> +	spin_unlock(&pool->lru_lock);
> +
> +	/* hold a reference from tree so it won't be freed during writeback */
> +	spin_lock(&tree->lock);
> +	tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> +	if (tree_entry != lru_entry) {
> +		if (tree_entry)
> +			zswap_entry_put(tree, tree_entry);
> +		spin_unlock(&tree->lock);
> +		return -EAGAIN;
> +	}
> +	spin_unlock(&tree->lock);
> +
> +	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +
> +	spin_lock(&tree->lock);
> +	if (ret) {
> +		spin_lock(&pool->lru_lock);
> +		list_move(&lru_entry->lru, &pool->lru);
> +		spin_unlock(&pool->lru_lock);
> +	}
> +	zswap_entry_put(tree, tree_entry);

On re-reading this, I find the lru_entry vs tree_entry distinction
unnecessarily complicated. Once it's known that the thing coming off
the LRU is the same thing as in the tree, there is only "the entry".

How about 'entry' and 'tree_entry', and after validation use 'entry'
throughout the rest of the function?
  
Johannes Weiner June 8, 2023, 5:04 p.m. UTC | #7
On Thu, Jun 08, 2023 at 12:52:51PM -0400, Johannes Weiner wrote:
> On Tue, Jun 06, 2023 at 04:56:05PM +0200, Domenico Cerasuolo wrote:
> > @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> >  	return NULL;
> >  }
> >  
> > +static int zswap_shrink(struct zswap_pool *pool)
> > +{
> > +	struct zswap_entry *lru_entry, *tree_entry = NULL;
> > +	struct zswap_header *zhdr;
> > +	struct zswap_tree *tree;
> > +	int swpoffset;
> > +	int ret;
> > +
> > +	/* get a reclaimable entry from LRU */
> > +	spin_lock(&pool->lru_lock);
> > +	if (list_empty(&pool->lru)) {
> > +		spin_unlock(&pool->lru_lock);
> > +		return -EINVAL;
> > +	}
> > +	lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > +	list_del_init(&lru_entry->lru);
> > +	zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > +	tree = zswap_trees[swp_type(zhdr->swpentry)];
> > +	zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > +	/*
> > +	 * Once the pool lock is dropped, the lru_entry might get freed. The
> > +	 * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> > +	 * until the entry is verified to still be alive in the tree.
> > +	 */
> > +	swpoffset = swp_offset(zhdr->swpentry);
> > +	spin_unlock(&pool->lru_lock);
> > +
> > +	/* hold a reference from tree so it won't be freed during writeback */
> > +	spin_lock(&tree->lock);
> > +	tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> > +	if (tree_entry != lru_entry) {
> > +		if (tree_entry)
> > +			zswap_entry_put(tree, tree_entry);
> > +		spin_unlock(&tree->lock);
> > +		return -EAGAIN;
> > +	}
> > +	spin_unlock(&tree->lock);
> > +
> > +	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > +
> > +	spin_lock(&tree->lock);
> > +	if (ret) {
> > +		spin_lock(&pool->lru_lock);
> > +		list_move(&lru_entry->lru, &pool->lru);
> > +		spin_unlock(&pool->lru_lock);
> > +	}
> > +	zswap_entry_put(tree, tree_entry);
> 
> On re-reading this, I find the lru_entry vs tree_entry distinction
> unnecessarily complicated. Once it's known that the thing coming off
> the LRU is the same thing as in the tree, there is only "the entry".
> 
> How about 'entry' and 'tree_entry', and after validation use 'entry'
> throughout the rest of the function?

Even better, save the tree_entry entirely by getting the reference
from the LRU already, and then just search the tree for a match:

	/* Get an entry off the LRU */
	spin_lock(&pool->lru_lock);
	entry = list_last_entry();
	list_del(&entry->lru);
	zswap_entry_get(entry);
	spin_unlock(&pool->lru_lock);

	/* Check for invalidate() race */
	spin_lock(&tree->lock);
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
		ret = -EAGAIN;
		goto put_unlock;
	}
	spin_unlock(&tree->lock);

	ret = zswap_writeback_entry();

	spin_lock(&tree->lock);
	if (ret) {
		put_back_on_lru();
		goto put_unlock;
	}

	/* Check for invalidate() race */
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
		goto put_unlock;

	/* Drop base reference */
	zswap_entry_put(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return ret ? -EAGAIN : 0;
  
Johannes Weiner June 8, 2023, 6:45 p.m. UTC | #8
On Thu, Jun 08, 2023 at 01:05:00PM -0400, Johannes Weiner wrote:
> On Thu, Jun 08, 2023 at 12:52:51PM -0400, Johannes Weiner wrote:
> > On Tue, Jun 06, 2023 at 04:56:05PM +0200, Domenico Cerasuolo wrote:
> > > @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> > >  	return NULL;
> > >  }
> > >  
> > > +static int zswap_shrink(struct zswap_pool *pool)
> > > +{
> > > +	struct zswap_entry *lru_entry, *tree_entry = NULL;
> > > +	struct zswap_header *zhdr;
> > > +	struct zswap_tree *tree;
> > > +	int swpoffset;
> > > +	int ret;
> > > +
> > > +	/* get a reclaimable entry from LRU */
> > > +	spin_lock(&pool->lru_lock);
> > > +	if (list_empty(&pool->lru)) {
> > > +		spin_unlock(&pool->lru_lock);
> > > +		return -EINVAL;
> > > +	}
> > > +	lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > > +	list_del_init(&lru_entry->lru);
> > > +	zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > > +	tree = zswap_trees[swp_type(zhdr->swpentry)];
> > > +	zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > > +	/*
> > > +	 * Once the pool lock is dropped, the lru_entry might get freed. The
> > > +	 * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> > > +	 * until the entry is verified to still be alive in the tree.
> > > +	 */
> > > +	swpoffset = swp_offset(zhdr->swpentry);
> > > +	spin_unlock(&pool->lru_lock);
> > > +
> > > +	/* hold a reference from tree so it won't be freed during writeback */
> > > +	spin_lock(&tree->lock);
> > > +	tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> > > +	if (tree_entry != lru_entry) {
> > > +		if (tree_entry)
> > > +			zswap_entry_put(tree, tree_entry);
> > > +		spin_unlock(&tree->lock);
> > > +		return -EAGAIN;
> > > +	}
> > > +	spin_unlock(&tree->lock);
> > > +
> > > +	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > > +
> > > +	spin_lock(&tree->lock);
> > > +	if (ret) {
> > > +		spin_lock(&pool->lru_lock);
> > > +		list_move(&lru_entry->lru, &pool->lru);
> > > +		spin_unlock(&pool->lru_lock);
> > > +	}
> > > +	zswap_entry_put(tree, tree_entry);
> > 
> > On re-reading this, I find the lru_entry vs tree_entry distinction
> > unnecessarily complicated. Once it's known that the thing coming off
> > the LRU is the same thing as in the tree, there is only "the entry".
> > 
> > How about 'entry' and 'tree_entry', and after validation use 'entry'
> > throughout the rest of the function?
> 
> Even better, save the tree_entry entirely by getting the reference
> from the LRU already, and then just search the tree for a match:
> 
> 	/* Get an entry off the LRU */
> 	spin_lock(&pool->lru_lock);
> 	entry = list_last_entry();
> 	list_del(&entry->lru);
> 	zswap_entry_get(entry);
> 	spin_unlock(&pool->lru_lock);
> 
> 	/* Check for invalidate() race */
> 	spin_lock(&tree->lock);
> 	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> 		ret = -EAGAIN;
> 		goto put_unlock;
> 	}
> 	spin_unlock(&tree->lock);

Eh, brainfart. It needs the tree lock to bump the ref, of course.

But this should work, right?

 	/* Check for invalidate() race */
 	spin_lock(&tree->lock);
 	if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
 		ret = -EAGAIN;
 		goto unlock;
 	}
	zswap_entry_get(entry);
 	spin_unlock(&tree->lock);
  
Domenico Cerasuolo June 9, 2023, 8:39 a.m. UTC | #9
On Thu, Jun 8, 2023 at 8:45 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Thu, Jun 08, 2023 at 01:05:00PM -0400, Johannes Weiner wrote:
> > On Thu, Jun 08, 2023 at 12:52:51PM -0400, Johannes Weiner wrote:
> > > On Tue, Jun 06, 2023 at 04:56:05PM +0200, Domenico Cerasuolo wrote:
> > > > @@ -584,14 +601,70 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> > > >   return NULL;
> > > >  }
> > > >
> > > > +static int zswap_shrink(struct zswap_pool *pool)
> > > > +{
> > > > + struct zswap_entry *lru_entry, *tree_entry = NULL;
> > > > + struct zswap_header *zhdr;
> > > > + struct zswap_tree *tree;
> > > > + int swpoffset;
> > > > + int ret;
> > > > +
> > > > + /* get a reclaimable entry from LRU */
> > > > + spin_lock(&pool->lru_lock);
> > > > + if (list_empty(&pool->lru)) {
> > > > +         spin_unlock(&pool->lru_lock);
> > > > +         return -EINVAL;
> > > > + }
> > > > + lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
> > > > + list_del_init(&lru_entry->lru);
> > > > + zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
> > > > + tree = zswap_trees[swp_type(zhdr->swpentry)];
> > > > + zpool_unmap_handle(pool->zpool, lru_entry->handle);
> > > > + /*
> > > > +  * Once the pool lock is dropped, the lru_entry might get freed. The
> > > > +  * swpoffset is copied to the stack, and lru_entry isn't deref'd again
> > > > +  * until the entry is verified to still be alive in the tree.
> > > > +  */
> > > > + swpoffset = swp_offset(zhdr->swpentry);
> > > > + spin_unlock(&pool->lru_lock);
> > > > +
> > > > + /* hold a reference from tree so it won't be freed during writeback */
> > > > + spin_lock(&tree->lock);
> > > > + tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
> > > > + if (tree_entry != lru_entry) {
> > > > +         if (tree_entry)
> > > > +                 zswap_entry_put(tree, tree_entry);
> > > > +         spin_unlock(&tree->lock);
> > > > +         return -EAGAIN;
> > > > + }
> > > > + spin_unlock(&tree->lock);
> > > > +
> > > > + ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > > > +
> > > > + spin_lock(&tree->lock);
> > > > + if (ret) {
> > > > +         spin_lock(&pool->lru_lock);
> > > > +         list_move(&lru_entry->lru, &pool->lru);
> > > > +         spin_unlock(&pool->lru_lock);
> > > > + }
> > > > + zswap_entry_put(tree, tree_entry);
> > >
> > > On re-reading this, I find the lru_entry vs tree_entry distinction
> > > unnecessarily complicated. Once it's known that the thing coming off
> > > the LRU is the same thing as in the tree, there is only "the entry".
> > >
> > > How about 'entry' and 'tree_entry', and after validation use 'entry'
> > > throughout the rest of the function?
> >
> > Even better, save the tree_entry entirely by getting the reference
> > from the LRU already, and then just search the tree for a match:
> >
> >       /* Get an entry off the LRU */
> >       spin_lock(&pool->lru_lock);
> >       entry = list_last_entry();
> >       list_del(&entry->lru);
> >       zswap_entry_get(entry);
> >       spin_unlock(&pool->lru_lock);
> >
> >       /* Check for invalidate() race */
> >       spin_lock(&tree->lock);
> >       if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
> >               ret = -EAGAIN;
> >               goto put_unlock;
> >       }
> >       spin_unlock(&tree->lock);
>
> Eh, brainfart. It needs the tree lock to bump the ref, of course.
>
> But this should work, right?
>
>         /* Check for invalidate() race */
>         spin_lock(&tree->lock);
>         if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
>                 ret = -EAGAIN;
>                 goto unlock;
>         }
>         zswap_entry_get(entry);
>         spin_unlock(&tree->lock);

This should work indeed; it's much cleaner with just one local
zswap_entry. Will update!
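
For reference, a consolidated version along those lines might look
roughly like the sketch below. This is only an illustration that
combines the v2 zswap_shrink() from the patch with the single-entry
flow suggested above; it assumes the helpers (zswap_rb_search(),
zswap_entry_get()/zswap_entry_put(), zswap_writeback_entry()) keep
their current signatures:

	static int zswap_shrink(struct zswap_pool *pool)
	{
		struct zswap_entry *entry;
		struct zswap_header *zhdr;
		struct zswap_tree *tree;
		pgoff_t swpoffset;
		int ret;

		/* Get a reclaimable entry from the LRU */
		spin_lock(&pool->lru_lock);
		if (list_empty(&pool->lru)) {
			spin_unlock(&pool->lru_lock);
			return -EINVAL;
		}
		entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
		list_del_init(&entry->lru);
		zhdr = zpool_map_handle(pool->zpool, entry->handle, ZPOOL_MM_RO);
		tree = zswap_trees[swp_type(zhdr->swpentry)];
		swpoffset = swp_offset(zhdr->swpentry);
		zpool_unmap_handle(pool->zpool, entry->handle);
		/*
		 * Once the lru lock is dropped, the entry might get freed; the
		 * swpoffset was copied to the stack and the entry is not
		 * dereferenced again until it is verified to still be in the
		 * tree.
		 */
		spin_unlock(&pool->lru_lock);

		/* Check for invalidate() race */
		spin_lock(&tree->lock);
		if (entry != zswap_rb_search(&tree->rbroot, swpoffset)) {
			ret = -EAGAIN;
			goto unlock;
		}
		/* Hold a reference so the entry can't be freed during writeback */
		zswap_entry_get(entry);
		spin_unlock(&tree->lock);

		ret = zswap_writeback_entry(pool->zpool, entry->handle);

		spin_lock(&tree->lock);
		if (ret) {
			/* Writeback failed, put the entry back on the LRU */
			spin_lock(&pool->lru_lock);
			list_move(&entry->lru, &pool->lru);
			spin_unlock(&pool->lru_lock);
		}
		/* Drop the local reference taken above */
		zswap_entry_put(tree, entry);
	unlock:
		spin_unlock(&tree->lock);
		return ret ? -EAGAIN : 0;
	}

As in v2, a failed writeback rotates the entry back onto the LRU so it
can be retried later, and the function returns -EAGAIN so the shrink
worker counts it as a failure.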
  

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index bcb82e09eb64..c99bafcefecf 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -150,6 +150,12 @@  struct crypto_acomp_ctx {
 	struct mutex *mutex;
 };
 
+/*
+ * The lock ordering is zswap_tree.lock -> zswap_pool.lru_lock.
+ * The only case where lru_lock is not acquired while holding tree.lock is
+ * when a zswap_entry is taken off the lru for writeback, in that case it
+ * needs to be verified that it's still valid in the tree.
+ */
 struct zswap_pool {
 	struct zpool *zpool;
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
@@ -159,6 +165,8 @@  struct zswap_pool {
 	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
+	struct list_head lru;
+	spinlock_t lru_lock;
 };
 
 /*
@@ -176,10 +184,12 @@  struct zswap_pool {
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
  * length - the length in bytes of the compressed page data.  Needed during
- *          decompression. For a same value filled page length is 0.
+ *          decompression. For a same value filled page length is 0, and both
+ *          pool and lru are invalid and must be ignored.
  * pool - the zswap_pool the entry's data is in
  * handle - zpool allocation handle that stores the compressed page data
  * value - value of the same-value filled pages which have same content
+ * lru - handle to the pool's lru used to evict pages.
  */
 struct zswap_entry {
 	struct rb_node rbnode;
@@ -192,6 +202,7 @@  struct zswap_entry {
 		unsigned long value;
 	};
 	struct obj_cgroup *objcg;
+	struct list_head lru;
 };
 
 struct zswap_header {
@@ -364,6 +375,12 @@  static void zswap_free_entry(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
+		/* zpool_evictable will be removed once all 3 backends have migrated */
+		if (!zpool_evictable(entry->pool->zpool)) {
+			spin_lock(&entry->pool->lru_lock);
+			list_del(&entry->lru);
+			spin_unlock(&entry->pool->lru_lock);
+		}
 		zpool_free(entry->pool->zpool, entry->handle);
 		zswap_pool_put(entry->pool);
 	}
@@ -584,14 +601,70 @@  static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	return NULL;
 }
 
+static int zswap_shrink(struct zswap_pool *pool)
+{
+	struct zswap_entry *lru_entry, *tree_entry = NULL;
+	struct zswap_header *zhdr;
+	struct zswap_tree *tree;
+	int swpoffset;
+	int ret;
+
+	/* get a reclaimable entry from LRU */
+	spin_lock(&pool->lru_lock);
+	if (list_empty(&pool->lru)) {
+		spin_unlock(&pool->lru_lock);
+		return -EINVAL;
+	}
+	lru_entry = list_last_entry(&pool->lru, struct zswap_entry, lru);
+	list_del_init(&lru_entry->lru);
+	zhdr = zpool_map_handle(pool->zpool, lru_entry->handle, ZPOOL_MM_RO);
+	tree = zswap_trees[swp_type(zhdr->swpentry)];
+	zpool_unmap_handle(pool->zpool, lru_entry->handle);
+	/*
+	 * Once the pool lock is dropped, the lru_entry might get freed. The
+	 * swpoffset is copied to the stack, and lru_entry isn't deref'd again
+	 * until the entry is verified to still be alive in the tree.
+	 */
+	swpoffset = swp_offset(zhdr->swpentry);
+	spin_unlock(&pool->lru_lock);
+
+	/* hold a reference from tree so it won't be freed during writeback */
+	spin_lock(&tree->lock);
+	tree_entry = zswap_entry_find_get(&tree->rbroot, swpoffset);
+	if (tree_entry != lru_entry) {
+		if (tree_entry)
+			zswap_entry_put(tree, tree_entry);
+		spin_unlock(&tree->lock);
+		return -EAGAIN;
+	}
+	spin_unlock(&tree->lock);
+
+	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
+
+	spin_lock(&tree->lock);
+	if (ret) {
+		spin_lock(&pool->lru_lock);
+		list_move(&lru_entry->lru, &pool->lru);
+		spin_unlock(&pool->lru_lock);
+	}
+	zswap_entry_put(tree, tree_entry);
+	spin_unlock(&tree->lock);
+
+	return ret ? -EAGAIN : 0;
+}
+
 static void shrink_worker(struct work_struct *w)
 {
 	struct zswap_pool *pool = container_of(w, typeof(*pool),
 						shrink_work);
 	int ret, failures = 0;
 
+	/* zpool_evictable will be removed once all 3 backends have migrated */
 	do {
-		ret = zpool_shrink(pool->zpool, 1, NULL);
+		if (zpool_evictable(pool->zpool))
+			ret = zpool_shrink(pool->zpool, 1, NULL);
+		else
+			ret = zswap_shrink(pool);
 		if (ret) {
 			zswap_reject_reclaim_fail++;
 			if (ret != -EAGAIN)
@@ -655,6 +728,8 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
+	INIT_LIST_HEAD(&pool->lru);
+	spin_lock_init(&pool->lru_lock);
 	INIT_WORK(&pool->shrink_work, shrink_worker);
 
 	zswap_pool_debug("created", pool);
@@ -1270,7 +1345,7 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* store */
-	hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
+	hlen = sizeof(zhdr);
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
 	if (zpool_malloc_support_movable(entry->pool->zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
@@ -1313,6 +1388,12 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 			zswap_entry_put(tree, dupentry);
 		}
 	} while (ret == -EEXIST);
+	/* zpool_evictable will be removed once all 3 backends have migrated */
+	if (entry->length && !zpool_evictable(entry->pool->zpool)) {
+		spin_lock(&entry->pool->lru_lock);
+		list_add(&entry->lru, &entry->pool->lru);
+		spin_unlock(&entry->pool->lru_lock);
+	}
 	spin_unlock(&tree->lock);
 
 	/* update stats */
@@ -1384,8 +1465,7 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	/* decompress */
 	dlen = PAGE_SIZE;
 	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
-	if (zpool_evictable(entry->pool->zpool))
-		src += sizeof(struct zswap_header);
+	src += sizeof(struct zswap_header);
 
 	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
 		memcpy(tmp, src, entry->length);
@@ -1415,6 +1495,12 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 freeentry:
 	spin_lock(&tree->lock);
 	zswap_entry_put(tree, entry);
+	/* zpool_evictable will be removed once all 3 backends have migrated */
+	if (entry->length && !zpool_evictable(entry->pool->zpool)) {
+		spin_lock(&entry->pool->lru_lock);
+		list_move(&entry->lru, &entry->pool->lru);
+		spin_unlock(&entry->pool->lru_lock);
+	}
 	spin_unlock(&tree->lock);
 
 	return ret;