[RFC,v2,6/7] mm: zswap: simplify writeback function

Message ID 20230606145611.704392-7-cerasuolodomenico@gmail.com
State New
Series mm: zswap: move writeback LRU from zpool to zswap

Commit Message

Domenico Cerasuolo June 6, 2023, 2:56 p.m. UTC
  Previously, in zswap, the writeback function was passed to zpool drivers
for their usage in calling the writeback operation. However, since the
drivers did not possess references to entries and the function was
specifically designed to work with handles, the writeback process has
been modified to occur directly within zswap. Consequently, this change
allows for some simplification of the writeback function, taking into
account the updated workflow.

Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
 mm/zswap.c | 69 ++++++++++++++----------------------------------------
 1 file changed, 17 insertions(+), 52 deletions(-)
  

Comments

Yosry Ahmed June 7, 2023, 9:26 a.m. UTC | #1
On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> Previously, in zswap, the writeback function was passed to zpool drivers
> for their usage in calling the writeback operation. However, since the
> drivers did not possess references to entries and the function was
> specifically designed to work with handles, the writeback process has
> been modified to occur directly within zswap. Consequently, this change
> allows for some simplification of the writeback function, taking into
> account the updated workflow.
>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> ---
>  mm/zswap.c | 69 ++++++++++++++----------------------------------------
>  1 file changed, 17 insertions(+), 52 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 2831bf56b168..ef8604812352 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -250,7 +250,8 @@ static bool zswap_has_pool;
>         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
>                  zpool_get_type((p)->zpool))
>
> -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
> +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> +                                struct zswap_tree *tree);
>  static int zswap_pool_get(struct zswap_pool *pool);
>  static void zswap_pool_put(struct zswap_pool *pool);
>
> @@ -632,15 +633,21 @@ static int zswap_shrink(struct zswap_pool *pool)
>         }
>         spin_unlock(&tree->lock);
>
> -       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +       ret = zswap_writeback_entry(lru_entry, zhdr, tree);
>
>         spin_lock(&tree->lock);
>         if (ret) {
>                 spin_lock(&pool->lru_lock);
>                 list_move(&lru_entry->lru, &pool->lru);
>                 spin_unlock(&pool->lru_lock);
> +               zswap_entry_put(tree, tree_entry);
> +       } else {
> +               /* free the local reference */
> +               zswap_entry_put(tree, tree_entry);
> +               /* free the entry if it's not been invalidated*/
> +               if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
> +                       zswap_entry_put(tree, tree_entry);

The comment that was here about the 2 possible cases was useful imo,
maybe keep it?

Also, I am not sure why we need to do a tree search vs. just reading
the refcount here before the first put. We can even make
zswap_entry_put() return the refcount after the put to know if we need
the additional put or not.

Can anyone think of any reason why we need to explicitly search the tree here?
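
Something like this (an untested sketch, reusing the existing refcount
field and helpers):

	static int zswap_entry_put(struct zswap_tree *tree,
				   struct zswap_entry *entry)
	{
		int refcount = --entry->refcount;

		BUG_ON(refcount < 0);
		if (refcount == 0) {
			zswap_rb_erase(&tree->rbroot, entry);
			zswap_free_entry(entry);
		}
		return refcount;
	}

Then zswap_shrink() would only issue the second put when the first one
returns a positive count, instead of searching the tree.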

>         }
> -       zswap_entry_put(tree, tree_entry);
>         spin_unlock(&tree->lock);
>
>         return ret ? -EAGAIN : 0;
> @@ -1039,16 +1046,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
>   * the swap cache, the compressed version stored by zswap can be
>   * freed.
>   */
> -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> +                                struct zswap_tree *tree)
>  {
> -       struct zswap_header *zhdr;
> -       swp_entry_t swpentry;
> -       struct zswap_tree *tree;
> -       pgoff_t offset;
> -       struct zswap_entry *entry;
> +       swp_entry_t swpentry = zhdr->swpentry;
>         struct page *page;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> +       struct zpool *pool = entry->pool->zpool;
>
>         u8 *src, *tmp = NULL;
>         unsigned int dlen;
> @@ -1063,25 +1068,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>                         return -ENOMEM;
>         }
>
> -       /* extract swpentry from data */
> -       zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> -       swpentry = zhdr->swpentry; /* here */
> -       tree = zswap_trees[swp_type(swpentry)];
> -       offset = swp_offset(swpentry);
> -       zpool_unmap_handle(pool, handle);
> -
> -       /* find and ref zswap entry */
> -       spin_lock(&tree->lock);
> -       entry = zswap_entry_find_get(&tree->rbroot, offset);
> -       if (!entry) {
> -               /* entry was invalidated */
> -               spin_unlock(&tree->lock);
> -               kfree(tmp);
> -               return 0;
> -       }
> -       spin_unlock(&tree->lock);
> -       BUG_ON(offset != entry->offset);
> -
>         /* try to allocate swap cache page */
>         switch (zswap_get_swap_cache_page(swpentry, &page)) {
>         case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
> @@ -1115,12 +1101,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>                 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
>                 dlen = PAGE_SIZE;
>
> -               zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> +               zhdr = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
>                 src = (u8 *)zhdr + sizeof(struct zswap_header);
>                 if (!zpool_can_sleep_mapped(pool)) {
>                         memcpy(tmp, src, entry->length);
>                         src = tmp;
> -                       zpool_unmap_handle(pool, handle);
> +                       zpool_unmap_handle(pool, entry->handle);
>                 }
>
>                 mutex_lock(acomp_ctx->mutex);
> @@ -1135,7 +1121,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>                 if (!zpool_can_sleep_mapped(pool))
>                         kfree(tmp);
>                 else
> -                       zpool_unmap_handle(pool, handle);
> +                       zpool_unmap_handle(pool, entry->handle);
>
>                 BUG_ON(ret);
>                 BUG_ON(dlen != PAGE_SIZE);
> @@ -1152,23 +1138,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>         put_page(page);
>         zswap_written_back_pages++;
>
> -       spin_lock(&tree->lock);
> -       /* drop local reference */
> -       zswap_entry_put(tree, entry);
> -
> -       /*
> -       * There are two possible situations for entry here:
> -       * (1) refcount is 1(normal case),  entry is valid and on the tree
> -       * (2) refcount is 0, entry is freed and not on the tree
> -       *     because invalidate happened during writeback
> -       *  search the tree and free the entry if find entry
> -       */
> -       if (entry == zswap_rb_search(&tree->rbroot, offset))
> -               zswap_entry_put(tree, entry);
> -       spin_unlock(&tree->lock);
> -
>         return ret;
> -
>  fail:
>         if (!zpool_can_sleep_mapped(pool))
>                 kfree(tmp);
> @@ -1177,13 +1147,8 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
>         * if we get here due to ZSWAP_SWAPCACHE_EXIST
>         * a load may be happening concurrently.
>         * it is safe and okay to not free the entry.
> -       * if we free the entry in the following put
>         * it is also okay to return !0
>         */
> -       spin_lock(&tree->lock);
> -       zswap_entry_put(tree, entry);
> -       spin_unlock(&tree->lock);
> -
>         return ret;
>  }
>
> --
> 2.34.1
>
  
Johannes Weiner June 8, 2023, 4:48 p.m. UTC | #2
Hi Domenico,

On Tue, Jun 06, 2023 at 04:56:10PM +0200, Domenico Cerasuolo wrote:
> Previously, in zswap, the writeback function was passed to zpool drivers
> for their usage in calling the writeback operation. However, since the
> drivers did not possess references to entries and the function was
> specifically designed to work with handles, the writeback process has
> been modified to occur directly within zswap.

I'm having trouble parsing this sentence.

> Consequently, this change allows for some simplification of the
> writeback function, taking into account the updated workflow.

How about:

zswap_writeback_entry() used to be a callback for the backends, which
don't know about struct zswap_entry.

Now that the only user is the generic zswap LRU reclaimer, it can be
simplified: pass the pinned zswap_entry directly, and consolidate the
refcount management in the shrink function.

> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> ---
>  mm/zswap.c | 69 ++++++++++++++----------------------------------------
>  1 file changed, 17 insertions(+), 52 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 2831bf56b168..ef8604812352 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -250,7 +250,8 @@ static bool zswap_has_pool;
>  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
>  		 zpool_get_type((p)->zpool))
>  
> -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
> +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> +				 struct zswap_tree *tree);
>  static int zswap_pool_get(struct zswap_pool *pool);
>  static void zswap_pool_put(struct zswap_pool *pool);
>  
> @@ -632,15 +633,21 @@ static int zswap_shrink(struct zswap_pool *pool)
>  	}
>  	spin_unlock(&tree->lock);
>  
> -	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> +	ret = zswap_writeback_entry(lru_entry, zhdr, tree);
>  
>  	spin_lock(&tree->lock);
>  	if (ret) {
>  		spin_lock(&pool->lru_lock);
>  		list_move(&lru_entry->lru, &pool->lru);
>  		spin_unlock(&pool->lru_lock);

This could use a comment.

		/* Writeback failed, put entry back on LRU */

> +		zswap_entry_put(tree, tree_entry);

This put is a common factor in both branches, so you can consolidate.

> +	} else {
> +		/* free the local reference */
> +		zswap_entry_put(tree, tree_entry);
> +		/* free the entry if it's not been invalidated*/

Missing space between text and */

> +		if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
> +			zswap_entry_put(tree, tree_entry);
>  	}
> -	zswap_entry_put(tree, tree_entry);
>  	spin_unlock(&tree->lock);

The success path freeing (hopefully the common path) is now
unfortunately hidden in fairly deep indentation. I think this would be
better with a goto for the error case.

All together, something like this?

	if (ret) {
		/* Writeback failed, put entry back on LRU */
		...
		goto put_unlock;
	}

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the base ref from the tree - unless invalidate
	 * already took it out while we had the tree->lock released for IO.
	 */
	if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
		zswap_entry_put(tree, lru_entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, tree_entry);
	spin_unlock(&tree->lock);

	return ret ? -EAGAIN : 0;

Btw, it's unsettling that we drop the tree reference without
explicitly removing the entry from the tree. We rely on the final put
that frees the entry to do tree removal, but this doesn't happen if
somebody else is holding a reference; the entry can remain in the tree
long after we've officially handed over ownership of the data to
swapcache. TTBOMK there are currently no bugs because of that, but
it's a data corruption waiting to happen.

This should be:

	/*
	 * Writeback started successfully, the page now belongs to the
	 * swapcache. Drop the entry from zswap - unless invalidate already
	 * took it out while we had the tree->lock released for IO.
	 */
	if (entry == zswap_rb_search(&tree->rbroot, swpoffset))
		zswap_invalidate_entry(tree, entry);

Would you care to send a follow-up patch?
  
Domenico Cerasuolo June 9, 2023, 10:23 a.m. UTC | #3
On Wed, Jun 7, 2023 at 11:27 AM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
> <cerasuolodomenico@gmail.com> wrote:
> >
> > Previously, in zswap, the writeback function was passed to zpool drivers
> > for their usage in calling the writeback operation. However, since the
> > drivers did not possess references to entries and the function was
> > specifically designed to work with handles, the writeback process has
> > been modified to occur directly within zswap. Consequently, this change
> > allows for some simplification of the writeback function, taking into
> > account the updated workflow.
> >
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > ---
> >  mm/zswap.c | 69 ++++++++++++++----------------------------------------
> >  1 file changed, 17 insertions(+), 52 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 2831bf56b168..ef8604812352 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -250,7 +250,8 @@ static bool zswap_has_pool;
> >         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> >                  zpool_get_type((p)->zpool))
> >
> > -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
> > +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> > +                                struct zswap_tree *tree);
> >  static int zswap_pool_get(struct zswap_pool *pool);
> >  static void zswap_pool_put(struct zswap_pool *pool);
> >
> > @@ -632,15 +633,21 @@ static int zswap_shrink(struct zswap_pool *pool)
> >         }
> >         spin_unlock(&tree->lock);
> >
> > -       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > +       ret = zswap_writeback_entry(lru_entry, zhdr, tree);
> >
> >         spin_lock(&tree->lock);
> >         if (ret) {
> >                 spin_lock(&pool->lru_lock);
> >                 list_move(&lru_entry->lru, &pool->lru);
> >                 spin_unlock(&pool->lru_lock);
> > +               zswap_entry_put(tree, tree_entry);
> > +       } else {
> > +               /* free the local reference */
> > +               zswap_entry_put(tree, tree_entry);
> > +               /* free the entry if it's not been invalidated*/
> > +               if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
> > +                       zswap_entry_put(tree, tree_entry);
>
> The comment that was here about the 2 possible cases was useful imo,
> maybe keep it?

I honestly found it brittle in that we're not interested in the refcount there,
but rather in releasing the base reference that keeps the entry valid.
There's no way to know which refcount value it should be, though: consider
that throughout this series the values can be 1 or 0, but also 2 or 1,
depending on where this function is called (zpool_shrink or zswap_shrink).

>
> Also, I am not sure why we need to do a tree search vs. just reading
> the refcount here before the first put. We can even make
> zswap_entry_put() return the refcount after the put to know if we need
> the additional put or not.
>
> Can anyone think of any reason why we need to explicitly search the tree here?

I think that the reasoning here is that refcount > 0 doesn't necessarily mean
that the entry is on the tree. I'm not sure if reading refcount directly here
would cause an issue now, probably not, but I wonder if bugs could be
introduced if the context in which this function is called changes.
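
As an illustration (a hypothetical interleaving, not a bug we've hit),
suppose a load is in flight while writeback has the tree lock dropped:

	/*
	 *   shrinker               invalidate             load
	 *   --------               ----------             ----
	 *   ref (local)
	 *   unlock, do IO
	 *                                                  ref (local)
	 *                          rb_erase()
	 *                          put (base ref)
	 *   lock
	 */

At this point refcount is still 2, but the entry is already off the
tree; a second put issued purely because refcount > 0 would drop the
reference the load still owns. The rb search only drops the base
reference when the entry is demonstrably still on the tree.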

>
> >         }
> > -       zswap_entry_put(tree, tree_entry);
> >         spin_unlock(&tree->lock);
> >
> >         return ret ? -EAGAIN : 0;
> > @@ -1039,16 +1046,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
> >   * the swap cache, the compressed version stored by zswap can be
> >   * freed.
> >   */
> > -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> > +                                struct zswap_tree *tree)
> >  {
> > -       struct zswap_header *zhdr;
> > -       swp_entry_t swpentry;
> > -       struct zswap_tree *tree;
> > -       pgoff_t offset;
> > -       struct zswap_entry *entry;
> > +       swp_entry_t swpentry = zhdr->swpentry;
> >         struct page *page;
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> > +       struct zpool *pool = entry->pool->zpool;
> >
> >         u8 *src, *tmp = NULL;
> >         unsigned int dlen;
> > @@ -1063,25 +1068,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> >                         return -ENOMEM;
> >         }
> >
> > -       /* extract swpentry from data */
> > -       zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> > -       swpentry = zhdr->swpentry; /* here */
> > -       tree = zswap_trees[swp_type(swpentry)];
> > -       offset = swp_offset(swpentry);
> > -       zpool_unmap_handle(pool, handle);
> > -
> > -       /* find and ref zswap entry */
> > -       spin_lock(&tree->lock);
> > -       entry = zswap_entry_find_get(&tree->rbroot, offset);
> > -       if (!entry) {
> > -               /* entry was invalidated */
> > -               spin_unlock(&tree->lock);
> > -               kfree(tmp);
> > -               return 0;
> > -       }
> > -       spin_unlock(&tree->lock);
> > -       BUG_ON(offset != entry->offset);
> > -
> >         /* try to allocate swap cache page */
> >         switch (zswap_get_swap_cache_page(swpentry, &page)) {
> >         case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
> > @@ -1115,12 +1101,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> >                 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> >                 dlen = PAGE_SIZE;
> >
> > -               zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> > +               zhdr = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
> >                 src = (u8 *)zhdr + sizeof(struct zswap_header);
> >                 if (!zpool_can_sleep_mapped(pool)) {
> >                         memcpy(tmp, src, entry->length);
> >                         src = tmp;
> > -                       zpool_unmap_handle(pool, handle);
> > +                       zpool_unmap_handle(pool, entry->handle);
> >                 }
> >
> >                 mutex_lock(acomp_ctx->mutex);
> > @@ -1135,7 +1121,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> >                 if (!zpool_can_sleep_mapped(pool))
> >                         kfree(tmp);
> >                 else
> > -                       zpool_unmap_handle(pool, handle);
> > +                       zpool_unmap_handle(pool, entry->handle);
> >
> >                 BUG_ON(ret);
> >                 BUG_ON(dlen != PAGE_SIZE);
> > @@ -1152,23 +1138,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> >         put_page(page);
> >         zswap_written_back_pages++;
> >
> > -       spin_lock(&tree->lock);
> > -       /* drop local reference */
> > -       zswap_entry_put(tree, entry);
> > -
> > -       /*
> > -       * There are two possible situations for entry here:
> > -       * (1) refcount is 1(normal case),  entry is valid and on the tree
> > -       * (2) refcount is 0, entry is freed and not on the tree
> > -       *     because invalidate happened during writeback
> > -       *  search the tree and free the entry if find entry
> > -       */
> > -       if (entry == zswap_rb_search(&tree->rbroot, offset))
> > -               zswap_entry_put(tree, entry);
> > -       spin_unlock(&tree->lock);
> > -
> >         return ret;
> > -
> >  fail:
> >         if (!zpool_can_sleep_mapped(pool))
> >                 kfree(tmp);
> > @@ -1177,13 +1147,8 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> >         * if we get here due to ZSWAP_SWAPCACHE_EXIST
> >         * a load may be happening concurrently.
> >         * it is safe and okay to not free the entry.
> > -       * if we free the entry in the following put
> >         * it is also okay to return !0
> >         */
> > -       spin_lock(&tree->lock);
> > -       zswap_entry_put(tree, entry);
> > -       spin_unlock(&tree->lock);
> > -
> >         return ret;
> >  }
> >
> > --
> > 2.34.1
> >
  
Yosry Ahmed June 9, 2023, 11:01 a.m. UTC | #4
On Fri, Jun 9, 2023 at 3:23 AM Domenico Cerasuolo
<cerasuolodomenico@gmail.com> wrote:
>
> On Wed, Jun 7, 2023 at 11:27 AM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > On Tue, Jun 6, 2023 at 7:56 AM Domenico Cerasuolo
> > <cerasuolodomenico@gmail.com> wrote:
> > >
> > > Previously, in zswap, the writeback function was passed to zpool drivers
> > > for their usage in calling the writeback operation. However, since the
> > > drivers did not possess references to entries and the function was
> > > specifically designed to work with handles, the writeback process has
> > > been modified to occur directly within zswap. Consequently, this change
> > > allows for some simplification of the writeback function, taking into
> > > account the updated workflow.
> > >
> > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > > ---
> > >  mm/zswap.c | 69 ++++++++++++++----------------------------------------
> > >  1 file changed, 17 insertions(+), 52 deletions(-)
> > >
> > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > index 2831bf56b168..ef8604812352 100644
> > > --- a/mm/zswap.c
> > > +++ b/mm/zswap.c
> > > @@ -250,7 +250,8 @@ static bool zswap_has_pool;
> > >         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> > >                  zpool_get_type((p)->zpool))
> > >
> > > -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
> > > +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> > > +                                struct zswap_tree *tree);
> > >  static int zswap_pool_get(struct zswap_pool *pool);
> > >  static void zswap_pool_put(struct zswap_pool *pool);
> > >
> > > @@ -632,15 +633,21 @@ static int zswap_shrink(struct zswap_pool *pool)
> > >         }
> > >         spin_unlock(&tree->lock);
> > >
> > > -       ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > > +       ret = zswap_writeback_entry(lru_entry, zhdr, tree);
> > >
> > >         spin_lock(&tree->lock);
> > >         if (ret) {
> > >                 spin_lock(&pool->lru_lock);
> > >                 list_move(&lru_entry->lru, &pool->lru);
> > >                 spin_unlock(&pool->lru_lock);
> > > +               zswap_entry_put(tree, tree_entry);
> > > +       } else {
> > > +               /* free the local reference */
> > > +               zswap_entry_put(tree, tree_entry);
> > > +               /* free the entry if it's not been invalidated*/
> > > +               if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
> > > +                       zswap_entry_put(tree, tree_entry);
> >
> > The comment that was here about the 2 possible cases was useful imo,
> > maybe keep it?
>
> I honestly found it brittle in that we're not interested in the refcount there,
> but rather in releasing the base reference that keeps the entry valid.
> There's no way to know which refcount value it should be, though: consider
> that throughout this series the values can be 1 or 0, but also 2 or 1,
> depending on where this function is called (zpool_shrink or zswap_shrink).

Yeah, looking at it with fresh eyes this makes sense: we want to
invalidate the entry, without really caring about the refcount (see below).

>
> >
> > Also, I am not sure why we need to do a tree search vs. just reading
> > the refcount here before the first put. We can even make
> > zswap_entry_put() return the refcount after the put to know if we need
> > the additional put or not.
> >
> > Can anyone think of any reason why we need to explicitly search the tree here?
>
> I think that the reasoning here is that refcount > 0 doesn't necessarily mean
> that the entry is on the tree. I'm not sure if reading refcount directly here
> would cause an issue now, probably not, but I wonder if bugs could be
> introduced if the context in which this function is called changes.

Yeah I agree. As Johannes suggested, using zswap_invalidate_entry()
(from my exclusive loads patch in mm-unstable) would be best here imo.
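
For reference, a sketch of that helper as I have it there (modulo any
later tweaks in mm-unstable):

	static void zswap_invalidate_entry(struct zswap_tree *tree,
					   struct zswap_entry *entry)
	{
		/* remove from rbtree */
		zswap_rb_erase(&tree->rbroot, entry);

		/* drop the initial reference from entry creation */
		zswap_entry_put(tree, entry);
	}

Called under tree->lock, it unlinks the entry and drops the base
reference in one step, so writeback wouldn't leave an entry on the tree
after handing the data over to the swapcache.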

>
> >
> > >         }
> > > -       zswap_entry_put(tree, tree_entry);
> > >         spin_unlock(&tree->lock);
> > >
> > >         return ret ? -EAGAIN : 0;
> > > @@ -1039,16 +1046,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
> > >   * the swap cache, the compressed version stored by zswap can be
> > >   * freed.
> > >   */
> > > -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > > +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> > > +                                struct zswap_tree *tree)
> > >  {
> > > -       struct zswap_header *zhdr;
> > > -       swp_entry_t swpentry;
> > > -       struct zswap_tree *tree;
> > > -       pgoff_t offset;
> > > -       struct zswap_entry *entry;
> > > +       swp_entry_t swpentry = zhdr->swpentry;
> > >         struct page *page;
> > >         struct scatterlist input, output;
> > >         struct crypto_acomp_ctx *acomp_ctx;
> > > +       struct zpool *pool = entry->pool->zpool;
> > >
> > >         u8 *src, *tmp = NULL;
> > >         unsigned int dlen;
> > > @@ -1063,25 +1068,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > >                         return -ENOMEM;
> > >         }
> > >
> > > -       /* extract swpentry from data */
> > > -       zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> > > -       swpentry = zhdr->swpentry; /* here */
> > > -       tree = zswap_trees[swp_type(swpentry)];
> > > -       offset = swp_offset(swpentry);
> > > -       zpool_unmap_handle(pool, handle);
> > > -
> > > -       /* find and ref zswap entry */
> > > -       spin_lock(&tree->lock);
> > > -       entry = zswap_entry_find_get(&tree->rbroot, offset);
> > > -       if (!entry) {
> > > -               /* entry was invalidated */
> > > -               spin_unlock(&tree->lock);
> > > -               kfree(tmp);
> > > -               return 0;
> > > -       }
> > > -       spin_unlock(&tree->lock);
> > > -       BUG_ON(offset != entry->offset);
> > > -
> > >         /* try to allocate swap cache page */
> > >         switch (zswap_get_swap_cache_page(swpentry, &page)) {
> > >         case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
> > > @@ -1115,12 +1101,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > >                 acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > >                 dlen = PAGE_SIZE;
> > >
> > > -               zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> > > +               zhdr = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
> > >                 src = (u8 *)zhdr + sizeof(struct zswap_header);
> > >                 if (!zpool_can_sleep_mapped(pool)) {
> > >                         memcpy(tmp, src, entry->length);
> > >                         src = tmp;
> > > -                       zpool_unmap_handle(pool, handle);
> > > +                       zpool_unmap_handle(pool, entry->handle);
> > >                 }
> > >
> > >                 mutex_lock(acomp_ctx->mutex);
> > > @@ -1135,7 +1121,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > >                 if (!zpool_can_sleep_mapped(pool))
> > >                         kfree(tmp);
> > >                 else
> > > -                       zpool_unmap_handle(pool, handle);
> > > +                       zpool_unmap_handle(pool, entry->handle);
> > >
> > >                 BUG_ON(ret);
> > >                 BUG_ON(dlen != PAGE_SIZE);
> > > @@ -1152,23 +1138,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > >         put_page(page);
> > >         zswap_written_back_pages++;
> > >
> > > -       spin_lock(&tree->lock);
> > > -       /* drop local reference */
> > > -       zswap_entry_put(tree, entry);
> > > -
> > > -       /*
> > > -       * There are two possible situations for entry here:
> > > -       * (1) refcount is 1(normal case),  entry is valid and on the tree
> > > -       * (2) refcount is 0, entry is freed and not on the tree
> > > -       *     because invalidate happened during writeback
> > > -       *  search the tree and free the entry if find entry
> > > -       */
> > > -       if (entry == zswap_rb_search(&tree->rbroot, offset))
> > > -               zswap_entry_put(tree, entry);
> > > -       spin_unlock(&tree->lock);
> > > -
> > >         return ret;
> > > -
> > >  fail:
> > >         if (!zpool_can_sleep_mapped(pool))
> > >                 kfree(tmp);
> > > @@ -1177,13 +1147,8 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
> > >         * if we get here due to ZSWAP_SWAPCACHE_EXIST
> > >         * a load may be happening concurrently.
> > >         * it is safe and okay to not free the entry.
> > > -       * if we free the entry in the following put
> > >         * it is also okay to return !0
> > >         */
> > > -       spin_lock(&tree->lock);
> > > -       zswap_entry_put(tree, entry);
> > > -       spin_unlock(&tree->lock);
> > > -
> > >         return ret;
> > >  }
> > >
> > > --
> > > 2.34.1
> > >
  
Domenico Cerasuolo June 9, 2023, 11:05 a.m. UTC | #5
On Thu, Jun 8, 2023 at 6:48 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> Hi Domenico,
>
> On Tue, Jun 06, 2023 at 04:56:10PM +0200, Domenico Cerasuolo wrote:
> > Previously, in zswap, the writeback function was passed to zpool drivers
> > for their usage in calling the writeback operation. However, since the
> > drivers did not possess references to entries and the function was
> > specifically designed to work with handles, the writeback process has
> > been modified to occur directly within zswap.
>
> I'm having trouble parsing this sentence.
>
> > Consequently, this change allows for some simplification of the
> > writeback function, taking into account the updated workflow.
>
> How about:
>
> zswap_writeback_entry() used to be a callback for the backends, which
> don't know about struct zswap_entry.
>
> Now that the only user is the generic zswap LRU reclaimer, it can be
> simplified: pass the pinned zswap_entry directly, and consolidate the
> refcount management in the shrink function.

Sounds clearer, will update.

>
> > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> > ---
> >  mm/zswap.c | 69 ++++++++++++++----------------------------------------
> >  1 file changed, 17 insertions(+), 52 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 2831bf56b168..ef8604812352 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -250,7 +250,8 @@ static bool zswap_has_pool;
> >       pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> >                zpool_get_type((p)->zpool))
> >
> > -static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
> > +static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
> > +                              struct zswap_tree *tree);
> >  static int zswap_pool_get(struct zswap_pool *pool);
> >  static void zswap_pool_put(struct zswap_pool *pool);
> >
> > @@ -632,15 +633,21 @@ static int zswap_shrink(struct zswap_pool *pool)
> >       }
> >       spin_unlock(&tree->lock);
> >
> > -     ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
> > +     ret = zswap_writeback_entry(lru_entry, zhdr, tree);
> >
> >       spin_lock(&tree->lock);
> >       if (ret) {
> >               spin_lock(&pool->lru_lock);
> >               list_move(&lru_entry->lru, &pool->lru);
> >               spin_unlock(&pool->lru_lock);
>
> This could use a comment.
>
>                 /* Writeback failed, put entry back on LRU */
>
> > +             zswap_entry_put(tree, tree_entry);
>
> This put is a common factor in both branches, so you can consolidate.
>
> > +     } else {
> > +             /* free the local reference */
> > +             zswap_entry_put(tree, tree_entry);
> > +             /* free the entry if it's not been invalidated*/
>
> Missing space between text and */
>
> > +             if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
> > +                     zswap_entry_put(tree, tree_entry);
> >       }
> > -     zswap_entry_put(tree, tree_entry);
> >       spin_unlock(&tree->lock);
>
> The success path freeing (hopefully the common path) is now
> unfortunately hidden in fairly deep indentation. I think this would be
> better with a goto for the error case.
>
> All together, something like this?
>
>         if (ret) {
>                 /* Writeback failed, put entry back on LRU */
>                 ...
>                 goto put_unlock;
>         }
>
>         /*
>          * Writeback started successfully, the page now belongs to the
>          * swapcache. Drop the base ref from the tree - unless invalidate
>          * already took it out while we had the tree->lock released for IO.
>          */
>         if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
>                 zswap_entry_put(tree, lru_entry);
>
> put_unlock:
>         /* Drop local reference */
>         zswap_entry_put(tree, tree_entry);
>         spin_unlock(&tree->lock);
>
>         return ret ? -EAGAIN : 0;
>

This feedback overlaps with the one in patch 1/7; I'm integrating them
so that in patch #1 the invalidation check is done only with the rb
search and a first `goto unlock` for the error case.
Then here the base reference drop is added after the `ret` check, and
errors go to `put_unlock`:

	if (ret) {
		/* Writeback failed, put entry back on LRU */
		spin_lock(&pool->lru_lock);
		list_move(&entry->lru, &pool->lru);
		spin_unlock(&pool->lru_lock);
		goto put_unlock;
	}

	/* Check for invalidate() race */
	if (entry != zswap_rb_search(&tree->rbroot, swpoffset))
		goto put_unlock;

	/* Drop base reference */
	zswap_entry_put(tree, entry);

put_unlock:
	/* Drop local reference */
	zswap_entry_put(tree, entry);
unlock:
	spin_unlock(&tree->lock);
	return ret ? -EAGAIN : 0;

> Btw, it's unsettling that we drop the tree reference without
> explicitly removing the entry from the tree. We rely on the final put
> that frees the entry to do tree removal, but this doesn't happen if
> somebody else is holding a reference; the entry can remain in the tree
> long after we've officially handed over ownership of the data to
> swapcache. TTBOMK there are currently no bugs because of that, but
> it's a data corruption waiting to happen.
>
> This should be:
>
>         /*
>          * Writeback started successfully, the page now belongs to the
>          * swapcache. Drop the entry from zswap - unless invalidate already
>          * took it out while we had the tree->lock released for IO.
>          */
>         if (entry == zswap_rb_search(&tree->rbroot, swpoffset))
>                 zswap_invalidate_entry(tree, entry);
>
> Would you care to send a follow-up patch?

Makes total sense, thanks. Will send a patch for this.
  

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 2831bf56b168..ef8604812352 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -250,7 +250,8 @@  static bool zswap_has_pool;
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpool))
 
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
+static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
+				 struct zswap_tree *tree);
 static int zswap_pool_get(struct zswap_pool *pool);
 static void zswap_pool_put(struct zswap_pool *pool);
 
@@ -632,15 +633,21 @@  static int zswap_shrink(struct zswap_pool *pool)
 	}
 	spin_unlock(&tree->lock);
 
-	ret = zswap_writeback_entry(pool->zpool, lru_entry->handle);
+	ret = zswap_writeback_entry(lru_entry, zhdr, tree);
 
 	spin_lock(&tree->lock);
 	if (ret) {
 		spin_lock(&pool->lru_lock);
 		list_move(&lru_entry->lru, &pool->lru);
 		spin_unlock(&pool->lru_lock);
+		zswap_entry_put(tree, tree_entry);
+	} else {
+		/* free the local reference */
+		zswap_entry_put(tree, tree_entry);
+		/* free the entry if it's not been invalidated*/
+		if (lru_entry == zswap_rb_search(&tree->rbroot, swpoffset))
+			zswap_entry_put(tree, tree_entry);
 	}
-	zswap_entry_put(tree, tree_entry);
 	spin_unlock(&tree->lock);
 
 	return ret ? -EAGAIN : 0;
@@ -1039,16 +1046,14 @@  static int zswap_get_swap_cache_page(swp_entry_t entry,
  * the swap cache, the compressed version stored by zswap can be
  * freed.
  */
-static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
+static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_header *zhdr,
+				 struct zswap_tree *tree)
 {
-	struct zswap_header *zhdr;
-	swp_entry_t swpentry;
-	struct zswap_tree *tree;
-	pgoff_t offset;
-	struct zswap_entry *entry;
+	swp_entry_t swpentry = zhdr->swpentry;
 	struct page *page;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
+	struct zpool *pool = entry->pool->zpool;
 
 	u8 *src, *tmp = NULL;
 	unsigned int dlen;
@@ -1063,25 +1068,6 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 			return -ENOMEM;
 	}
 
-	/* extract swpentry from data */
-	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
-	swpentry = zhdr->swpentry; /* here */
-	tree = zswap_trees[swp_type(swpentry)];
-	offset = swp_offset(swpentry);
-	zpool_unmap_handle(pool, handle);
-
-	/* find and ref zswap entry */
-	spin_lock(&tree->lock);
-	entry = zswap_entry_find_get(&tree->rbroot, offset);
-	if (!entry) {
-		/* entry was invalidated */
-		spin_unlock(&tree->lock);
-		kfree(tmp);
-		return 0;
-	}
-	spin_unlock(&tree->lock);
-	BUG_ON(offset != entry->offset);
-
 	/* try to allocate swap cache page */
 	switch (zswap_get_swap_cache_page(swpentry, &page)) {
 	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -1115,12 +1101,12 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
 		dlen = PAGE_SIZE;
 
-		zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
+		zhdr = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO);
 		src = (u8 *)zhdr + sizeof(struct zswap_header);
 		if (!zpool_can_sleep_mapped(pool)) {
 			memcpy(tmp, src, entry->length);
 			src = tmp;
-			zpool_unmap_handle(pool, handle);
+			zpool_unmap_handle(pool, entry->handle);
 		}
 
 		mutex_lock(acomp_ctx->mutex);
@@ -1135,7 +1121,7 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 		if (!zpool_can_sleep_mapped(pool))
 			kfree(tmp);
 		else
-			zpool_unmap_handle(pool, handle);
+			zpool_unmap_handle(pool, entry->handle);
 
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
@@ -1152,23 +1138,7 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	put_page(page);
 	zswap_written_back_pages++;
 
-	spin_lock(&tree->lock);
-	/* drop local reference */
-	zswap_entry_put(tree, entry);
-
-	/*
-	* There are two possible situations for entry here:
-	* (1) refcount is 1(normal case),  entry is valid and on the tree
-	* (2) refcount is 0, entry is freed and not on the tree
-	*     because invalidate happened during writeback
-	*  search the tree and free the entry if find entry
-	*/
-	if (entry == zswap_rb_search(&tree->rbroot, offset))
-		zswap_entry_put(tree, entry);
-	spin_unlock(&tree->lock);
-
 	return ret;
-
 fail:
 	if (!zpool_can_sleep_mapped(pool))
 		kfree(tmp);
@@ -1177,13 +1147,8 @@  static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	* if we get here due to ZSWAP_SWAPCACHE_EXIST
 	* a load may be happening concurrently.
 	* it is safe and okay to not free the entry.
-	* if we free the entry in the following put
 	* it is also okay to return !0
 	*/
-	spin_lock(&tree->lock);
-	zswap_entry_put(tree, entry);
-	spin_unlock(&tree->lock);
-
 	return ret;
 }