[v3] mm: zswap: multiple zpools support

Message ID 20230620194644.3142384-1-yosryahmed@google.com
State New
Headers
Series [v3] mm: zswap: multiple zpools support |

Commit Message

Yosry Ahmed June 20, 2023, 7:46 p.m. UTC
  Support using multiple zpools of the same type in zswap, for concurrency
purposes. A fixed number of 32 zpools is suggested by this commit, which
was determined empirically. It can be later changed or made into a
config option if needed.

On a setup with zswap and zsmalloc, comparing a single zpool to 32
zpools shows improvements in the zsmalloc lock contention, especially on
the swap out path.

The following shows the perf analysis of the swapout path when 10
workloads are simultaneously reclaiming and refaulting tmpfs pages.
There are some improvements on the swap in path as well, but less
significant.

1 zpool:

 |--28.99%--zswap_frontswap_store
       |
       <snip>
       |
       |--8.98%--zpool_map_handle
       |     |
       |      --8.98%--zs_zpool_map
       |           |
       |            --8.95%--zs_map_object
       |                 |
       |                  --8.38%--_raw_spin_lock
       |                       |
       |                        --7.39%--queued_spin_lock_slowpath
       |
       |--8.82%--zpool_malloc
       |     |
       |      --8.82%--zs_zpool_malloc
       |           |
       |            --8.80%--zs_malloc
       |                 |
       |                 |--7.21%--_raw_spin_lock
       |                 |     |
       |                 |      --6.81%--queued_spin_lock_slowpath
       <snip>

32 zpools:

 |--16.73%--zswap_frontswap_store
       |
       <snip>
       |
       |--1.81%--zpool_malloc
       |     |
       |      --1.81%--zs_zpool_malloc
       |           |
       |            --1.79%--zs_malloc
       |                 |
       |                  --0.73%--obj_malloc
       |
       |--1.06%--zswap_update_total_size
       |
       |--0.59%--zpool_map_handle
       |     |
       |      --0.59%--zs_zpool_map
       |           |
       |            --0.57%--zs_map_object
       |                 |
       |                  --0.51%--_raw_spin_lock
       <snip>

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---

v2 -> v3:
- Removed config option (Johannes Weiner). Now it's a constant.
- Fixed spelling typos (Yu Zhao).

v1 -> v2:
- Prettified perf graph in commit log.
- Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
  fixed size array instead of a flex array.
- Removed stale comment.

---
 mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 27 deletions(-)
  

Comments

Nhat Pham July 9, 2023, 11:12 p.m. UTC | #1
On Tue, Jun 20, 2023 at 12:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> Support using multiple zpools of the same type in zswap, for concurrency
> purposes. A fixed number of 32 zpools is suggested by this commit, which
> was determined empirically. It can be later changed or made into a
> config option if needed.
>
> On a setup with zswap and zsmalloc, comparing a single zpool to 32
> zpools shows improvements in the zsmalloc lock contention, especially on
> the swap out path.
>
> The following shows the perf analysis of the swapout path when 10
> workloads are simultaneously reclaiming and refaulting tmpfs pages.
> There are some improvements on the swap in path as well, but less
> significant.
>
> 1 zpool:
>
>  |--28.99%--zswap_frontswap_store
>        |
>        <snip>
>        |
>        |--8.98%--zpool_map_handle
>        |     |
>        |      --8.98%--zs_zpool_map
>        |           |
>        |            --8.95%--zs_map_object
>        |                 |
>        |                  --8.38%--_raw_spin_lock
>        |                       |
>        |                        --7.39%--queued_spin_lock_slowpath
>        |
>        |--8.82%--zpool_malloc
>        |     |
>        |      --8.82%--zs_zpool_malloc
>        |           |
>        |            --8.80%--zs_malloc
>        |                 |
>        |                 |--7.21%--_raw_spin_lock
>        |                 |     |
>        |                 |      --6.81%--queued_spin_lock_slowpath
>        <snip>
>
> 32 zpools:
>
>  |--16.73%--zswap_frontswap_store
>        |
>        <snip>
>        |
>        |--1.81%--zpool_malloc
>        |     |
>        |      --1.81%--zs_zpool_malloc
>        |           |
>        |            --1.79%--zs_malloc
>        |                 |
>        |                  --0.73%--obj_malloc
>        |
>        |--1.06%--zswap_update_total_size
>        |
>        |--0.59%--zpool_map_handle
>        |     |
>        |      --0.59%--zs_zpool_map
>        |           |
>        |            --0.57%--zs_map_object
>        |                 |
>        |                  --0.51%--_raw_spin_lock
>        <snip>
>
> Suggested-by: Yu Zhao <yuzhao@google.com>
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>
> v2 -> v3:
> - Removed config option (Johannes Weiner). Now it's a constant.
> - Fixed spelling typos (Yu Zhao).
>
> v1 -> v2:
> - Prettified perf graph in commit log.
> - Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
>   fixed size array instead of a flex array.
> - Removed stale comment.
>
> ---
>  mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 54 insertions(+), 27 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 87b204233115..6ee7028497b8 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
>                 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
>  module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
>
> +/* Number of zpools in zswap_pool (empirically determined for scalability) */
> +#define ZSWAP_NR_ZPOOLS 32
> +
>  /*********************************
>  * data structures
>  **********************************/
> @@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
>   * needs to be verified that it's still valid in the tree.
>   */
>  struct zswap_pool {
> -       struct zpool *zpool;
> +       struct zpool *zpools[ZSWAP_NR_ZPOOLS];
>         struct crypto_acomp_ctx __percpu *acomp_ctx;
>         struct kref kref;
>         struct list_head list;
> @@ -248,7 +251,7 @@ static bool zswap_has_pool;
>
>  #define zswap_pool_debug(msg, p)                               \
>         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> -                zpool_get_type((p)->zpool))
> +                zpool_get_type((p)->zpools[0]))
>
>  static int zswap_writeback_entry(struct zswap_entry *entry,
>                                  struct zswap_tree *tree);
> @@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
>  {
>         struct zswap_pool *pool;
>         u64 total = 0;
> +       int i;
>
>         rcu_read_lock();
>
>         list_for_each_entry_rcu(pool, &zswap_pools, list)
> -               total += zpool_get_total_size(pool->zpool);
> +               for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> +                       total += zpool_get_total_size(pool->zpools[i]);
>
>         rcu_read_unlock();
>
> @@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
>         }
>  }
>
> +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> +{
> +       int i = 0;
> +
> +       if (ZSWAP_NR_ZPOOLS > 1)
> +               i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
> +
> +       return entry->pool->zpools[i];
> +}
> +
>  /*
>   * Carries out the common pattern of freeing and entry's zpool allocation,
>   * freeing the entry itself, and decrementing the number of stored pages.
> @@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
>                 spin_lock(&entry->pool->lru_lock);
>                 list_del(&entry->lru);
>                 spin_unlock(&entry->pool->lru_lock);
> -               zpool_free(entry->pool->zpool, entry->handle);
> +               zpool_free(zswap_find_zpool(entry), entry->handle);
>                 zswap_pool_put(entry->pool);
>         }
>         zswap_entry_cache_free(entry);
> @@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>         list_for_each_entry_rcu(pool, &zswap_pools, list) {
>                 if (strcmp(pool->tfm_name, compressor))
>                         continue;
> -               if (strcmp(zpool_get_type(pool->zpool), type))
> +               /* all zpools share the same type */
> +               if (strcmp(zpool_get_type(pool->zpools[0]), type))
>                         continue;
>                 /* if we can't get it, it's about to be destroyed */
>                 if (!zswap_pool_get(pool))
> @@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
>
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
> +       int i;
>         struct zswap_pool *pool;
>         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
>         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> @@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         if (!pool)
>                 return NULL;
>
> -       /* unique name for each pool specifically required by zsmalloc */
> -       snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> +               /* unique name for each pool specifically required by zsmalloc */
> +               snprintf(name, 38, "zswap%x",
> +                        atomic_inc_return(&zswap_pools_count));
>
> -       pool->zpool = zpool_create_pool(type, name, gfp);
> -       if (!pool->zpool) {
> -               pr_err("%s zpool not available\n", type);
> -               goto error;
> +               pool->zpools[i] = zpool_create_pool(type, name, gfp);
> +               if (!pool->zpools[i]) {
> +                       pr_err("%s zpool not available\n", type);
> +                       goto error;
> +               }
>         }
> -       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> +       pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
>
>         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
>
> @@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> -       if (pool->zpool)
> -               zpool_destroy_pool(pool->zpool);
> +       while (i--)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>         return NULL;
>  }
> @@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
>
>  static void zswap_pool_destroy(struct zswap_pool *pool)
>  {
> +       int i;
> +
>         zswap_pool_debug("destroying", pool);
>
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>         free_percpu(pool->acomp_ctx);
> -       zpool_destroy_pool(pool->zpool);
> +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> +               zpool_destroy_pool(pool->zpools[i]);
>         kfree(pool);
>  }
>
> @@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
>         struct page *page;
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
> -       struct zpool *pool = entry->pool->zpool;
> +       struct zpool *pool = zswap_find_zpool(entry);
>
>         u8 *src, *tmp = NULL;
>         unsigned int dlen;
> @@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         struct crypto_acomp_ctx *acomp_ctx;
>         struct obj_cgroup *objcg = NULL;
>         struct zswap_pool *pool;
> +       struct zpool *zpool;
>         int ret;
>         unsigned int dlen = PAGE_SIZE;
>         unsigned long handle, value;
> @@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>         }
>
>         /* store */
> +       zpool = zswap_find_zpool(entry);
>         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> -       if (zpool_malloc_support_movable(entry->pool->zpool))
> +       if (zpool_malloc_support_movable(zpool))
>                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> -       ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
> +       ret = zpool_malloc(zpool, dlen, gfp, &handle);
>         if (ret == -ENOSPC) {
>                 zswap_reject_compress_poor++;
>                 goto put_dstmem;
> @@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
>                 zswap_reject_alloc_fail++;
>                 goto put_dstmem;
>         }
> -       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> +       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
>         memcpy(buf, dst, dlen);
> -       zpool_unmap_handle(entry->pool->zpool, handle);
> +       zpool_unmap_handle(zpool, handle);
>         mutex_unlock(acomp_ctx->mutex);
>
>         /* populate entry */
> @@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         struct scatterlist input, output;
>         struct crypto_acomp_ctx *acomp_ctx;
>         u8 *src, *dst, *tmp;
> +       struct zpool *zpool;
>         unsigned int dlen;
>         int ret;
>
> @@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>                 goto stats;
>         }
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       zpool = zswap_find_zpool(entry);
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 tmp = kmalloc(entry->length, GFP_KERNEL);
>                 if (!tmp) {
>                         ret = -ENOMEM;
> @@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>
>         /* decompress */
>         dlen = PAGE_SIZE;
> -       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
>
> -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> +       if (!zpool_can_sleep_mapped(zpool)) {
>                 memcpy(tmp, src, entry->length);
>                 src = tmp;
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +               zpool_unmap_handle(zpool, entry->handle);
>         }
>
>         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> @@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
>         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
>         mutex_unlock(acomp_ctx->mutex);
>
> -       if (zpool_can_sleep_mapped(entry->pool->zpool))
> -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> +       if (zpool_can_sleep_mapped(zpool))
> +               zpool_unmap_handle(zpool, entry->handle);
>         else
>                 kfree(tmp);
>
> @@ -1616,7 +1643,7 @@ static int zswap_setup(void)
>         pool = __zswap_pool_create_fallback();
>         if (pool) {
>                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> -                       zpool_get_type(pool->zpool));
> +                       zpool_get_type(pool->zpools[0]));
>                 list_add(&pool->list, &zswap_pools);
>                 zswap_has_pool = true;
>         } else {
> --
> 2.41.0.162.gfafddb0af9-goog
>

In terms of correctness, the code LGTM.
However, I do share Johannes' concern about this change.

May I ask how sensitive is the system performance to the number of pools?
i.e during the tuning process, did you see lots of performance
variability as you vary the number of pools? Is 32 a number that
works well across workloads, hardware, etc?

Personally, I prefer the per-CPU pool idea - or some way to automatically
determine the number of pools that are more generic than this 32 value
(what if we have new hardware with more CPUs? Would 32 still be valid,
or do we have to change it?).

I'm experimenting with some other zswap changes - if I have
extra cycles and resources I'll try to apply this patch and see how the
numbers play out.

I'll defer to Johannes and other reviewers for further comments.
  
Yosry Ahmed July 13, 2023, 10:35 a.m. UTC | #2
On Sun, Jul 9, 2023 at 4:12 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Jun 20, 2023 at 12:46 PM Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > Support using multiple zpools of the same type in zswap, for concurrency
> > purposes. A fixed number of 32 zpools is suggested by this commit, which
> > was determined empirically. It can be later changed or made into a
> > config option if needed.
> >
> > On a setup with zswap and zsmalloc, comparing a single zpool to 32
> > zpools shows improvements in the zsmalloc lock contention, especially on
> > the swap out path.
> >
> > The following shows the perf analysis of the swapout path when 10
> > workloads are simultaneously reclaiming and refaulting tmpfs pages.
> > There are some improvements on the swap in path as well, but less
> > significant.
> >
> > 1 zpool:
> >
> >  |--28.99%--zswap_frontswap_store
> >        |
> >        <snip>
> >        |
> >        |--8.98%--zpool_map_handle
> >        |     |
> >        |      --8.98%--zs_zpool_map
> >        |           |
> >        |            --8.95%--zs_map_object
> >        |                 |
> >        |                  --8.38%--_raw_spin_lock
> >        |                       |
> >        |                        --7.39%--queued_spin_lock_slowpath
> >        |
> >        |--8.82%--zpool_malloc
> >        |     |
> >        |      --8.82%--zs_zpool_malloc
> >        |           |
> >        |            --8.80%--zs_malloc
> >        |                 |
> >        |                 |--7.21%--_raw_spin_lock
> >        |                 |     |
> >        |                 |      --6.81%--queued_spin_lock_slowpath
> >        <snip>
> >
> > 32 zpools:
> >
> >  |--16.73%--zswap_frontswap_store
> >        |
> >        <snip>
> >        |
> >        |--1.81%--zpool_malloc
> >        |     |
> >        |      --1.81%--zs_zpool_malloc
> >        |           |
> >        |            --1.79%--zs_malloc
> >        |                 |
> >        |                  --0.73%--obj_malloc
> >        |
> >        |--1.06%--zswap_update_total_size
> >        |
> >        |--0.59%--zpool_map_handle
> >        |     |
> >        |      --0.59%--zs_zpool_map
> >        |           |
> >        |            --0.57%--zs_map_object
> >        |                 |
> >        |                  --0.51%--_raw_spin_lock
> >        <snip>
> >
> > Suggested-by: Yu Zhao <yuzhao@google.com>
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >
> > v2 -> v3:
> > - Removed config option (Johannes Weiner). Now it's a constant.
> > - Fixed spelling typos (Yu Zhao).
> >
> > v1 -> v2:
> > - Prettified perf graph in commit log.
> > - Changed zswap_nr_zpools to a macro, changed zswap_pool->zpools to a
> >   fixed size array instead of a flex array.
> > - Removed stale comment.
> >
> > ---
> >  mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------
> >  1 file changed, 54 insertions(+), 27 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 87b204233115..6ee7028497b8 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED(
> >                 CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
> >  module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
> >
> > +/* Number of zpools in zswap_pool (empirically determined for scalability) */
> > +#define ZSWAP_NR_ZPOOLS 32
> > +
> >  /*********************************
> >  * data structures
> >  **********************************/
> > @@ -161,7 +164,7 @@ struct crypto_acomp_ctx {
> >   * needs to be verified that it's still valid in the tree.
> >   */
> >  struct zswap_pool {
> > -       struct zpool *zpool;
> > +       struct zpool *zpools[ZSWAP_NR_ZPOOLS];
> >         struct crypto_acomp_ctx __percpu *acomp_ctx;
> >         struct kref kref;
> >         struct list_head list;
> > @@ -248,7 +251,7 @@ static bool zswap_has_pool;
> >
> >  #define zswap_pool_debug(msg, p)                               \
> >         pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
> > -                zpool_get_type((p)->zpool))
> > +                zpool_get_type((p)->zpools[0]))
> >
> >  static int zswap_writeback_entry(struct zswap_entry *entry,
> >                                  struct zswap_tree *tree);
> > @@ -272,11 +275,13 @@ static void zswap_update_total_size(void)
> >  {
> >         struct zswap_pool *pool;
> >         u64 total = 0;
> > +       int i;
> >
> >         rcu_read_lock();
> >
> >         list_for_each_entry_rcu(pool, &zswap_pools, list)
> > -               total += zpool_get_total_size(pool->zpool);
> > +               for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > +                       total += zpool_get_total_size(pool->zpools[i]);
> >
> >         rcu_read_unlock();
> >
> > @@ -363,6 +368,16 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
> >         }
> >  }
> >
> > +static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
> > +{
> > +       int i = 0;
> > +
> > +       if (ZSWAP_NR_ZPOOLS > 1)
> > +               i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
> > +
> > +       return entry->pool->zpools[i];
> > +}
> > +
> >  /*
> >   * Carries out the common pattern of freeing and entry's zpool allocation,
> >   * freeing the entry itself, and decrementing the number of stored pages.
> > @@ -379,7 +394,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
> >                 spin_lock(&entry->pool->lru_lock);
> >                 list_del(&entry->lru);
> >                 spin_unlock(&entry->pool->lru_lock);
> > -               zpool_free(entry->pool->zpool, entry->handle);
> > +               zpool_free(zswap_find_zpool(entry), entry->handle);
> >                 zswap_pool_put(entry->pool);
> >         }
> >         zswap_entry_cache_free(entry);
> > @@ -588,7 +603,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> >         list_for_each_entry_rcu(pool, &zswap_pools, list) {
> >                 if (strcmp(pool->tfm_name, compressor))
> >                         continue;
> > -               if (strcmp(zpool_get_type(pool->zpool), type))
> > +               /* all zpools share the same type */
> > +               if (strcmp(zpool_get_type(pool->zpools[0]), type))
> >                         continue;
> >                 /* if we can't get it, it's about to be destroyed */
> >                 if (!zswap_pool_get(pool))
> > @@ -692,6 +708,7 @@ static void shrink_worker(struct work_struct *w)
> >
> >  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >  {
> > +       int i;
> >         struct zswap_pool *pool;
> >         char name[38]; /* 'zswap' + 32 char (max) num + \0 */
> >         gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> > @@ -712,15 +729,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >         if (!pool)
> >                 return NULL;
> >
> > -       /* unique name for each pool specifically required by zsmalloc */
> > -       snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> > +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
> > +               /* unique name for each pool specifically required by zsmalloc */
> > +               snprintf(name, 38, "zswap%x",
> > +                        atomic_inc_return(&zswap_pools_count));
> >
> > -       pool->zpool = zpool_create_pool(type, name, gfp);
> > -       if (!pool->zpool) {
> > -               pr_err("%s zpool not available\n", type);
> > -               goto error;
> > +               pool->zpools[i] = zpool_create_pool(type, name, gfp);
> > +               if (!pool->zpools[i]) {
> > +                       pr_err("%s zpool not available\n", type);
> > +                       goto error;
> > +               }
> >         }
> > -       pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
> > +       pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
> >
> >         strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
> >
> > @@ -752,8 +772,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >  error:
> >         if (pool->acomp_ctx)
> >                 free_percpu(pool->acomp_ctx);
> > -       if (pool->zpool)
> > -               zpool_destroy_pool(pool->zpool);
> > +       while (i--)
> > +               zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> >         return NULL;
> >  }
> > @@ -802,11 +822,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
> >
> >  static void zswap_pool_destroy(struct zswap_pool *pool)
> >  {
> > +       int i;
> > +
> >         zswap_pool_debug("destroying", pool);
> >
> >         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> >         free_percpu(pool->acomp_ctx);
> > -       zpool_destroy_pool(pool->zpool);
> > +       for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> > +               zpool_destroy_pool(pool->zpools[i]);
> >         kfree(pool);
> >  }
> >
> > @@ -1070,7 +1093,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
> >         struct page *page;
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> > -       struct zpool *pool = entry->pool->zpool;
> > +       struct zpool *pool = zswap_find_zpool(entry);
> >
> >         u8 *src, *tmp = NULL;
> >         unsigned int dlen;
> > @@ -1211,6 +1234,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         struct obj_cgroup *objcg = NULL;
> >         struct zswap_pool *pool;
> > +       struct zpool *zpool;
> >         int ret;
> >         unsigned int dlen = PAGE_SIZE;
> >         unsigned long handle, value;
> > @@ -1321,10 +1345,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >         }
> >
> >         /* store */
> > +       zpool = zswap_find_zpool(entry);
> >         gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
> > -       if (zpool_malloc_support_movable(entry->pool->zpool))
> > +       if (zpool_malloc_support_movable(zpool))
> >                 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
> > -       ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
> > +       ret = zpool_malloc(zpool, dlen, gfp, &handle);
> >         if (ret == -ENOSPC) {
> >                 zswap_reject_compress_poor++;
> >                 goto put_dstmem;
> > @@ -1333,9 +1358,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
> >                 zswap_reject_alloc_fail++;
> >                 goto put_dstmem;
> >         }
> > -       buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
> > +       buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
> >         memcpy(buf, dst, dlen);
> > -       zpool_unmap_handle(entry->pool->zpool, handle);
> > +       zpool_unmap_handle(zpool, handle);
> >         mutex_unlock(acomp_ctx->mutex);
> >
> >         /* populate entry */
> > @@ -1406,6 +1431,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >         struct scatterlist input, output;
> >         struct crypto_acomp_ctx *acomp_ctx;
> >         u8 *src, *dst, *tmp;
> > +       struct zpool *zpool;
> >         unsigned int dlen;
> >         int ret;
> >
> > @@ -1427,7 +1453,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >                 goto stats;
> >         }
> >
> > -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> > +       zpool = zswap_find_zpool(entry);
> > +       if (!zpool_can_sleep_mapped(zpool)) {
> >                 tmp = kmalloc(entry->length, GFP_KERNEL);
> >                 if (!tmp) {
> >                         ret = -ENOMEM;
> > @@ -1437,12 +1464,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >
> >         /* decompress */
> >         dlen = PAGE_SIZE;
> > -       src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
> > +       src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
> >
> > -       if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
> > +       if (!zpool_can_sleep_mapped(zpool)) {
> >                 memcpy(tmp, src, entry->length);
> >                 src = tmp;
> > -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> > +               zpool_unmap_handle(zpool, entry->handle);
> >         }
> >
> >         acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
> > @@ -1454,8 +1481,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
> >         ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
> >         mutex_unlock(acomp_ctx->mutex);
> >
> > -       if (zpool_can_sleep_mapped(entry->pool->zpool))
> > -               zpool_unmap_handle(entry->pool->zpool, entry->handle);
> > +       if (zpool_can_sleep_mapped(zpool))
> > +               zpool_unmap_handle(zpool, entry->handle);
> >         else
> >                 kfree(tmp);
> >
> > @@ -1616,7 +1643,7 @@ static int zswap_setup(void)
> >         pool = __zswap_pool_create_fallback();
> >         if (pool) {
> >                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> > -                       zpool_get_type(pool->zpool));
> > +                       zpool_get_type(pool->zpools[0]));
> >                 list_add(&pool->list, &zswap_pools);
> >                 zswap_has_pool = true;
> >         } else {
> > --
> > 2.41.0.162.gfafddb0af9-goog
> >
>
> In terms of correctness, the code LGTM.
> However, I do share Johannes' concern about this change.
>
> May I ask how sensitive is the system performance to the number of pools?
> i.e during the tuning process, did you see lots of performance
> variability as you vary the number of pools? Is 32 a number that
> works well across workloads, hardware, etc?

I did not tune this myself, it has been used in our fleet for many
years now, and honestly I am not sure what range of values was tried
out. For us, 32 is a number that works well across our entire fleet
(that uses zswap ofc).

>
> Personally, I prefer the per-CPU pool idea - or some way to automatically
> determine the number of pools that are more generic than this 32 value
> (what if we have new hardware with more CPUs? Would 32 still be valid,
> or do we have to change it?).

Ideally, we would have the number of zpools be a function of the
system specs, or even autotune based on the workload. However, I don't
think we have a clear idea about what this should look like. While a
constant value is suboptimal, we have multiple constants in MM that
seem to be working relatively well across different machines and
workloads (e.g. SWAP_CLUSTER_MAX) -- so it's not unheard of.

We have been using 32 zpools across different machines and workloads
for years now. I would be hesitant to throw away years of production
testing right away, without data to back that something else is
better. I would prefer to start with something that (at least in our
fleet) is proven to be good, and we can easily extend it later to
replace 32 with a more sophisticated formula or something that is
calculated at boot or even tuned by userspace. Having the support in
the code to have multiple zpools is valuable as-is imo.

>
> I'm experimenting with some other zswap changes - if I have
> extra cycles and resources I'll try to apply this patch and see how the
> numbers play out.

That would be amazing. Looking forward to any numbers you can dig :)

>
> I'll defer to Johannes and other reviewers for further comments.
  
Andrew Morton Aug. 11, 2023, 9:19 p.m. UTC | #3
On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:

> >
> > I'm experimenting with some other zswap changes - if I have
> > extra cycles and resources I'll try to apply this patch and see how the
> > numbers play out.
> 
> That would be amazing. Looking forward to any numbers you can dig :)

So this patch seems stuck.  I can keep it in mm.git until the fog
clears, but would prefer not to.  Can we please revisit and decide on a
way forward?
  
Yosry Ahmed Aug. 11, 2023, 11:20 p.m. UTC | #4
On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
>
> > >
> > > I'm experimenting with some other zswap changes - if I have
> > > extra cycles and resources I'll try to apply this patch and see how the
> > > numbers play out.
> >
> > That would be amazing. Looking forward to any numbers you can dig :)
>
> So this patch seems stuck.  I can keep it in mm.git until the fog
> clears, but would prefer not to.  Can we please revisit and decide on a
> way forward?

Johannes did not like a config option so I proposed it here as a
constant (like SWAP_CLUSTER_MAX and others we have). This is a value
that we have been using in our data centers for almost a decade, so it
has seen a ton of testing. I was hoping Johannes would get time to
take a look, or Nhat would get time to test it out, but neither of
these things happen.

I obviously want it to be merged, but hopefully someone will chime in here :)
  
Chris Li Aug. 15, 2023, 10:22 p.m. UTC | #5
Hi Yosry,

On Fri, Aug 11, 2023 at 4:21 PM Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Aug 11, 2023 at 2:19 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > On Thu, 13 Jul 2023 03:35:25 -0700 Yosry Ahmed <yosryahmed@google.com> wrote:
> >
> > > >
> > > > I'm experimenting with some other zswap changes - if I have
> > > > extra cycles and resources I'll try to apply this patch and see how the
> > > > numbers play out.
> > >
> > > That would be amazing. Looking forward to any numbers you can dig :)
> >
> > So this patch seems stuck.  I can keep it in mm.git until the fog
> > clears, but would prefer not to.  Can we please revisit and decide on a
> > way forward?
>
> Johannes did not like a config option so I proposed it here as a
> constant (like SWAP_CLUSTER_MAX and others we have). This is a value
> that we have been using in our data centers for almost a decade, so it

I dug up the previous V1 discussion and this V3 discussion thread.
It seems obvious having multiple pools having locking contention advantage.
The number does not lie.

However the number of pools is hard to decide at compile time.

Regarding the per CPU pool. That might work well for a small number of CPUs.
When the system has many CPUs e.g. a few hundreds of CPUs. It means having
hundreds of pools which is  a bad idea.

How about just setting it as a run time value(size/bits) and can only
change pool
(size/bits) when zswap does not have any active stores.

Chris

> has seen a ton of testing. I was hoping Johannes would get time to
> take a look, or Nhat would get time to test it out, but neither of
> these things happen.
>
> I obviously want it to be merged, but hopefully someone will chime in here :)
>
  

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 87b204233115..6ee7028497b8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -142,6 +142,9 @@  static bool zswap_exclusive_loads_enabled = IS_ENABLED(
 		CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON);
 module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
 
+/* Number of zpools in zswap_pool (empirically determined for scalability) */
+#define ZSWAP_NR_ZPOOLS 32
+
 /*********************************
 * data structures
 **********************************/
@@ -161,7 +164,7 @@  struct crypto_acomp_ctx {
  * needs to be verified that it's still valid in the tree.
  */
 struct zswap_pool {
-	struct zpool *zpool;
+	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct kref kref;
 	struct list_head list;
@@ -248,7 +251,7 @@  static bool zswap_has_pool;
 
 #define zswap_pool_debug(msg, p)				\
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
-		 zpool_get_type((p)->zpool))
+		 zpool_get_type((p)->zpools[0]))
 
 static int zswap_writeback_entry(struct zswap_entry *entry,
 				 struct zswap_tree *tree);
@@ -272,11 +275,13 @@  static void zswap_update_total_size(void)
 {
 	struct zswap_pool *pool;
 	u64 total = 0;
+	int i;
 
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		total += zpool_get_total_size(pool->zpool);
+		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+			total += zpool_get_total_size(pool->zpools[i]);
 
 	rcu_read_unlock();
 
@@ -363,6 +368,16 @@  static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	}
 }
 
+static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
+{
+	int i = 0;
+
+	if (ZSWAP_NR_ZPOOLS > 1)
+		i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
+
+	return entry->pool->zpools[i];
+}
+
 /*
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
@@ -379,7 +394,7 @@  static void zswap_free_entry(struct zswap_entry *entry)
 		spin_lock(&entry->pool->lru_lock);
 		list_del(&entry->lru);
 		spin_unlock(&entry->pool->lru_lock);
-		zpool_free(entry->pool->zpool, entry->handle);
+		zpool_free(zswap_find_zpool(entry), entry->handle);
 		zswap_pool_put(entry->pool);
 	}
 	zswap_entry_cache_free(entry);
@@ -588,7 +603,8 @@  static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		if (strcmp(pool->tfm_name, compressor))
 			continue;
-		if (strcmp(zpool_get_type(pool->zpool), type))
+		/* all zpools share the same type */
+		if (strcmp(zpool_get_type(pool->zpools[0]), type))
 			continue;
 		/* if we can't get it, it's about to be destroyed */
 		if (!zswap_pool_get(pool))
@@ -692,6 +708,7 @@  static void shrink_worker(struct work_struct *w)
 
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
+	int i;
 	struct zswap_pool *pool;
 	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
@@ -712,15 +729,18 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	if (!pool)
 		return NULL;
 
-	/* unique name for each pool specifically required by zsmalloc */
-	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+		/* unique name for each pool specifically required by zsmalloc */
+		snprintf(name, 38, "zswap%x",
+			 atomic_inc_return(&zswap_pools_count));
 
-	pool->zpool = zpool_create_pool(type, name, gfp);
-	if (!pool->zpool) {
-		pr_err("%s zpool not available\n", type);
-		goto error;
+		pool->zpools[i] = zpool_create_pool(type, name, gfp);
+		if (!pool->zpools[i]) {
+			pr_err("%s zpool not available\n", type);
+			goto error;
+		}
 	}
-	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
+	pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0]));
 
 	strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 
@@ -752,8 +772,8 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
-	if (pool->zpool)
-		zpool_destroy_pool(pool->zpool);
+	while (i--)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 	return NULL;
 }
@@ -802,11 +822,14 @@  static struct zswap_pool *__zswap_pool_create_fallback(void)
 
 static void zswap_pool_destroy(struct zswap_pool *pool)
 {
+	int i;
+
 	zswap_pool_debug("destroying", pool);
 
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
-	zpool_destroy_pool(pool->zpool);
+	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 }
 
@@ -1070,7 +1093,7 @@  static int zswap_writeback_entry(struct zswap_entry *entry,
 	struct page *page;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
-	struct zpool *pool = entry->pool->zpool;
+	struct zpool *pool = zswap_find_zpool(entry);
 
 	u8 *src, *tmp = NULL;
 	unsigned int dlen;
@@ -1211,6 +1234,7 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct obj_cgroup *objcg = NULL;
 	struct zswap_pool *pool;
+	struct zpool *zpool;
 	int ret;
 	unsigned int dlen = PAGE_SIZE;
 	unsigned long handle, value;
@@ -1321,10 +1345,11 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	}
 
 	/* store */
+	zpool = zswap_find_zpool(entry);
 	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(entry->pool->zpool))
+	if (zpool_malloc_support_movable(zpool))
 		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
-	ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle);
+	ret = zpool_malloc(zpool, dlen, gfp, &handle);
 	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
 		goto put_dstmem;
@@ -1333,9 +1358,9 @@  static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		zswap_reject_alloc_fail++;
 		goto put_dstmem;
 	}
-	buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
+	buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO);
 	memcpy(buf, dst, dlen);
-	zpool_unmap_handle(entry->pool->zpool, handle);
+	zpool_unmap_handle(zpool, handle);
 	mutex_unlock(acomp_ctx->mutex);
 
 	/* populate entry */
@@ -1406,6 +1431,7 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	u8 *src, *dst, *tmp;
+	struct zpool *zpool;
 	unsigned int dlen;
 	int ret;
 
@@ -1427,7 +1453,8 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 		goto stats;
 	}
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	zpool = zswap_find_zpool(entry);
+	if (!zpool_can_sleep_mapped(zpool)) {
 		tmp = kmalloc(entry->length, GFP_KERNEL);
 		if (!tmp) {
 			ret = -ENOMEM;
@@ -1437,12 +1464,12 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
+	src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
 
-	if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+	if (!zpool_can_sleep_mapped(zpool)) {
 		memcpy(tmp, src, entry->length);
 		src = tmp;
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+		zpool_unmap_handle(zpool, entry->handle);
 	}
 
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
@@ -1454,8 +1481,8 @@  static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
 	mutex_unlock(acomp_ctx->mutex);
 
-	if (zpool_can_sleep_mapped(entry->pool->zpool))
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	if (zpool_can_sleep_mapped(zpool))
+		zpool_unmap_handle(zpool, entry->handle);
 	else
 		kfree(tmp);
 
@@ -1616,7 +1643,7 @@  static int zswap_setup(void)
 	pool = __zswap_pool_create_fallback();
 	if (pool) {
 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
-			zpool_get_type(pool->zpool));
+			zpool_get_type(pool->zpools[0]));
 		list_add(&pool->list, &zswap_pools);
 		zswap_has_pool = true;
 	} else {