[v5,4/6] zsmalloc: Add a LRU to zs_pool to keep track of zspages in LRU order
Commit Message
This helps determine the coldest zspages as candidates for writeback.
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
---
mm/zsmalloc.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 43 insertions(+), 2 deletions(-)
--
2.30.2
Comments
On Fri, Nov 18, 2022 at 10:24:05AM -0800, Nhat Pham wrote:
> This helps determines the coldest zspages as candidates for writeback.
>
> Signed-off-by: Nhat Pham <nphamcs@gmail.com>
> ---
> mm/zsmalloc.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 326faa751f0a..9e7b54324181 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -239,6 +239,11 @@ struct zs_pool {
> /* Compact classes */
> struct shrinker shrinker;
>
> +#ifdef CONFIG_ZPOOL
> + /* List tracking the zspages in LRU order by most recently added object */
> + struct list_head lru;
> +#endif
> +
> #ifdef CONFIG_ZSMALLOC_STAT
> struct dentry *stat_dentry;
> #endif
> @@ -260,6 +265,12 @@ struct zspage {
> unsigned int freeobj;
> struct page *first_page;
> struct list_head list; /* fullness list */
> +
> +#ifdef CONFIG_ZPOOL
> + /* links the zspage to the lru list in the pool */
> + struct list_head lru;
> +#endif
> +
> struct zs_pool *pool;
> #ifdef CONFIG_COMPACTION
> rwlock_t lock;
> @@ -352,6 +363,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
> kmem_cache_free(pool->zspage_cachep, zspage);
> }
>
> +#ifdef CONFIG_ZPOOL
> +/* Moves the zspage to the front of the zspool's LRU */
> +static void move_to_front(struct zs_pool *pool, struct zspage *zspage)
> +{
> + assert_spin_locked(&pool->lock);
> +
> + if (!list_empty(&zspage->lru))
> + list_del(&zspage->lru);
> + list_add(&zspage->lru, &pool->lru);
> +}
> +#endif
> +
> /* pool->lock(which owns the handle) synchronizes races */
> static void record_obj(unsigned long handle, unsigned long obj)
> {
> @@ -953,6 +976,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
> }
>
> remove_zspage(class, zspage, ZS_EMPTY);
> +#ifdef CONFIG_ZPOOL
> + list_del(&zspage->lru);
> +#endif
> __free_zspage(pool, class, zspage);
> }
>
> @@ -998,6 +1024,10 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
> off %= PAGE_SIZE;
> }
>
> +#ifdef CONFIG_ZPOOL
> + INIT_LIST_HEAD(&zspage->lru);
> +#endif
> +
> set_freeobj(zspage, 0);
> }
>
> @@ -1418,9 +1448,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> fix_fullness_group(class, zspage);
> record_obj(handle, obj);
> class_stat_inc(class, OBJ_USED, 1);
> - spin_unlock(&pool->lock);
>
> - return handle;
> + goto out;
> }
>
> spin_unlock(&pool->lock);
> @@ -1444,6 +1473,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
>
> /* We completely set up zspage so mark them as movable */
> SetZsPageMovable(pool, zspage);
> +out:
> +#ifdef CONFIG_ZPOOL
> + /* Move the zspage to front of pool's LRU */
> + move_to_front(pool, zspage);
> +#endif
> spin_unlock(&pool->lock);
Please move the move_to_front call into zs_map_object for the ZS_MM_WO
case, with a comment explaining "why we are doing it only for the WO case".
On Fri, Nov 18, 2022 at 11:32:01AM -0800, Minchan Kim wrote:
> On Fri, Nov 18, 2022 at 10:24:05AM -0800, Nhat Pham wrote:
> > @@ -1444,6 +1473,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> >
> > /* We completely set up zspage so mark them as movable */
> > SetZsPageMovable(pool, zspage);
> > +out:
> > +#ifdef CONFIG_ZPOOL
> > + /* Move the zspage to front of pool's LRU */
> > + move_to_front(pool, zspage);
> > +#endif
> > spin_unlock(&pool->lock);
>
> Please move the move_to_front into zs_map_object with ZS_MM_WO with
> comment with "why we are doing only for WO case".
I replied to the other thread, but I disagree with this request.
The WO exception would be as zswap-specific as is the
rotate-on-alloc. It doesn't make the resulting zsmalloc code any
cleaner or more generic, just weird in a slightly different way.
On the other hand, it makes zsmalloc deviate from the other backends
and introduces new callchains that invalidate thousands of machine
hours of production testing of this code.
On Fri, Nov 18, 2022 at 03:05:04PM -0500, Johannes Weiner wrote:
> On Fri, Nov 18, 2022 at 11:32:01AM -0800, Minchan Kim wrote:
> > On Fri, Nov 18, 2022 at 10:24:05AM -0800, Nhat Pham wrote:
> > > @@ -1444,6 +1473,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> > >
> > > /* We completely set up zspage so mark them as movable */
> > > SetZsPageMovable(pool, zspage);
> > > +out:
> > > +#ifdef CONFIG_ZPOOL
> > > + /* Move the zspage to front of pool's LRU */
> > > + move_to_front(pool, zspage);
> > > +#endif
> > > spin_unlock(&pool->lock);
> >
> > Please move the move_to_front into zs_map_object with ZS_MM_WO with
> > comment with "why we are doing only for WO case".
>
> I replied to the other thread, but I disagree with this request.
>
> The WO exception would be as zswap-specific as is the
> rotate-on-alloc. It doesn't make the resulting zsmalloc code any
That's true, but at least zs_pool allocators have the accessor, so
that's a fair place to have the LRU updating. I guess that's why
you agreed that's a better place. No?
I understand it's zswap-specific: the bad design keeps
pushing smelly code into allocators, and then there is a "push to take
it since others were already doing it" along with "we will take them off
with a better solution in the future". I am really struggling to understand
this concept. Johannes, is that really how we have worked for over a decade?
> cleaner or more generic, just weird in a slightly different way.
>
> On the other hand, it makes zsmalloc deviate from the other backends
> and introduces new callchains that invalidate thousands of machine
> hours of production testing of this code.
Do you really believe this trivial change invalidates
the testing?
ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
}
if (ret) {
zswap_reject_alloc_fail++;
goto put_dstmem;
}
buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
On Fri, Nov 18, 2022 at 01:35:01PM -0800, Minchan Kim wrote:
> On Fri, Nov 18, 2022 at 03:05:04PM -0500, Johannes Weiner wrote:
> > On Fri, Nov 18, 2022 at 11:32:01AM -0800, Minchan Kim wrote:
> > > On Fri, Nov 18, 2022 at 10:24:05AM -0800, Nhat Pham wrote:
> > > > @@ -1444,6 +1473,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
> > > >
> > > > /* We completely set up zspage so mark them as movable */
> > > > SetZsPageMovable(pool, zspage);
> > > > +out:
> > > > +#ifdef CONFIG_ZPOOL
> > > > + /* Move the zspage to front of pool's LRU */
> > > > + move_to_front(pool, zspage);
> > > > +#endif
> > > > spin_unlock(&pool->lock);
> > >
> > > Please move the move_to_front into zs_map_object with ZS_MM_WO with
> > > comment with "why we are doing only for WO case".
> >
> > I replied to the other thread, but I disagree with this request.
> >
> > The WO exception would be as zswap-specific as is the
> > rotate-on-alloc. It doesn't make the resulting zsmalloc code any
>
> That's true but at least, zs_pool allocators have the accessor so
> that's fair place to have the LRU updating. I guess that's why
> you agreed that's better place. No?
>
> I understand that's zswap-specific that the bad design keeps
> pushing smelly code into allocators and then "push to take it
> since other were already doing" with "we will take them off with
> better solution in future". I am really struggling to understand
> this concept. Johannes, Is that really how we work over a decade?
My point was that there is no difference between having zswap code in
alloc or in map. And there is a small upside to having it in alloc
because of the other backends.
But I won't fight you on it. The code isn't going to stay like this
for long anyway.
@@ -239,6 +239,11 @@ struct zs_pool {
/* Compact classes */
struct shrinker shrinker;
+#ifdef CONFIG_ZPOOL
+ /* List tracking the zspages in LRU order by most recently added object */
+ struct list_head lru;
+#endif
+
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -260,6 +265,12 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
+
+#ifdef CONFIG_ZPOOL
+ /* links the zspage to the lru list in the pool */
+ struct list_head lru;
+#endif
+
struct zs_pool *pool;
#ifdef CONFIG_COMPACTION
rwlock_t lock;
@@ -352,6 +363,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
kmem_cache_free(pool->zspage_cachep, zspage);
}
+#ifdef CONFIG_ZPOOL
+/* Moves the zspage to the front of the zspool's LRU */
+static void move_to_front(struct zs_pool *pool, struct zspage *zspage)
+{
+ assert_spin_locked(&pool->lock);
+
+ if (!list_empty(&zspage->lru))
+ list_del(&zspage->lru);
+ list_add(&zspage->lru, &pool->lru);
+}
+#endif
+
/* pool->lock(which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
@@ -953,6 +976,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
}
remove_zspage(class, zspage, ZS_EMPTY);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
}
@@ -998,6 +1024,10 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
off %= PAGE_SIZE;
}
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&zspage->lru);
+#endif
+
set_freeobj(zspage, 0);
}
@@ -1418,9 +1448,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
fix_fullness_group(class, zspage);
record_obj(handle, obj);
class_stat_inc(class, OBJ_USED, 1);
- spin_unlock(&pool->lock);
- return handle;
+ goto out;
}
spin_unlock(&pool->lock);
@@ -1444,6 +1473,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
+out:
+#ifdef CONFIG_ZPOOL
+ /* Move the zspage to front of pool's LRU */
+ move_to_front(pool, zspage);
+#endif
spin_unlock(&pool->lock);
return handle;
@@ -1967,6 +2001,9 @@ static void async_free_zspage(struct work_struct *work)
VM_BUG_ON(fullness != ZS_EMPTY);
class = pool->size_class[class_idx];
spin_lock(&pool->lock);
+#ifdef CONFIG_ZPOOL
+ list_del(&zspage->lru);
+#endif
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
}
@@ -2278,6 +2315,10 @@ struct zs_pool *zs_create_pool(const char *name)
*/
zs_register_shrinker(pool);
+#ifdef CONFIG_ZPOOL
+ INIT_LIST_HEAD(&pool->lru);
+#endif
+
return pool;
err: