@@ -153,6 +153,13 @@ attribute, e. g.::
Setting this parameter to 100 will disable the hysteresis.
+When there is a sizable amount of cold memory residing in the zswap pool, it
+can be advantageous to proactively write these cold pages to swap and reclaim
+the memory for other use cases. By default, the zswap shrinker is disabled.
+Users can enable it as follows::
+
+ echo Y > /sys/module/zswap/parameters/shrinker_enabled
+
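+Since zswap cannot be built as a module, the same parameter should also be
+settable at boot time on the kernel command line, e.g.::
+
+ zswap.shrinker_enabled=Y
+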
A debugfs interface is provided for various statistic about pool size, number
of pages stored, same-value filled pages and various counters for the reasons
pages are rejected.
@@ -22,6 +22,7 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
+#include <linux/zswap.h>
#include <asm/page.h>
/* Free memory management - zoned buddy allocator. */
@@ -637,6 +638,7 @@ struct lruvec {
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
+ struct zswap_lruvec_state zswap_lruvec_state;
};
/* Isolate for asynchronous migration */
@@ -5,20 +5,40 @@
#include <linux/types.h>
#include <linux/mm_types.h>
+struct lruvec;
+
extern u64 zswap_pool_total_size;
extern atomic_t zswap_stored_pages;
#ifdef CONFIG_ZSWAP
+struct zswap_lruvec_state {
+ /*
+ * Number of pages in zswap that should be protected from the shrinker.
+ * This number is an estimate of the following counts:
+ *
+ * a) Recent page faults.
+ * b) Recent insertion to the zswap LRU. This includes new zswap stores,
+ * as well as recent zswap LRU rotations.
+ *
+ * These pages are likely to be warm, and might incur IO if they are written
+ * to swap.
+ */
+ atomic_long_t nr_zswap_protected;
+};
+
bool zswap_store(struct folio *folio);
bool zswap_load(struct folio *folio);
void zswap_invalidate(int type, pgoff_t offset);
void zswap_swapon(int type);
void zswap_swapoff(int type);
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
-
+void zswap_lruvec_state_init(struct lruvec *lruvec);
+void zswap_lruvec_swapin(struct page *page);
#else
+struct zswap_lruvec_state {};
+
static inline bool zswap_store(struct folio *folio)
{
return false;
@@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {}
static inline void zswap_swapon(int type) {}
static inline void zswap_swapoff(int type) {}
static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
-
+static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {}
+static inline void zswap_lruvec_swapin(struct page *page) {}
#endif
#endif /* _LINUX_ZSWAP_H */
@@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec)
memset(lruvec, 0, sizeof(struct lruvec));
spin_lock_init(&lruvec->lru_lock);
+ zswap_lruvec_state_init(lruvec);
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
@@ -687,6 +687,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
&page_allocated, false);
if (unlikely(page_allocated))
swap_readpage(page, false, NULL);
+ zswap_lruvec_swapin(page);
return page;
}
@@ -862,6 +863,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
&page_allocated, false);
if (unlikely(page_allocated))
swap_readpage(page, false, NULL);
+ zswap_lruvec_swapin(page);
return page;
}
@@ -146,6 +146,10 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644);
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
+/* Enable/disable memory pressure-based shrinker. */
+static bool zswap_shrinker_enabled;
+module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -176,6 +180,8 @@ struct zswap_pool {
struct list_lru list_lru;
spinlock_t next_shrink_lock;
struct mem_cgroup *next_shrink;
+ struct shrinker *shrinker;
+ atomic_t nr_stored;
};
/*
@@ -274,17 +280,26 @@ static bool zswap_can_accept(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
+static u64 get_zswap_pool_size(struct zswap_pool *pool)
+{
+ u64 pool_size = 0;
+ int i;
+
+ for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+ pool_size += zpool_get_total_size(pool->zpools[i]);
+
+ return pool_size;
+}
+
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
u64 total = 0;
- int i;
rcu_read_lock();
list_for_each_entry_rcu(pool, &zswap_pools, list)
- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- total += zpool_get_total_size(pool->zpools[i]);
+ total += get_zswap_pool_size(pool);
rcu_read_unlock();
@@ -339,13 +354,34 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
+/*********************************
+* zswap lruvec functions
+**********************************/
+void zswap_lruvec_state_init(struct lruvec *lruvec)
+{
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+}
+
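+/*
+ * Count a swapin as a recent page fault: widen the protected portion of the
+ * zswap LRU accordingly (see the comment on nr_zswap_protected).
+ */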
+void zswap_lruvec_swapin(struct page *page)
+{
+ struct lruvec *lruvec;
+
+ if (page) {
+ lruvec = folio_lruvec(page_folio(page));
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ }
+}
+
/*********************************
* lru functions
**********************************/
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
+ atomic_long_t *nr_zswap_protected;
+ unsigned long lru_size, old, new;
int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -363,6 +399,19 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
memcg = get_mem_cgroup_from_entry(entry);
/* will always succeed */
list_lru_add(list_lru, &entry->lru, nid, memcg);
+
+ /* Update the protection area */
+ lru_size = list_lru_count_one(list_lru, nid, memcg);
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
+ old = atomic_long_inc_return(nr_zswap_protected);
+ /*
+ * Decay to avoid overflow and adapt to changing workloads.
+ * This is based on LRU reclaim cost decaying heuristics.
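+ * Concretely: once the protected count exceeds a quarter of the LRU
+ * size, it is halved.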
+ */
+ do {
+ new = old > lru_size / 4 ? old / 2 : old;
+ } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
rcu_read_unlock();
}
@@ -384,6 +433,7 @@ static void zswap_lru_putback(struct list_lru *list_lru,
int nid = entry_to_nid(entry);
spinlock_t *lock = &list_lru->node[nid].lock;
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
rcu_read_lock();
memcg = get_mem_cgroup_from_entry(entry);
@@ -391,6 +441,10 @@ static void zswap_lru_putback(struct list_lru *list_lru,
/* we cannot use list_lru_add here, because it increments node's lru count */
list_lru_putback(list_lru, &entry->lru, nid, memcg);
spin_unlock(lock);
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry)));
+ /* increment the protection area to account for the LRU rotation. */
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
rcu_read_unlock();
}
@@ -480,6 +534,7 @@ static void zswap_free_entry(struct zswap_entry *entry)
else {
zswap_lru_del(&entry->pool->list_lru, entry);
zpool_free(zswap_find_zpool(entry), entry->handle);
+ atomic_dec(&entry->pool->nr_stored);
zswap_pool_put(entry->pool);
}
zswap_entry_cache_free(entry);
@@ -521,6 +576,95 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
return entry;
}
+/*********************************
+* shrinker functions
+**********************************/
+static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
+ spinlock_t *lock, void *arg);
+
+static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
+ unsigned long shrink_ret, nr_protected, lru_size;
+ struct zswap_pool *pool = shrinker->private_data;
+ bool encountered_page_in_swapcache = false;
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ lru_size = list_lru_shrink_count(&pool->list_lru, sc);
+
+ /*
+ * Abort if the shrinker is disabled or if we are shrinking into the
+ * protected region.
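+ * (i.e. scanning another sc->nr_to_scan objects would dip into the
+ * nr_zswap_protected estimate).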
+ */
+ if (!zswap_shrinker_enabled || nr_protected >= lru_size - sc->nr_to_scan) {
+ sc->nr_scanned = 0;
+ return SHRINK_STOP;
+ }
+
+ shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+ &encountered_page_in_swapcache);
+
+ if (encountered_page_in_swapcache)
+ return SHRINK_STOP;
+
+ return shrink_ret ? shrink_ret : SHRINK_STOP;
+}
+
+static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct zswap_pool *pool = shrinker->private_data;
+ struct mem_cgroup *memcg = sc->memcg;
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+
+#ifdef CONFIG_MEMCG_KMEM
+ cgroup_rstat_flush(memcg->css.cgroup);
+ nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
+ nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
+#else
+ /* use pool stats instead of memcg stats */
+ nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
+ nr_stored = atomic_read(&pool->nr_stored);
+#endif
+
+ if (!zswap_shrinker_enabled || !nr_stored)
+ return 0;
+
+ nr_protected =
+ atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+ /*
+ * Subtract from the LRU size an estimate of the number of pages
+ * that should be protected.
+ */
+ nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+
+ /*
+ * Scale the number of freeable pages by the memory saving factor.
+ * This ensures that the better zswap compresses memory, the fewer
+ * pages we will evict to swap (as it will otherwise incur IO for
+ * relatively small memory saving).
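+ * For example, at a 3:1 compression ratio nr_backing is roughly a third
+ * of nr_stored, so only about a third of the freeable pages are reported.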
+ */
+ return mult_frac(nr_freeable, nr_backing, nr_stored);
+}
+
+static void zswap_alloc_shrinker(struct zswap_pool *pool)
+{
+ pool->shrinker =
+ shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
+ if (!pool->shrinker)
+ return;
+
+ pool->shrinker->private_data = pool;
+ pool->shrinker->scan_objects = zswap_shrinker_scan;
+ pool->shrinker->count_objects = zswap_shrinker_count;
+ pool->shrinker->batch = 0;
+ pool->shrinker->seeks = DEFAULT_SEEKS;
+}
+
/*********************************
* per-cpu code
**********************************/
@@ -716,6 +860,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
spinlock_t *lock, void *arg)
{
struct zswap_entry *entry = container_of(item, struct zswap_entry, lru);
+ bool *encountered_page_in_swapcache = (bool *)arg;
struct zswap_tree *tree;
pgoff_t swpoffset;
enum lru_status ret = LRU_REMOVED_RETRY;
@@ -751,6 +896,17 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
zswap_reject_reclaim_fail++;
zswap_lru_putback(&entry->pool->list_lru, entry);
ret = LRU_RETRY;
+
+ /*
+ * Encountering a page already in swap cache is a sign that we are shrinking
+ * into the warmer region. We should terminate shrinking (if we're in the dynamic
+ * shrinker context).
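+ * (The writeback attempt returns -EEXIST when the backing page is already
+ * in the swap cache, i.e. it is concurrently being faulted back in.)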
+ */
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
+ ret = LRU_SKIP;
+ *encountered_page_in_swapcache = true;
+ }
+
goto put_unlock;
}
zswap_written_back_pages++;
@@ -890,6 +1046,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
&pool->node);
if (ret)
goto error;
+
+ zswap_alloc_shrinker(pool);
+ if (!pool->shrinker)
+ goto error;
+
pr_debug("using %s compressor\n", pool->tfm_name);
/* being the current pool takes 1 ref; this func expects the
@@ -897,14 +1058,20 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
- list_lru_init_memcg(&pool->list_lru, NULL);
+ if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
+ goto lru_fail;
+ shrinker_register(pool->shrinker);
INIT_WORK(&pool->shrink_work, shrink_worker);
+ atomic_set(&pool->nr_stored, 0);
zswap_pool_debug("created", pool);
spin_lock_init(&pool->next_shrink_lock);
return pool;
+lru_fail:
+ list_lru_destroy(&pool->list_lru);
+ shrinker_free(pool->shrinker);
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -962,6 +1129,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
+ shrinker_free(pool->shrinker);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
free_percpu(pool->acomp_ctx);
list_lru_destroy(&pool->list_lru);
@@ -1511,6 +1679,7 @@ bool zswap_store(struct folio *folio)
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&entry->pool->list_lru, entry);
+ atomic_inc(&entry->pool->nr_stored);
}
spin_unlock(&tree->lock);