[RFC,1/2] mm: multi-gen LRU: periodic aging

Message ID 20221214225123.2770216-2-yuanchu@google.com
State New
Headers
Series mm: multi-gen LRU: working set extensions |

Commit Message

Yuanchu Xie Dec. 14, 2022, 10:51 p.m. UTC
  Periodically age MGLRU-enabled lruvecs to turn MGLRU generations into
time-based working set information. This includes an interface to set
the periodic aging interval and a new kthread to perform aging.

memory.periodic_aging: a new root-level-only file in cgroupfs.
Writing to memory.periodic_aging sets the aging interval and opts into
periodic aging.
kold: a new kthread that ages memcgs based on the set aging interval.

Signed-off-by: Yuanchu Xie <yuanchu@google.com>
---
 include/linux/kold.h   |  44 ++++++++++++
 include/linux/mmzone.h |   4 +-
 mm/Makefile            |   3 +
 mm/kold.c              | 150 +++++++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c        |  52 ++++++++++++++
 mm/vmscan.c            |  35 +++++++++-
 6 files changed, 286 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/kold.h
 create mode 100644 mm/kold.c
  

Patch

diff --git a/include/linux/kold.h b/include/linux/kold.h
new file mode 100644
index 000000000000..10b0dbe09a5c
--- /dev/null
+++ b/include/linux/kold.h
@@ -0,0 +1,44 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Periodic aging for multi-gen LRU
+ *
+ * Copyright (C) 2022 Yuanchu Xie <yuanchu@google.com>
+ */
+#ifndef KOLD_H_
+#define KOLD_H_
+
+#include <linux/memcontrol.h>
+
+struct kold_stats {
+	/* Number of times kold was "late": an entire aging interval elapsed
+	 * while it was aging, without it sleeping in between. Checked once per
+	 * pass of the kold main loop, as each aging interval elapses.
+	 */
+	unsigned int late_count;
+};
+
+#ifdef CONFIG_MEMCG
+/*
+ * Set the periodic aging interval in seconds; 0 turns periodic aging off.
+ * Returns 0 on success or a negative errno.
+ */
+int kold_set_interval(unsigned int interval);
+/* Returns the aging interval, in seconds, that is currently set. */
+unsigned int kold_get_interval(void);
+/* Fills @stats with the current kold counters; returns 0 on success. */
+int kold_get_stats(struct kold_stats *stats);
+#else	/* !CONFIG_MEMCG */
+/*
+ * mm/kold.c is only built with CONFIG_MEMCG, so provide no-op stubs. They
+ * must be static inline: a plain function definition in a header is emitted
+ * by every file that includes it and the copies clash at link time.
+ */
+static inline int kold_set_interval(unsigned int interval)
+{
+	return 0;
+}
+
+static inline unsigned int kold_get_interval(void)
+{
+	return 0;
+}
+
+static inline int kold_get_stats(struct kold_stats *stats)
+{
+	return -1;
+}
+#endif	/* CONFIG_MEMCG */
+
+/* returns the creation timestamp of the youngest generation */
+unsigned long lru_gen_force_age_lruvec(struct mem_cgroup *memcg, int nid,
+				      unsigned long min_ttl);
+
+#endif	/* KOLD_H_ */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f74891556f3..929c777b826a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1218,7 +1218,9 @@  typedef struct pglist_data {
 
 #ifdef CONFIG_LRU_GEN
 	/* kswap mm walk data */
-	struct lru_gen_mm_walk	mm_walk;
+	struct lru_gen_mm_walk mm_walk;
+	/* kold periodic aging walk data */
+	struct lru_gen_mm_walk kold_mm_walk;
 #endif
 
 	CACHELINE_PADDING(_pad2_);
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..8bd554a6eb7d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -98,6 +98,9 @@  obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+ifdef CONFIG_LRU_GEN
+obj-$(CONFIG_MEMCG) += kold.o
+endif
 ifdef CONFIG_SWAP
 obj-$(CONFIG_MEMCG) += swap_cgroup.o
 endif
diff --git a/mm/kold.c b/mm/kold.c
new file mode 100644
index 000000000000..094574177968
--- /dev/null
+++ b/mm/kold.c
@@ -0,0 +1,150 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Yuanchu Xie <yuanchu@google.com>
+ */
+#include <linux/stddef.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <linux/mmzone.h>
+#include <linux/nodemask.h>
+#include <linux/sched/mm.h>
+#include <linux/swap.h>
+#include <linux/memcontrol.h>
+#include <linux/err.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/kold.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mm_inline.h>
+
+static struct task_struct *kold_thread __read_mostly;
+/* protects kold_thread */
+static DEFINE_MUTEX(kold_mutex);
+
+static unsigned int aging_interval __read_mostly;
+static unsigned int late_count;
+
+/*
+ * Try to move the current task to a CPU on node @nid (best effort).
+ *
+ * The node's CPU mask is passed directly instead of being copied and
+ * intersected with cpu_online_mask in a local struct cpumask, which can be
+ * too large for the kernel stack when NR_CPUS is big. If the request cannot
+ * be honoured, set_cpus_allowed_ptr() is expected to fail and leave the
+ * current affinity unchanged; its return value is deliberately ignored.
+ */
+static void try_move_current_to_node(int nid)
+{
+	const struct cpumask *node_cpus = cpumask_of_node(nid);
+
+	if (!cpumask_empty(node_cpus))
+		set_cpus_allowed_ptr(current, node_cpus);
+}
+
+/*
+ * Main loop of the kold kthread.
+ *
+ * Each pass walks every node that has memory and, on each node, every memcg,
+ * asking lru_gen_force_age_lruvec() to age lruvecs whose youngest generation
+ * is older than the aging interval. The thread then sleeps until the earliest
+ * time at which any visited lruvec becomes due again. Runs until
+ * kthread_stop() is called on it; always returns 0.
+ */
+static int kold_run(void *none)
+{
+	int nid;
+	unsigned int flags;
+	unsigned long last_interval_start_time = jiffies;
+	bool sleep_since_last_full_scan = false;
+	struct mem_cgroup *memcg;
+	struct reclaim_state reclaim_state = {};
+
+	while (!kthread_should_stop()) {
+		/* aging_interval is in seconds; everything below is in jiffies */
+		unsigned long interval =
+			(unsigned long)(READ_ONCE(aging_interval)) * HZ;
+		unsigned long next_wakeup_tick = jiffies + interval;
+		long timeout_ticks;
+
+		current->reclaim_state = &reclaim_state;
+		flags = memalloc_noreclaim_save();
+
+		for_each_node_state(nid, N_MEMORY) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+
+			try_move_current_to_node(nid);
+			/* kold has its own per-node walk data, separate from mm_walk */
+			reclaim_state.mm_walk = &pgdat->kold_mm_walk;
+
+			memcg = mem_cgroup_iter(NULL, NULL, NULL);
+			do {
+				unsigned long young_timestamp =
+					lru_gen_force_age_lruvec(memcg, nid,
+								 interval);
+
+				/* wake up for whichever lruvec is due first */
+				if (time_before(young_timestamp + interval,
+						next_wakeup_tick)) {
+					next_wakeup_tick = young_timestamp + interval;
+				}
+
+				/*
+				 * A pass can cover many memcgs, and a late
+				 * thread starts the next pass without
+				 * sleeping, so yield the CPU here.
+				 */
+				cond_resched();
+			} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+		}
+
+		memalloc_noreclaim_restore(flags);
+		current->reclaim_state = NULL;
+
+		/*
+		 * late_count stats update: once a whole interval has elapsed,
+		 * count it as late if the thread never slept during it.
+		 */
+		if (time_is_before_jiffies(last_interval_start_time + interval)) {
+			last_interval_start_time += interval;
+			if (!sleep_since_last_full_scan) {
+				WRITE_ONCE(late_count,
+					   READ_ONCE(late_count) + 1);
+			}
+			sleep_since_last_full_scan = false;
+		}
+
+		/*
+		 * sleep until next aging; the signed difference is the number
+		 * of ticks left, and is <= 0 when the wakeup is already due
+		 */
+		timeout_ticks = -(long)(jiffies - next_wakeup_tick);
+		if (timeout_ticks > 0 && timeout_ticks != MAX_SCHEDULE_TIMEOUT) {
+			sleep_since_last_full_scan = true;
+			schedule_timeout_idle(timeout_ticks);
+		}
+	}
+	return 0;
+}
+
+/* Copy the current kold counters into @stats; always returns 0. */
+int kold_get_stats(struct kold_stats *stats)
+{
+	unsigned int late = READ_ONCE(late_count);
+
+	stats->late_count = late;
+	return 0;
+}
+
+/* Return the aging interval, in seconds, as currently stored. */
+unsigned int kold_get_interval(void)
+{
+	unsigned int interval = READ_ONCE(aging_interval);
+
+	return interval;
+}
+
+/*
+ * Set the periodic aging interval, in seconds, starting or stopping the kold
+ * kthread as needed.
+ *
+ * A non-zero @interval creates the kthread if it is not already running; this
+ * fails with -EOPNOTSUPP when lru_gen_enabled() is false. A zero @interval
+ * stops a running kthread. If the kthread is already running, a non-zero
+ * @interval just replaces the stored value.
+ *
+ * Returns 0 on success or a negative errno. Serialized by kold_mutex.
+ */
+int kold_set_interval(unsigned int interval)
+{
+	int err = 0;
+
+	mutex_lock(&kold_mutex);
+	if (interval && !kold_thread) {
+		if (!lru_gen_enabled()) {
+			err = -EOPNOTSUPP;
+			goto cleanup;
+		}
+		kold_thread = kthread_create(kold_run, NULL, "kold");
+
+		if (IS_ERR(kold_thread)) {
+			pr_err("kold: kthread_create(kold_run) failed\n");
+			err = PTR_ERR(kold_thread);
+			kold_thread = NULL;
+			goto cleanup;
+		}
+		/*
+		 * Store the interval before the first wakeup so that
+		 * kold_run() never starts with an interval of 0; this is why
+		 * kthread_create() + wake_up_process() is used, not
+		 * kthread_run().
+		 */
+		WRITE_ONCE(aging_interval, interval);
+		wake_up_process(kold_thread);
+	} else {
+		if (!interval && kold_thread) {
+			/* stop the thread before the interval drops to 0 */
+			kthread_stop(kold_thread);
+			kold_thread = NULL;
+		}
+		WRITE_ONCE(aging_interval, interval);
+	}
+
+cleanup:
+	mutex_unlock(&kold_mutex);
+	return err;
+}
+
+static int __init kold_init(void)
+{
+	return 0;	/* no init-time setup; kold_set_interval() starts the thread */
+}
+
+module_init(kold_init);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..7d2fb3fc4580 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@ 
 #include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
+#include <linux/kold.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -6569,6 +6570,49 @@  static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+#ifdef CONFIG_LRU_GEN
+/*
+ * Show handler for memory.periodic_aging: prints the aging interval and the
+ * late count, one "name value" pair per line. Returns the error from
+ * kold_get_stats() without printing anything if it fails.
+ */
+static int memory_periodic_aging_show(struct seq_file *m, void *v)
+{
+	struct kold_stats stats;
+	unsigned int interval = kold_get_interval();
+	int err = kold_get_stats(&stats);
+
+	if (!err) {
+		seq_printf(m, "aging_interval %u\n", interval);
+		seq_printf(m, "late_count %u\n", stats.late_count);
+	}
+
+	return err;
+}
+
+/*
+ * Write handler for memory.periodic_aging: parses an unsigned integer from
+ * @buf and passes it to kold_set_interval().
+ *
+ * Returns @nbytes on success, -EOPNOTSUPP when lru_gen_enabled() is false,
+ * or the error from kstrtouint() / kold_set_interval().
+ */
+static ssize_t memory_periodic_aging_write(struct kernfs_open_file *of,
+					   char *buf, size_t nbytes,
+					   loff_t off)
+{
+	unsigned int new_interval;
+	int err;
+
+	if (!lru_gen_enabled())
+		return -EOPNOTSUPP;
+
+	/*
+	 * strstrip() never returns NULL, so no NULL check is needed; an empty
+	 * or malformed string is rejected by kstrtouint().
+	 */
+	err = kstrtouint(strstrip(buf), 0, &new_interval);
+	if (err)
+		return err;
+
+	err = kold_set_interval(new_interval);
+	if (err)
+		return err;
+
+	return nbytes;
+}
+#endif /* CONFIG_LRU_GEN */
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
@@ -6679,6 +6723,14 @@  static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+#ifdef CONFIG_LRU_GEN
+	{
+		.name = "periodic_aging",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = memory_periodic_aging_show,
+		.write = memory_periodic_aging_write,
+	},
+#endif
 	{ }	/* terminate */
 };
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..0fea21366fc8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@ 
 #include <linux/shmem_fs.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/kold.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -5279,8 +5280,10 @@  static void lru_gen_change_state(bool enabled)
 
 	if (enabled)
 		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
-	else
+	else {
 		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+		kold_set_interval(0);
+	}
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
@@ -5760,6 +5763,36 @@  static const struct file_operations lru_gen_ro_fops = {
 	.release = seq_release,
 };
 
+/******************************************************************************
+ *                      periodic aging (kold)
+ ******************************************************************************/
+
+/*
+ * Age the lruvec of @memcg on node @nid if its youngest generation was
+ * created more than @min_ttl jiffies ago.
+ *
+ * Returns the creation timestamp of the youngest generation, read again after
+ * the aging attempt.
+ */
+unsigned long lru_gen_force_age_lruvec(struct mem_cgroup *memcg, int nid,
+				      unsigned long min_ttl)
+{
+	struct lruvec *lruvec = get_lruvec(memcg, nid);
+	DEFINE_MAX_SEQ(lruvec);
+	unsigned long born =
+		READ_ONCE(lruvec->lrugen.timestamps[lru_gen_from_seq(max_seq)]);
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.reclaim_idx = MAX_NR_ZONES - 1,
+		.may_writepage = true,
+		.may_unmap = true,
+		.may_swap = true,
+	};
+	unsigned long newest_seq;
+
+	if (time_is_before_jiffies(born + min_ttl))
+		try_to_inc_max_seq(lruvec, max_seq, &sc, true, true);
+
+	/* max_seq may have advanced, so look it up again */
+	newest_seq = READ_ONCE(lruvec->lrugen.max_seq);
+
+	return READ_ONCE(lruvec->lrugen.timestamps[lru_gen_from_seq(newest_seq)]);
+}
+
 /******************************************************************************
  *                          initialization
  ******************************************************************************/