@@ -1257,6 +1257,8 @@ struct task_struct {
unsigned int numa_sample_period;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
+ unsigned int numa_access_faults;
+ unsigned int numa_access_faults_window;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
@@ -334,6 +334,14 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
+ debugfs_create_u32("sample_period_def", 0644, numa,
+ &sysctl_numa_balancing_sample_period_def);
+ debugfs_create_u32("sample_period_min", 0644, numa,
+ &sysctl_numa_balancing_sample_period_min);
+ debugfs_create_u32("sample_period_max", 0644, numa,
+ &sysctl_numa_balancing_sample_period_max);
+ debugfs_create_u32("access_faults_threshold", 0644, numa,
+ &sysctl_numa_balancing_access_faults_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
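
Assuming the usual /sys/kernel/debug mount and that these entries land in the same numa_balancing directory that already holds the scan_*_ms and hot_threshold_ms knobs, the new tunables would be expected to show up as:

    /sys/kernel/debug/sched/numa_balancing/sample_period_def
    /sys/kernel/debug/sched/numa_balancing/sample_period_min
    /sys/kernel/debug/sched/numa_balancing/sample_period_max
    /sys/kernel/debug/sched/numa_balancing/access_faults_threshold
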
@@ -1093,6 +1093,11 @@ adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+unsigned int sysctl_numa_balancing_sample_period_def = 10000;
+unsigned int sysctl_numa_balancing_sample_period_min = 5000;
+unsigned int sysctl_numa_balancing_sample_period_max = 20000;
+unsigned int sysctl_numa_balancing_access_faults_threshold = 250;
+
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
* calculated based on the tasks virtual memory size and
@@ -1572,6 +1577,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ bool early = false;
/*
* The pages in slow memory node should be migrated according
@@ -1611,13 +1617,21 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
return false;
+ if (static_branch_unlikely(&hw_access_hints)) {
+ if (p->numa_access_faults < sysctl_numa_balancing_access_faults_threshold * 4)
+ early = true;
+ } else {
+ if (p->numa_scan_seq <= 4)
+ early = true;
+ }
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || early) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
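
With the default access_faults_threshold of 250 defined above, the hw_access_hints path treats a task as "early" until it has accumulated 250 * 4 = 1000 hardware access faults, mirroring the software-scanning case where the first four numa_scan_seq passes (two full rounds of the multi-stage node selection test) allow immediate migration.
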
@@ -2305,7 +2319,11 @@ static void numa_migrate_preferred(struct task_struct *p)
return;
/* Periodically retry migrating the task to the preferred node */
- interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ if (static_branch_unlikely(&hw_access_hints))
+ interval = min(interval, msecs_to_jiffies(p->numa_sample_period) / 16);
+ else
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+
p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
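
If numa_sample_period is in milliseconds, as the msecs_to_jiffies() conversion here suggests, the default value of 10000 caps the retry interval at msecs_to_jiffies(10000) / 16, i.e. roughly 625 ms, the same 1/16 scaling the existing path applies to numa_scan_period.
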
@@ -2430,6 +2448,77 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
+static void update_task_sample_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int lr_ratio, ps_ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no access faults then either the task is
+ * completely idle or all activity is in areas that are not of interest
+ * to automatic numa balancing. Related to that, if there were failed
+ * migrations then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, increase the sample period.
+ */
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
+ p->numa_sample_period = min(sysctl_numa_balancing_sample_period_max,
+ p->numa_sample_period << 1);
+ return;
+ }
+
+ /*
+ * Prepare to scale the sample period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD sample period stays the same
+ * < NUMA_PERIOD_THRESHOLD sample period decreases
+ * >= NUMA_PERIOD_THRESHOLD sample period increases
+ */
+ period_slot = DIV_ROUND_UP(p->numa_sample_period, NUMA_PERIOD_SLOTS);
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast access sampling, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast access sampling,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
+
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ /*
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * access sampling to get the memory moved over.
+ */
+ int ratio = max(lr_ratio, ps_ratio);
+
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+ }
+
+ p->numa_sample_period = clamp(p->numa_sample_period + diff,
+ sysctl_numa_balancing_sample_period_min,
+ sysctl_numa_balancing_sample_period_max);
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
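
A rough userspace sketch of the arithmetic in update_task_sample_period(), assuming the NUMA_PERIOD_SLOTS (10) and NUMA_PERIOD_THRESHOLD (7) values already used by update_task_scan_period() and the default period bounds from this series; the helper and the sample numbers are illustrative only, not part of the patch:

#include <stdio.h>

/* Assumed: the NUMA_PERIOD_* constants as defined in kernel/sched/fair.c. */
#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	7
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Default bounds from this series. */
static unsigned int sample_period_min = 5000;
static unsigned int sample_period_max = 20000;

/* Mirrors the diff/clamp arithmetic of update_task_sample_period(). */
static unsigned int next_sample_period(unsigned int period,
				       unsigned long local, unsigned long remote,
				       unsigned long private, unsigned long shared)
{
	unsigned int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
	int diff;

	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;

		if (!slot)
			slot = 1;
		diff = slot * period_slot;	/* back off, sample less often */
	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;

		if (!slot)
			slot = 1;
		diff = slot * period_slot;	/* back off, sample less often */
	} else {
		int ratio = lr_ratio > ps_ratio ? lr_ratio : ps_ratio;

		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; /* speed up */
	}

	period += diff;
	if (period < sample_period_min)
		period = sample_period_min;
	if (period > sample_period_max)
		period = sample_period_max;
	return period;
}

int main(void)
{
	/* Mostly private faults (ps_ratio = 8 >= 7): period grows 10000 -> 11000. */
	printf("%u\n", next_sample_period(10000, 900, 100, 800, 200));
	/* Mostly remote and shared (both ratios < 7): period shrinks 10000 -> 9000. */
	printf("%u\n", next_sample_period(10000, 300, 700, 600, 400));
	return 0;
}
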
/*
* Get the fraction of time the task has been running since the last
* NUMA placement cycle. The scheduler keeps similar statistics, but
@@ -2560,16 +2649,24 @@ static void task_numa_placement(struct task_struct *p)
spinlock_t *group_lock = NULL;
struct numa_group *ng;
- /*
- * The p->mm->numa_scan_seq field gets updated without
- * exclusive access. Use READ_ONCE() here to ensure
- * that the field is read in a single access:
- */
- seq = READ_ONCE(p->mm->numa_scan_seq);
- if (p->numa_scan_seq == seq)
- return;
- p->numa_scan_seq = seq;
- p->numa_scan_period_max = task_scan_max(p);
+ if (static_branch_unlikely(&hw_access_hints)) {
+ p->numa_access_faults_window++;
+ p->numa_access_faults++;
+ if (p->numa_access_faults_window < sysctl_numa_balancing_access_faults_threshold)
+ return;
+ p->numa_access_faults_window = 0;
+ } else {
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+ }
total_faults = p->numa_faults_locality[0] +
p->numa_faults_locality[1];
@@ -2672,7 +2769,10 @@ static void task_numa_placement(struct task_struct *p)
sched_setnuma(p, max_nid);
}
- update_task_scan_period(p, fault_types[0], fault_types[1]);
+ if (static_branch_unlikely(&hw_access_hints))
+ update_task_sample_period(p, fault_types[0], fault_types[1]);
+ else
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
@@ -3094,7 +3194,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_sample_period = 0;
+ p->numa_sample_period = sysctl_numa_balancing_sample_period_def;
+ p->numa_access_faults = 0;
+ p->numa_access_faults_window = 0;
p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
@@ -2473,6 +2473,10 @@ extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
+extern unsigned int sysctl_numa_balancing_sample_period_def;
+extern unsigned int sysctl_numa_balancing_sample_period_min;
+extern unsigned int sysctl_numa_balancing_sample_period_max;
+extern unsigned int sysctl_numa_balancing_access_faults_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK