[RFC,3/5] x86/ibs: Enable per-process IBS from sched switch path
Program IBS for memory access profiling from the task sched switch
path. IBS is programmed with the sample period of the incoming
thread; kernel threads are excluded since they have no user address
space to profile.

The sample period is currently kept at a fixed default of 10000.
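To illustrate the encoding done by hw_access_sched_in() below: with
the default period of 10000 (0x2710), (10000 >> 4) = 625 (0x271) is
programmed into IbsOpMaxCnt[15:0], so the hardware raises an IBS
interrupt every 625 * 16 = 10000 sampled ops. The low 4 bits of the
period are dropped, i.e. effective periods are multiples of 16.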
Signed-off-by: Bharata B Rao <bharata@amd.com>
---
arch/x86/mm/ibs.c | 32 ++++++++++++++++++++++++++++++++
include/linux/sched.h | 2 ++
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 1 +
kernel/sched/sched.h | 5 +++++
5 files changed, 41 insertions(+)
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -8,6 +8,7 @@
#include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
#include <asm/apic.h>
+#define IBS_SAMPLE_PERIOD 10000
static u64 ibs_config __read_mostly;
struct ibs_access_work {
@@ -15,6 +16,36 @@ struct ibs_access_work {
u64 laddr, paddr;
};
+void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+ u64 config = 0;
+ unsigned int period;
+
+ if (!static_branch_unlikely(&hw_access_hints))
+ return;
+
+ /* Disable IBS for kernel threads (no user mm to profile) */
+ if (!curr->mm)
+ goto out;
+
+ if (curr->numa_sample_period)
+ period = curr->numa_sample_period;
+ else
+ period = IBS_SAMPLE_PERIOD;
+
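+ /*
+  * Encode the period in IbsOpCtl format: period bits [19:4] go into
+  * IbsOpMaxCnt[15:0] and, where supported, bits [26:20] map directly
+  * into the extended max-count field.
+  */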
+ config = (period >> 4) & IBS_OP_MAX_CNT;
+ config |= (period & IBS_OP_MAX_CNT_EXT_MASK);
+ config |= ibs_config;
+out:
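+ /* A config of 0 clears IbsOpEn, stopping sampling for kernel threads */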
+ wrmsrl(MSR_AMD64_IBSOPCTL, config);
+}
+
void task_ibs_access_work(struct callback_head *work)
{
struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
@@ -198,6 +229,7 @@ int __init ibs_access_profiling_init(void)
x86_amd_ibs_access_profile_startup,
x86_amd_ibs_access_profile_teardown);
+ static_branch_enable(&hw_access_hints);
pr_info("IBS access profiling setup for NUMA Balancing\n");
return 0;
}
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1254,6 +1254,8 @@ struct task_struct {
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
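+ /* IBS sample period for this task; 0 selects the default period */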
+ unsigned int numa_sample_period;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
/* Migration stamp: */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5165,6 +5165,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = READ_ONCE(prev->__state);
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ hw_access_sched_in(prev, current);
finish_task(prev);
tick_nohz_task_switch();
finish_lock_switch(rq);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,6 +3094,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_sample_period = 0;
p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1723,11 +1723,16 @@ extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+extern void hw_access_sched_in(struct task_struct *prev, struct task_struct *curr);
#else
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
+static inline void hw_access_sched_in(struct task_struct *prev,
+ struct task_struct *curr)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP