[v4,4/4] sched/psi: allow unprivileged polling of N*2s period
Commit Message
PSI offers 2 mechanisms to get information about a specific resource
pressure. One is reading from /proc/pressure/<resource>, which gives
average pressures aggregated every 2s. The other is creating a pollable
fd for a specific resource and cgroup.
The trigger creation requires CAP_SYS_RESOURCE, and gives the
possibility to pick a specific time window and threshold, spawning an RT
thread to aggregate the data.
Systemd would like to provide containers the option to monitor pressure
on their own cgroup and sub-cgroups. For example, if systemd launches a
container that itself then launches services, the container should have
the ability to poll() for pressure in individual services. But neither
the container nor the services are privileged.
This patch implements a mechanism to allow unprivileged users to create
pressure triggers. The difference from privileged trigger creation is
that unprivileged triggers must have a time window that's a multiple of 2s.
This is so that we can avoid unrestricted spawning of rt threads, and
use instead the same aggregation mechanism done for the averages, which
runs independently of any triggers.
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
---
Documentation/accounting/psi.rst | 4 ++
include/linux/psi.h | 2 +-
include/linux/psi_types.h | 7 ++
kernel/cgroup/cgroup.c | 2 +-
kernel/sched/psi.c | 120 +++++++++++++++++++------------
5 files changed, 87 insertions(+), 48 deletions(-)
Comments
On Wed, Mar 29, 2023 at 05:33:27PM +0200, Domenico Cerasuolo wrote:
> PSI offers 2 mechanisms to get information about a specific resource
> pressure. One is reading from /proc/pressure/<resource>, which gives
> average pressures aggregated every 2s. The other is creating a pollable
> fd for a specific resource and cgroup.
>
> The trigger creation requires CAP_SYS_RESOURCE, and gives the
> possibility to pick specific time window and threshold, spawing an RT
> thread to aggregate the data.
>
> Systemd would like to provide containers the option to monitor pressure
> on their own cgroup and sub-cgroups. For example, if systemd launches a
> container that itself then launches services, the container should have
> the ability to poll() for pressure in individual services. But neither
> the container nor the services are privileged.
>
> This patch implements a mechanism to allow unprivileged users to create
> pressure triggers. The difference with privileged triggers creation is
> that unprivileged ones must have a time window that's a multiple of 2s.
> This is so that we can avoid unrestricted spawning of rt threads, and
> use instead the same aggregation mechanism done for the averages, which
> runs independently of any triggers.
>
> Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Overall it looks good to me. Thanks for adding the comment on the
privilege check, it's much easier to understand now.
A few nitpicks below:
> @@ -151,6 +151,9 @@ struct psi_trigger {
>
> /* Deferred event(s) from previous ratelimit window */
> bool pending_event;
> +
> + /* Used to differentiate destruction action*/
> + enum psi_aggregators aggregator;
> };
The comment is a bit mysterious. How about
/* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
> @@ -186,9 +186,9 @@ static void group_init(struct psi_group *group)
> seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> group->avg_last_update = sched_clock();
> group->avg_next_update = group->avg_last_update + psi_period;
> - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
> mutex_init(&group->avgs_lock);
> - /* Init trigger-related members */
> +
> + /* Init rtpoll trigger-related members */
> atomic_set(&group->rtpoll_scheduled, 0);
> mutex_init(&group->rtpoll_trigger_lock);
> INIT_LIST_HEAD(&group->rtpoll_triggers);
> @@ -197,6 +197,11 @@ static void group_init(struct psi_group *group)
> init_waitqueue_head(&group->rtpoll_wait);
> timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
> rcu_assign_pointer(group->rtpoll_task, NULL);
> +
> + /* Init avg trigger-related members */
> + INIT_LIST_HEAD(&group->avg_triggers);
> + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
> + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
> }
Can you move those above the rtpoll inits?
It helps with navigating the code and spotting missing inits when the
init sequence follows the order of the struct members.
> @@ -430,21 +435,25 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
> return growth;
> }
>
> -static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total)
> +static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
> + enum psi_aggregators aggregator)
> {
> struct psi_trigger *t;
> - u64 *total = group->total[PSI_POLL];
> + u64 *total = group->total[aggregator];
> + struct list_head *triggers = aggregator == PSI_AVGS ? &group->avg_triggers
> + : &group->rtpoll_triggers;
> + u64 *aggregator_total = aggregator == PSI_AVGS ? group->avg_total : group->rtpoll_total;
> *update_total = false;
These lines are a bit too long. When the init part gets that long,
it's preferable to move it outside of the decl block:
if (aggregator == PSI_AVGS) {
triggers = &group->avg_triggers;
aggregator_total = group->avg_total;
} else {
triggers = &group->rtpoll_triggers;
aggregator_total = group->rtpoll_total;
}
> @@ -1357,22 +1389,26 @@ void psi_trigger_destroy(struct psi_trigger *t)
> u64 period = ULLONG_MAX;
>
> list_del(&t->node);
> - group->rtpoll_nr_triggers[t->state]--;
> - if (!group->rtpoll_nr_triggers[t->state])
> - group->rtpoll_states &= ~(1 << t->state);
> - /* reset min update period for the remaining triggers */
> - list_for_each_entry(tmp, &group->rtpoll_triggers, node)
> - period = min(period, div_u64(tmp->win.size,
> - UPDATES_PER_WINDOW));
> - group->rtpoll_min_period = period;
> - /* Destroy rtpoll_task when the last trigger is destroyed */
> - if (group->rtpoll_states == 0) {
> - group->rtpoll_until = 0;
> - task_to_destroy = rcu_dereference_protected(
> - group->rtpoll_task,
> - lockdep_is_held(&group->rtpoll_trigger_lock));
> - rcu_assign_pointer(group->rtpoll_task, NULL);
> - del_timer(&group->rtpoll_timer);
> + if (t->aggregator == PSI_AVGS) {
> + group->avg_nr_triggers[t->state]--;
> + } else {
> + group->rtpoll_nr_triggers[t->state]--;
> + if (!group->rtpoll_nr_triggers[t->state])
> + group->rtpoll_states &= ~(1 << t->state);
> + /* reset min update period for the remaining triggers */
> + list_for_each_entry(tmp, &group->rtpoll_triggers, node)
> + period = min(period, div_u64(tmp->win.size,
> + UPDATES_PER_WINDOW));
> + group->rtpoll_min_period = period;
> + /* Destroy rtpoll_task when the last trigger is destroyed */
> + if (group->rtpoll_states == 0) {
> + group->rtpoll_until = 0;
> + task_to_destroy = rcu_dereference_protected(
> + group->rtpoll_task,
> + lockdep_is_held(&group->rtpoll_trigger_lock));
> + rcu_assign_pointer(group->rtpoll_task, NULL);
> + del_timer(&group->rtpoll_timer);
These lines are quite long too, we usually shoot for a line length of
80 characters. Can you do
if (t->aggregator == PSI_AVGS) {
group->avg_nr_triggers[t->state]--;
return;
}
/* Else, it's an rtpoll trigger */
group->rtpoll_nr_triggers[t->state]--;
...
With that, please add
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
On Wed, Mar 29, 2023 at 08:32:12PM +0200, Domenico Cerasuolo wrote:
> On Wed, Mar 29, 2023 at 6:13 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> > On Wed, Mar 29, 2023 at 05:33:27PM +0200, Domenico Cerasuolo wrote:
> > > PSI offers 2 mechanisms to get information about a specific resource
> > > pressure. One is reading from /proc/pressure/<resource>, which gives
> > > average pressures aggregated every 2s. The other is creating a pollable
> > > fd for a specific resource and cgroup.
> > >
> > > The trigger creation requires CAP_SYS_RESOURCE, and gives the
> > > possibility to pick specific time window and threshold, spawing an RT
> > > thread to aggregate the data.
> > >
> > > Systemd would like to provide containers the option to monitor pressure
> > > on their own cgroup and sub-cgroups. For example, if systemd launches a
> > > container that itself then launches services, the container should have
> > > the ability to poll() for pressure in individual services. But neither
> > > the container nor the services are privileged.
> > >
> > > This patch implements a mechanism to allow unprivileged users to create
> > > pressure triggers. The difference with privileged triggers creation is
> > > that unprivileged ones must have a time window that's a multiple of 2s.
> > > This is so that we can avoid unrestricted spawning of rt threads, and
> > > use instead the same aggregation mechanism done for the averages, which
> > > runs independently of any triggers.
> > >
> > > Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
> > > Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
> >
> > Overall it looks good to me. Thanks for adding the comment on the
> > privilege check, it's much easier to understand now.
> >
> > A few nitpicks below:
> >
> > > @@ -151,6 +151,9 @@ struct psi_trigger {
> > >
> > > /* Deferred event(s) from previous ratelimit window */
> > > bool pending_event;
> > > +
> > > + /* Used to differentiate destruction action*/
> > > + enum psi_aggregators aggregator;
> > > };
> >
> > The comment is a bit mysterious. How about
> >
> > /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
> >
> > > @@ -186,9 +186,9 @@ static void group_init(struct psi_group *group)
> > > seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> > > group->avg_last_update = sched_clock();
> > > group->avg_next_update = group->avg_last_update + psi_period;
> > > - INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
> > > mutex_init(&group->avgs_lock);
> > > - /* Init trigger-related members */
> > > +
> > > + /* Init rtpoll trigger-related members */
> > > atomic_set(&group->rtpoll_scheduled, 0);
> > > mutex_init(&group->rtpoll_trigger_lock);
> > > INIT_LIST_HEAD(&group->rtpoll_triggers);
> > > @@ -197,6 +197,11 @@ static void group_init(struct psi_group *group)
> > > init_waitqueue_head(&group->rtpoll_wait);
> > > timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
> > > rcu_assign_pointer(group->rtpoll_task, NULL);
> > > +
> > > + /* Init avg trigger-related members */
> > > + INIT_LIST_HEAD(&group->avg_triggers);
> > > + memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
> > > + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
> > > }
> >
> > Can you move those above the rtpoll inits?
> >
> > It helps with navigating the code and spotting missing inits when the
> > init sequence follows the order of the struct members.
> >
> > > @@ -430,21 +435,25 @@ static u64 window_update(struct psi_window *win,
> > u64 now, u64 value)
> > > return growth;
> > > }
> > >
> > > -static u64 update_triggers(struct psi_group *group, u64 now, bool
> > *update_total)
> > > +static u64 update_triggers(struct psi_group *group, u64 now, bool
> > *update_total,
> > > + enum psi_aggregators
> > aggregator)
> > > {
> > > struct psi_trigger *t;
> > > - u64 *total = group->total[PSI_POLL];
> > > + u64 *total = group->total[aggregator];
> > > + struct list_head *triggers = aggregator == PSI_AVGS ?
> > &group->avg_triggers
> > > + : &group->rtpoll_triggers;
> > > + u64 *aggregator_total = aggregator == PSI_AVGS ? group->avg_total
> > : group->rtpoll_total;
> > > *update_total = false;
> >
> > These lines are a bit too long. When the init part gets that long,
> > it's preferable to move it outside of the decl block:
> >
> > if (aggregator == PSI_AVGS) {
> > triggers = &group->avg_triggers;
> > aggregator_total = group->avg_total;
> > } else {
> > triggers = &group->rtpoll_triggers;
> > aggregator_total = group->rtpoll_total;
> > }
> >
> > > @@ -1357,22 +1389,26 @@ void psi_trigger_destroy(struct psi_trigger *t)
> > > u64 period = ULLONG_MAX;
> > >
> > > list_del(&t->node);
> > > - group->rtpoll_nr_triggers[t->state]--;
> > > - if (!group->rtpoll_nr_triggers[t->state])
> > > - group->rtpoll_states &= ~(1 << t->state);
> > > - /* reset min update period for the remaining triggers */
> > > - list_for_each_entry(tmp, &group->rtpoll_triggers, node)
> > > - period = min(period, div_u64(tmp->win.size,
> > > - UPDATES_PER_WINDOW));
> > > - group->rtpoll_min_period = period;
> > > - /* Destroy rtpoll_task when the last trigger is destroyed
> > */
> > > - if (group->rtpoll_states == 0) {
> > > - group->rtpoll_until = 0;
> > > - task_to_destroy = rcu_dereference_protected(
> > > - group->rtpoll_task,
> > > -
> > lockdep_is_held(&group->rtpoll_trigger_lock));
> > > - rcu_assign_pointer(group->rtpoll_task, NULL);
> > > - del_timer(&group->rtpoll_timer);
> > > + if (t->aggregator == PSI_AVGS) {
> > > + group->avg_nr_triggers[t->state]--;
> > > + } else {
> > > + group->rtpoll_nr_triggers[t->state]--;
> > > + if (!group->rtpoll_nr_triggers[t->state])
> > > + group->rtpoll_states &= ~(1 << t->state);
> > > + /* reset min update period for the remaining
> > triggers */
> > > + list_for_each_entry(tmp, &group->rtpoll_triggers,
> > node)
> > > + period = min(period, div_u64(tmp->win.size,
> > > + UPDATES_PER_WINDOW));
> > > + group->rtpoll_min_period = period;
> > > + /* Destroy rtpoll_task when the last trigger is
> > destroyed */
> > > + if (group->rtpoll_states == 0) {
> > > + group->rtpoll_until = 0;
> > > + task_to_destroy =
> > rcu_dereference_protected(
> > > + group->rtpoll_task,
> > > +
> > lockdep_is_held(&group->rtpoll_trigger_lock));
> > > + rcu_assign_pointer(group->rtpoll_task,
> > NULL);
> > > + del_timer(&group->rtpoll_timer);
> >
> > These lines are quite long too, we usually shoot for a line length of
> > 80 characters. Can you do
> >
> > if (t->aggregator == PSI_AVGS) {
> > group->avg_nr_triggers[t->state]--;
> > return;
> > }
> >
> > /* Else, it's an rtpoll trigger */
> > group->rtpoll_nr_triggers[t->state]--;
> > ...
> >
> Can't return there I think, the function doesn't end after the else branch,
> should I put a `goto out` instead to jump the rtpoll code?
You're right, I missed the bottom part beyond the diff.
Looking closer, I think trigger_create and trigger_destroy are
actually buggy. They have to protect against update_trigger(), so both
creation and destruction must take the right lock - avgs_lock or
rtpoll_trigger_lock - before modifying the list. They're both taking
only the rtpoll_trigger_lock right now.
IOW the trigger type distinction needs to be higher up in general.
if (t->aggregator == PSI_AVGS) {
mutex_lock(&group->avgs_lock);
...
} else {
mutex_lock(&group->rtpoll_trigger_lock);
...
}
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
after which monitors are most likely not needed and psi averages can be used
instead.
+Unprivileged users can also create monitors, with the only limitation that the
+window size must be a multiple of 2s, in order to prevent excessive resource
+usage.
+
When activated, psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when system is
bouncing in and out of the stall state.
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res);
+ char *buf, enum psi_res res, struct file *file);
void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
@@ -151,6 +151,9 @@ struct psi_trigger {
/* Deferred event(s) from previous ratelimit window */
bool pending_event;
+
+ /* Used to differentiate destruction action*/
+ enum psi_aggregators aggregator;
};
struct psi_group {
@@ -171,6 +174,10 @@ struct psi_group {
/* Aggregator work control */
struct delayed_work avgs_work;
+ /* Unprivileged triggers against N*PSI_FREQ windows */
+ struct list_head avg_triggers;
+ u32 avg_nr_triggers[NR_PSI_STATES - 1];
+
/* Total stall times and sampled pressure averages */
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];
@@ -3761,7 +3761,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
- new = psi_trigger_create(psi, buf, res);
+ new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -186,9 +186,9 @@ static void group_init(struct psi_group *group)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
- INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
- /* Init trigger-related members */
+
+ /* Init rtpoll trigger-related members */
atomic_set(&group->rtpoll_scheduled, 0);
mutex_init(&group->rtpoll_trigger_lock);
INIT_LIST_HEAD(&group->rtpoll_triggers);
@@ -197,6 +197,11 @@ static void group_init(struct psi_group *group)
init_waitqueue_head(&group->rtpoll_wait);
timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
rcu_assign_pointer(group->rtpoll_task, NULL);
+
+ /* Init avg trigger-related members */
+ INIT_LIST_HEAD(&group->avg_triggers);
+ memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
}
void __init psi_init(void)
@@ -430,21 +435,25 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
return growth;
}
-static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total)
+static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+ enum psi_aggregators aggregator)
{
struct psi_trigger *t;
- u64 *total = group->total[PSI_POLL];
+ u64 *total = group->total[aggregator];
+ struct list_head *triggers = aggregator == PSI_AVGS ? &group->avg_triggers
+ : &group->rtpoll_triggers;
+ u64 *aggregator_total = aggregator == PSI_AVGS ? group->avg_total : group->rtpoll_total;
*update_total = false;
/*
* On subsequent updates, calculate growth deltas and let
* watchers know when their specified thresholds are exceeded.
*/
- list_for_each_entry(t, &group->rtpoll_triggers, node) {
+ list_for_each_entry(t, triggers, node) {
u64 growth;
bool new_stall;
- new_stall = group->rtpoll_total[t->state] != total[t->state];
+ new_stall = aggregator_total[t->state] != total[t->state];
/* Check for stall activity or a previous threshold breach */
if (!new_stall && !t->pending_event)
@@ -546,6 +555,7 @@ static void psi_avgs_work(struct work_struct *work)
struct delayed_work *dwork;
struct psi_group *group;
u32 changed_states;
+ bool update_total;
u64 now;
dwork = to_delayed_work(work);
@@ -563,8 +573,10 @@ static void psi_avgs_work(struct work_struct *work)
* Once restarted, we'll catch up the running averages in one
* go - see calc_avgs() and missed_periods.
*/
- if (now >= group->avg_next_update)
+ if (now >= group->avg_next_update) {
+ update_triggers(group, now, &update_total, PSI_AVGS);
group->avg_next_update = update_averages(group, now);
+ }
if (changed_states & PSI_STATE_RESCHEDULE) {
schedule_delayed_work(dwork, nsecs_to_jiffies(
@@ -574,7 +586,7 @@ static void psi_avgs_work(struct work_struct *work)
mutex_unlock(&group->avgs_lock);
}
-static void init_triggers(struct psi_group *group, u64 now)
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
@@ -667,7 +679,7 @@ static void psi_rtpoll_work(struct psi_group *group)
if (changed_states & group->rtpoll_states) {
/* Initialize trigger windows when entering polling mode */
if (now > group->rtpoll_until)
- init_triggers(group, now);
+ init_rtpoll_triggers(group, now);
/*
* Keep the monitor active for at least the duration of the
@@ -684,7 +696,7 @@ static void psi_rtpoll_work(struct psi_group *group)
}
if (now >= group->rtpoll_next_update) {
- group->rtpoll_next_update = update_triggers(group, now, &update_total);
+ group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
if (update_total)
memcpy(group->rtpoll_total, group->total[PSI_POLL],
sizeof(group->rtpoll_total));
@@ -1254,16 +1266,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res)
+ char *buf, enum psi_res res, struct file *file)
{
struct psi_trigger *t;
enum psi_states state;
u32 threshold_us;
+ bool privileged;
u32 window_us;
if (static_branch_likely(&psi_disabled))
return ERR_PTR(-EOPNOTSUPP);
+ /*
+ * Checking the privilege here on file->f_cred implies that a privileged user
+ * could open the file and delegate the write to an unprivileged one.
+ */
+ privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
state = PSI_IO_SOME + res * 2;
else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1283,6 +1302,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
+ /*
+ * Unprivileged users can only use 2s windows so that averages aggregation
+ * work is used, and no RT threads need to be spawned.
+ */
+ if (!privileged && window_us % 2000000)
+ return ERR_PTR(-EINVAL);
+
/* Check threshold */
if (threshold_us == 0 || threshold_us > window_us)
return ERR_PTR(-EINVAL);
@@ -1302,10 +1328,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
t->pending_event = false;
+ t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
mutex_lock(&group->rtpoll_trigger_lock);
- if (!rcu_access_pointer(group->rtpoll_task)) {
+ if (privileged && !rcu_access_pointer(group->rtpoll_task)) {
struct task_struct *task;
task = kthread_create(psi_rtpoll_worker, group, "psimon");
@@ -1319,11 +1346,16 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
rcu_assign_pointer(group->rtpoll_task, task);
}
- list_add(&t->node, &group->rtpoll_triggers);
- group->rtpoll_min_period = min(group->rtpoll_min_period,
- div_u64(t->win.size, UPDATES_PER_WINDOW));
- group->rtpoll_nr_triggers[t->state]++;
- group->rtpoll_states |= (1 << t->state);
+ if (privileged) {
+ list_add(&t->node, &group->rtpoll_triggers);
+ group->rtpoll_min_period = min(group->rtpoll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->rtpoll_nr_triggers[t->state]++;
+ group->rtpoll_states |= (1 << t->state);
+ } else {
+ list_add(&t->node, &group->avg_triggers);
+ group->avg_nr_triggers[t->state]++;
+ }
mutex_unlock(&group->rtpoll_trigger_lock);
@@ -1357,22 +1389,26 @@ void psi_trigger_destroy(struct psi_trigger *t)
u64 period = ULLONG_MAX;
list_del(&t->node);
- group->rtpoll_nr_triggers[t->state]--;
- if (!group->rtpoll_nr_triggers[t->state])
- group->rtpoll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->rtpoll_triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->rtpoll_min_period = period;
- /* Destroy rtpoll_task when the last trigger is destroyed */
- if (group->rtpoll_states == 0) {
- group->rtpoll_until = 0;
- task_to_destroy = rcu_dereference_protected(
- group->rtpoll_task,
- lockdep_is_held(&group->rtpoll_trigger_lock));
- rcu_assign_pointer(group->rtpoll_task, NULL);
- del_timer(&group->rtpoll_timer);
+ if (t->aggregator == PSI_AVGS) {
+ group->avg_nr_triggers[t->state]--;
+ } else {
+ group->rtpoll_nr_triggers[t->state]--;
+ if (!group->rtpoll_nr_triggers[t->state])
+ group->rtpoll_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ /* Destroy rtpoll_task when the last trigger is destroyed */
+ if (group->rtpoll_states == 0) {
+ group->rtpoll_until = 0;
+ task_to_destroy = rcu_dereference_protected(
+ group->rtpoll_task,
+ lockdep_is_held(&group->rtpoll_trigger_lock));
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+ del_timer(&group->rtpoll_timer);
+ }
}
}
@@ -1437,27 +1473,19 @@ static int psi_cpu_show(struct seq_file *m, void *v)
return psi_show(m, &psi_system, PSI_CPU);
}
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
- if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return single_open(file, psi_show, NULL);
-}
-
static int psi_io_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_io_show);
+ return single_open(file, psi_io_show, NULL);
}
static int psi_memory_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_memory_show);
+ return single_open(file, psi_memory_show, NULL);
}
static int psi_cpu_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_cpu_show);
+ return single_open(file, psi_cpu_show, NULL);
}
static ssize_t psi_write(struct file *file, const char __user *user_buf,
@@ -1491,7 +1519,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
return -EBUSY;
}
- new = psi_trigger_create(&psi_system, buf, res);
+ new = psi_trigger_create(&psi_system, buf, res, file);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
@@ -1571,7 +1599,7 @@ static int psi_irq_show(struct seq_file *m, void *v)
static int psi_irq_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_irq_show);
+ return single_open(file, psi_irq_show, NULL);
}
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,