[RFC,1/5] perf: Add ioctl to emit sideband events
Commit Message
perf tools currently read /proc to get this information, but that
races with changes made by the kernel.
Add an ioctl to output status-only sideband events for a currently
active event on the current CPU. Using timestamps, these status-only
sideband events will be correctly ordered with respect to "real"
sideband events.
The assumption is a user will:
- open and enable a dummy event to track sideband events
- call the new ioctl to get sideband information for currently
running processes as needed
- enable the remaining selected events
The initial sideband events to be supported will be: fork, namespaces, comm
and mmap.
Add a new misc flag PERF_RECORD_MISC_STATUS_ONLY to differentiate "real"
sideband events from status-only sideband events.
The limitation that the event must be active is significant. The ioctl
caller must either:
i) For a CPU context, set CPU affinity to the correct CPU.
Note, obviously that would not need to be done for system-wide
tracing on all CPUs. It would also only need to be done for the
period of tracing when the ioctl is to be used.
ii) Use an event opened for the current process on all CPUs.
Note, if such an additional event is needed, it would also use
additional memory from the user's perf_event_mlock_kb /
RLIMIT_MEMLOCK limit.
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
include/uapi/linux/perf_event.h | 19 ++++++-
kernel/events/core.c | 87 ++++++++++++++++++++++++++++++++-
2 files changed, 103 insertions(+), 3 deletions(-)
Comments
On Fri, Apr 14, 2023 at 11:22:56AM +0300, Adrian Hunter wrote:
> perf tools currently read /proc to get this information, but that
> races with changes made by the kernel.
>
> Add an ioctl to output status-only sideband events for a currently
> active event on the current CPU. Using timestamps, these status-only
> sideband events will be correctly ordered with respect to "real"
> sideband events.
>
> The assumption is a user will:
> - open and enable a dummy event to track sideband events
> - call the new ioctl to get sideband information for currently
> running processes as needed
> - enable the remaining selected events
>
> The initial sideband events to be supported will be: fork, namespaces, comm
> and mmap.
>
> Add a new misc flag PERF_RECORD_MISC_STATUS_ONLY to differentiate "real"
> sideband events from status-only sideband events.
>
> The limitation that the event must be active is significant. The ioctl
> caller must either:
> i) For a CPU context, set CPU affinity to the correct CPU.
> Note, obviously that would not need to be done for system-wide
> tracing on all CPUs. It would also only need to be done for the
> period of tracing when the ioctl is to be used.
> ii) Use an event opened for the current process on all CPUs.
> Note, if such an additional event is needed, it would also use
> additional memory from the user's perf_event_mlock_kb /
> RLIMIT_MEMLOCK limit.
Why would a single per-task event not work? I see nothing in the code
that would require a per-task-per-cpu setup. Or am I just having trouble
reading again?
On 17/04/23 13:57, Peter Zijlstra wrote:
> On Fri, Apr 14, 2023 at 11:22:56AM +0300, Adrian Hunter wrote:
>> perf tools currently read /proc to get this information, but that
>> races with changes made by the kernel.
>>
>> Add an ioctl to output status-only sideband events for a currently
>> active event on the current CPU. Using timestamps, these status-only
>> sideband events will be correctly ordered with respect to "real"
>> sideband events.
>>
>> The assumption is a user will:
>> - open and enable a dummy event to track sideband events
>> - call the new ioctl to get sideband information for currently
>> running processes as needed
>> - enable the remaining selected events
>>
>> The initial sideband events to be supported will be: fork, namespaces, comm
>> and mmap.
>>
>> Add a new misc flag PERF_RECORD_MISC_STATUS_ONLY to differentiate "real"
>> sideband events from status-only sideband events.
>>
>> The limitation that the event must be active is significant. The ioctl
>> caller must either:
>> i) For a CPU context, set CPU affinity to the correct CPU.
>> Note, obviously that would not need to be done for system-wide
>> tracing on all CPUs. It would also only need to be done for the
>> period of tracing when the ioctl is to be used.
>> ii) Use an event opened for the current process on all CPUs.
>> Note, if such an additional event is needed, it would also use
>> additional memory from the user's perf_event_mlock_kb /
>> RLIMIT_MEMLOCK limit.
>
> Why would a single per-task event not work? I see nothing in the code
> that would require a per-task-per-cpu setup. Or am I just having trouble
> reading again?
Sorry, "all CPUS" should have been "cpu=-1"
@@ -541,6 +541,18 @@ struct perf_event_query_bpf {
__u32 ids[];
};
+/*
+ * Selects which status-only sideband event types PERF_EVENT_IOC_EMIT_SIDEBAND
+ * should emit for the target pid.
+ */
+enum perf_event_emit_flag {
+ PERF_EVENT_EMIT_FORK = 1U << 0,
+ PERF_EVENT_EMIT_NAMESPACES = 1U << 1,
+ PERF_EVENT_EMIT_COMM = 1U << 2,
+ PERF_EVENT_EMIT_MMAP = 1U << 3,
+};
+
+/*
+ * Argument to PERF_EVENT_IOC_EMIT_SIDEBAND: emit the selected status-only
+ * sideband events for the given pid into the event's ring buffer.
+ */
+struct perf_event_pid_sb {
+ __u32 pid;
+ __u32 emit_flags; /* Refer perf_event_emit_flag */
+};
+
+
/*
* Ioctls that can be done on a perf event fd:
*/
@@ -556,6 +568,7 @@ struct perf_event_query_bpf {
#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32)
#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *)
#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *)
+#define PERF_EVENT_IOC_EMIT_SIDEBAND _IOW('$', 12, struct perf_event_pid_sb *)
enum perf_event_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -743,12 +756,13 @@ struct perf_event_mmap_page {
* The current state of perf_event_header::misc bits usage:
* ('|' used bit, '-' unused bit)
*
- * 012 CDEF
- * |||---------||||
+ * 012 BCDEF
+ * |||--------|||||
*
* Where:
* 0-2 CPUMODE_MASK
*
+ * B STATUS_ONLY
* C PROC_MAP_PARSE_TIMEOUT
* D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT
* E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT
@@ -763,6 +777,7 @@ struct perf_event_mmap_page {
#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0)
#define PERF_RECORD_MISC_GUEST_USER (5 << 0)
+#define PERF_RECORD_MISC_STATUS_ONLY (1 << 11)
/*
* Indicates that /proc/PID/maps parsing are truncated by time out.
*/
@@ -5797,6 +5797,7 @@ static int perf_event_set_output(struct perf_event *event,
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
+static int perf_event_emit_sideband(struct perf_event *event, void __user *arg);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
@@ -5924,6 +5925,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
if (ret)
return ret;
+ if (cmd == PERF_EVENT_IOC_EMIT_SIDEBAND)
+ return perf_event_emit_sideband(event, (void __user *)arg);
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5940,6 +5944,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
case _IOC_NR(PERF_EVENT_IOC_ID):
case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
+ case _IOC_NR(PERF_EVENT_IOC_EMIT_SIDEBAND):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
@@ -12277,7 +12282,7 @@ perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
unsigned int ptrace_mode = PTRACE_MODE_READ_REALCREDS;
bool is_capable = perfmon_capable();
- if (attr->sigtrap) {
+ if (attr && attr->sigtrap) {
/*
* perf_event_attr::sigtrap sends signals to the other task.
* Require the current task to also have CAP_KILL.
@@ -12810,6 +12815,86 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
+/* Placeholder in this RFC patch; fork emission is filled in by a later patch in the series. */
+static int perf_event_emit_fork(struct perf_event *event, struct task_struct *task)
+{
+ return -EINVAL;
+}
+
+/* Placeholder in this RFC patch; namespaces emission is filled in by a later patch in the series. */
+static int perf_event_emit_namespaces(struct perf_event *event, struct task_struct *task)
+{
+ return -EINVAL;
+}
+
+/* Placeholder in this RFC patch; comm emission is filled in by a later patch in the series. */
+static int perf_event_emit_comm(struct perf_event *event, struct task_struct *task)
+{
+ return -EINVAL;
+}
+
+/* Placeholder in this RFC patch; mmap emission is filled in by a later patch in the series. */
+static int perf_event_emit_mmap(struct perf_event *event, struct task_struct *task)
+{
+ return -EINVAL;
+}
+
+/*
+ * PERF_EVENT_IOC_EMIT_SIDEBAND handler: emit status-only sideband events
+ * (fork / namespaces / comm / mmap, per emit_flags) for the task identified
+ * by pid_sb.pid into @event's output.
+ *
+ * Returns 0 on success; -EFAULT on bad user pointer, -EINVAL on unknown
+ * flag bits, -EACCES if the caller may not observe the target task, or the
+ * error from the first emission helper that fails.
+ */
+static int perf_event_emit_sideband(struct perf_event *event, void __user *arg)
+{
+ struct perf_event_pid_sb pid_sb;
+ struct perf_event_context *ctx;
+ struct task_struct *task;
+ int err;
+
+ if (copy_from_user(&pid_sb, arg, sizeof(pid_sb)))
+ return -EFAULT;
+
+ /* Reject unknown bits so they remain available for future extension. */
+ if (pid_sb.emit_flags & ~(PERF_EVENT_EMIT_FORK |
+    PERF_EVENT_EMIT_NAMESPACES |
+    PERF_EVENT_EMIT_COMM |
+    PERF_EVENT_EMIT_MMAP))
+ return -EINVAL;
+
+ task = find_lively_task_by_vpid(pid_sb.pid);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ /* Hold exec_update_lock so the permission check stays valid across exec. */
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto out_put_task;
+
+ /* Validate access to pid (same as perf_event_open) */
+ err = -EACCES;
+ if (!perf_check_permission(NULL, task))
+ goto out_cred;
+
+ ctx = perf_event_ctx_lock(event);
+
+ /*
+  * Clear the -EACCES left over from the permission check so that a
+  * request with no emit_flags set (or one where every requested
+  * emission succeeds) returns 0 instead of a stale error.
+  */
+ err = 0;
+
+ if (pid_sb.emit_flags & PERF_EVENT_EMIT_FORK) {
+ err = perf_event_emit_fork(event, task);
+ if (err)
+ goto out_ctx;
+ }
+ if (pid_sb.emit_flags & PERF_EVENT_EMIT_NAMESPACES) {
+ err = perf_event_emit_namespaces(event, task);
+ if (err)
+ goto out_ctx;
+ }
+ if (pid_sb.emit_flags & PERF_EVENT_EMIT_COMM) {
+ err = perf_event_emit_comm(event, task);
+ if (err)
+ goto out_ctx;
+ }
+ if (pid_sb.emit_flags & PERF_EVENT_EMIT_MMAP) {
+ err = perf_event_emit_mmap(event, task);
+ if (err)
+ goto out_ctx;
+ }
+out_ctx:
+ perf_event_ctx_unlock(event, ctx);
+out_cred:
+ up_read(&task->signal->exec_update_lock);
+out_put_task:
+ put_task_struct(task);
+ return err;
+}
+
static void __perf_pmu_remove(struct perf_event_context *ctx,
int cpu, struct pmu *pmu,
struct perf_event_groups *groups,