[v2] tracing: Optimize event type allocation with IDA

Message ID 20221110020319.1259291-1-zhengyejian1@huawei.com
State New
Headers
Series [v2] tracing: Optimize event type allocation with IDA |

Commit Message

Zheng Yejian Nov. 10, 2022, 2:03 a.m. UTC
  After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after
 overflow"), trace events with dynamic type are linked up in list
'ftrace_event_list' through field 'trace_event.list'. Then when max
event type number used up, it's possible to reuse type number of some
freed one by traversing 'ftrace_event_list'.

As instead, using IDA to manage available type numbers can make codes
simpler and then the field 'trace_event.list' can be dropped.

Since 'struct trace_event' is used in static tracepoints, drop
'trace_event.list' can make vmlinux smaller. Local test with about 2000
tracepoints, vmlinux reduced about 64KB:
  before:-rwxrwxr-x 1 root root 76669448 Nov  8 17:14 vmlinux
  after: -rwxrwxr-x 1 root root 76604176 Nov  8 17:15 vmlinux

Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---
 include/linux/trace_events.h |  1 -
 kernel/trace/trace_output.c  | 66 +++++++++---------------------------
 2 files changed, 16 insertions(+), 51 deletions(-)

Changes since v1:
  - Explicitly include linux/idr.h as suggested by Masami Hiramatsu
    Link: https://lore.kernel.org/lkml/20221109222650.ce6c22e231345f6852f6956f@kernel.org/#t
  

Comments

Masami Hiramatsu (Google) Nov. 11, 2022, 2:41 p.m. UTC | #1
On Thu, 10 Nov 2022 10:03:19 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after
>  overflow"), trace events with dynamic type are linked up in list
> 'ftrace_event_list' through field 'trace_event.list'. Then when max
> event type number used up, it's possible to reuse type number of some
> freed one by traversing 'ftrace_event_list'.
> 
> As instead, using IDA to manage available type numbers can make codes
> simpler and then the field 'trace_event.list' can be dropped.
> 
> Since 'struct trace_event' is used in static tracepoints, drop
> 'trace_event.list' can make vmlinux smaller. Local test with about 2000
> tracepoints, vmlinux reduced about 64KB:
>   before:-rwxrwxr-x 1 root root 76669448 Nov  8 17:14 vmlinux
>   after: -rwxrwxr-x 1 root root 76604176 Nov  8 17:15 vmlinux
> 

This looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks

> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
> ---
>  include/linux/trace_events.h |  1 -
>  kernel/trace/trace_output.c  | 66 +++++++++---------------------------
>  2 files changed, 16 insertions(+), 51 deletions(-)
> 
> Changes since v1:
>   - Explicitly include linux/idr.h as suggested by Masami Hiramatsu
>     Link: https://lore.kernel.org/lkml/20221109222650.ce6c22e231345f6852f6956f@kernel.org/#t
> 
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 20749bd9db71..bb2053246d6a 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -136,7 +136,6 @@ struct trace_event_functions {
>  
>  struct trace_event {
>  	struct hlist_node		node;
> -	struct list_head		list;
>  	int				type;
>  	struct trace_event_functions	*funcs;
>  };
> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
> index 67f47ea27921..f0ba97121345 100644
> --- a/kernel/trace/trace_output.c
> +++ b/kernel/trace/trace_output.c
> @@ -11,6 +11,7 @@
>  #include <linux/kprobes.h>
>  #include <linux/sched/clock.h>
>  #include <linux/sched/mm.h>
> +#include <linux/idr.h>
>  
>  #include "trace_output.h"
>  
> @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem);
>  
>  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
>  
> -static int next_event_type = __TRACE_LAST_TYPE;
> -
>  enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
>  {
>  	struct trace_seq *s = &iter->seq;
> @@ -688,38 +687,23 @@ struct trace_event *ftrace_find_event(int type)
>  	return NULL;
>  }
>  
> -static LIST_HEAD(ftrace_event_list);
> +static DEFINE_IDA(trace_event_ida);
>  
> -static int trace_search_list(struct list_head **list)
> +static void free_trace_event_type(int type)
>  {
> -	struct trace_event *e = NULL, *iter;
> -	int next = __TRACE_LAST_TYPE;
> -
> -	if (list_empty(&ftrace_event_list)) {
> -		*list = &ftrace_event_list;
> -		return next;
> -	}
> +	if (type >= __TRACE_LAST_TYPE)
> +		ida_free(&trace_event_ida, type);
> +}
>  
> -	/*
> -	 * We used up all possible max events,
> -	 * lets see if somebody freed one.
> -	 */
> -	list_for_each_entry(iter, &ftrace_event_list, list) {
> -		if (iter->type != next) {
> -			e = iter;
> -			break;
> -		}
> -		next++;
> -	}
> +static int alloc_trace_event_type(void)
> +{
> +	int next;
>  
> -	/* Did we used up all 65 thousand events??? */
> -	if (next > TRACE_EVENT_TYPE_MAX)
> +	/* Skip static defined type numbers */
> +	next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
> +			       TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
> +	if (next < 0)
>  		return 0;
> -
> -	if (e)
> -		*list = &e->list;
> -	else
> -		*list = &ftrace_event_list;
>  	return next;
>  }
>  
> @@ -761,28 +745,10 @@ int register_trace_event(struct trace_event *event)
>  	if (WARN_ON(!event->funcs))
>  		goto out;
>  
> -	INIT_LIST_HEAD(&event->list);
> -
>  	if (!event->type) {
> -		struct list_head *list = NULL;
> -
> -		if (next_event_type > TRACE_EVENT_TYPE_MAX) {
> -
> -			event->type = trace_search_list(&list);
> -			if (!event->type)
> -				goto out;
> -
> -		} else {
> -
> -			event->type = next_event_type++;
> -			list = &ftrace_event_list;
> -		}
> -
> -		if (WARN_ON(ftrace_find_event(event->type)))
> +		event->type = alloc_trace_event_type();
> +		if (!event->type)
>  			goto out;
> -
> -		list_add_tail(&event->list, list);
> -
>  	} else if (WARN(event->type > __TRACE_LAST_TYPE,
>  			"Need to add type to trace.h")) {
>  		goto out;
> @@ -819,7 +785,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
>  int __unregister_trace_event(struct trace_event *event)
>  {
>  	hlist_del(&event->node);
> -	list_del(&event->list);
> +	free_trace_event_type(event->type);
>  	return 0;
>  }
>  
> -- 
> 2.25.1
>
  
Zheng Yejian Nov. 23, 2022, 3:18 a.m. UTC | #2
On Fri, 11 Nov 2022 23:41:37 +0900,
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> On Thu, 10 Nov 2022 10:03:19 +0800
> Zheng Yejian <zhengyejian1@huawei.com> wrote:
>
> > After commit 060fa5c83e67 ("tracing/events: reuse trace event ids after
> >  overflow"), trace events with dynamic type are linked up in list
> > 'ftrace_event_list' through field 'trace_event.list'. Then when max
> > event type number used up, it's possible to reuse type number of some
> > freed one by traversing 'ftrace_event_list'.
> >
> > As instead, using IDA to manage available type numbers can make codes
> > simpler and then the field 'trace_event.list' can be dropped.
> >
> > Since 'struct trace_event' is used in static tracepoints, drop
> > 'trace_event.list' can make vmlinux smaller. Local test with about 2000
> > tracepoints, vmlinux reduced about 64KB:
> >   before:-rwxrwxr-x 1 root root 76669448 Nov  8 17:14 vmlinux
> >   after: -rwxrwxr-x 1 root root 76604176 Nov  8 17:15 vmlinux
> >
>
> This looks good to me.
>
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks for your kind ack, Masami!

But Yujie Liu <yujie.liu@intel.com> reported a problem that highly
reproducible after applying this patch:
Link: https://lore.kernel.org/lkml/54f23c9c-97ae-e326-5873-bfa5d2c81f52@intel.com/

So please DO NOT apply this patch before I find what happened about it.

--
Best regards,
Zheng Yejian

>
> Thanks
>
> > Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
> > ---
> >  include/linux/trace_events.h |  1 -
> >  kernel/trace/trace_output.c  | 66 +++++++++---------------------------
> >  2 files changed, 16 insertions(+), 51 deletions(-)
> >
> > Changes since v1:
> >   - Explicitly include linux/idr.h as suggested by Masami Hiramatsu
> >     Link: https://lore.kernel.org/lkml/20221109222650.ce6c22e231345f6852f6956f@kernel.org/#t
> >
> > diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> > index 20749bd9db71..bb2053246d6a 100644
> > --- a/include/linux/trace_events.h
> > +++ b/include/linux/trace_events.h
> > @@ -136,7 +136,6 @@ struct trace_event_functions {
> >
> >  struct trace_event {
> >	struct hlist_node		node;
> > -	struct list_head		list;
> >	int				type;
> >	struct trace_event_functions	*funcs;
> >  };
> > diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
> > index 67f47ea27921..f0ba97121345 100644
> > --- a/kernel/trace/trace_output.c
> > +++ b/kernel/trace/trace_output.c
> > @@ -11,6 +11,7 @@
> >  #include <linux/kprobes.h>
> >  #include <linux/sched/clock.h>
> >  #include <linux/sched/mm.h>
> > +#include <linux/idr.h>
> >
> >  #include "trace_output.h"
> >
> > @@ -21,8 +22,6 @@ DECLARE_RWSEM(trace_event_sem);
> >
> >  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
> >
> > -static int next_event_type = __TRACE_LAST_TYPE;
> > -
> >  enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
> >  {
> >	struct trace_seq *s = &iter->seq;
> > @@ -688,38 +687,23 @@ struct trace_event *ftrace_find_event(int type)
> >	return NULL;
> >  }
> >
> > -static LIST_HEAD(ftrace_event_list);
> > +static DEFINE_IDA(trace_event_ida);
> >
> > -static int trace_search_list(struct list_head **list)
> > +static void free_trace_event_type(int type)
> >  {
> > -	struct trace_event *e = NULL, *iter;
> > -	int next = __TRACE_LAST_TYPE;
> > -
> > -	if (list_empty(&ftrace_event_list)) {
> > -		*list = &ftrace_event_list;
> > -		return next;
> > -	}
> > +	if (type >= __TRACE_LAST_TYPE)
> > +		ida_free(&trace_event_ida, type);
> > +}
> >
> > -	/*
> > -	 * We used up all possible max events,
> > -	 * lets see if somebody freed one.
> > -	 */
> > -	list_for_each_entry(iter, &ftrace_event_list, list) {
> > -		if (iter->type != next) {
> > -			e = iter;
> > -			break;
> > -		}
> > -		next++;
> > -	}
> > +static int alloc_trace_event_type(void)
> > +{
> > +	int next;
> >
> > -	/* Did we used up all 65 thousand events??? */
> > -	if (next > TRACE_EVENT_TYPE_MAX)
> > +	/* Skip static defined type numbers */
> > +	next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
> > +			       TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
> > +	if (next < 0)
> >		return 0;
> > -
> > -	if (e)
> > -		*list = &e->list;
> > -	else
> > -		*list = &ftrace_event_list;
> >	return next;
> >  }
> >
> > @@ -761,28 +745,10 @@ int register_trace_event(struct trace_event *event)
> >	if (WARN_ON(!event->funcs))
> >		goto out;
> >
> > -	INIT_LIST_HEAD(&event->list);
> > -
> >	if (!event->type) {
> > -		struct list_head *list = NULL;
> > -
> > -		if (next_event_type > TRACE_EVENT_TYPE_MAX) {
> > -
> > -			event->type = trace_search_list(&list);
> > -			if (!event->type)
> > -				goto out;
> > -
> > -		} else {
> > -
> > -			event->type = next_event_type++;
> > -			list = &ftrace_event_list;
> > -		}
> > -
> > -		if (WARN_ON(ftrace_find_event(event->type)))
> > +		event->type = alloc_trace_event_type();
> > +		if (!event->type)
> >			goto out;
> > -
> > -		list_add_tail(&event->list, list);
> > -
> >	} else if (WARN(event->type > __TRACE_LAST_TYPE,
> >			"Need to add type to trace.h")) {
> >		goto out;
> > @@ -819,7 +785,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
> >  int __unregister_trace_event(struct trace_event *event)
> >  {
> >	hlist_del(&event->node);
> > -	list_del(&event->list);
> > +	free_trace_event_type(event->type);
> >	return 0;
> >  }
> >
> > --
> > 2.25.1
> >
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
  
Steven Rostedt Nov. 23, 2022, 3:32 a.m. UTC | #3
On Wed, 23 Nov 2022 11:18:06 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> But Yujie Liu <yujie.liu@intel.com> reported a problem that highly
> reproducible after applying this patch:
> Link: https://lore.kernel.org/lkml/54f23c9c-97ae-e326-5873-bfa5d2c81f52@intel.com/
> 
> So please DO NOT apply this patch before I find what happened about it.

I know what the issue is.

The current way of assigning types is to always increment. And not to
reuse until it fills up. And even then, it looks for the next available
number.

I'm guessing the IDA will reuse a number as soon as it is freed. This
may also have uncovered a bug, as in reality, we must actually clear
the tracing buffers every time a number is reused.

What happens is that the type number is associated to a print format.
That is, the raw data is tagged with the type. This type maps to how to
parse the raw data. If you have a kprobe, it creates a new type number.
If you free it, and create another one. With the IDA, it is likely to
reassign the previously freed number to a new probe.

To explain this better, let's look at the following scenario:

 echo 'p:foo val=$arg1:u64' > kprobe_events
 echo 1 > events/kprobes/foo/enable
 sleep 1
 echo 0 > events/kprobes/foo/enable

 echo 'p:bar val=+0($arg1):string' > kprobe_events

 # foo kprobe is deleted and bar is created and
 # with IDA, bar has the same number for type as foo

 cat trace

When you read the trace, it will see a binary blob representing an
event and marked with a type. Although the event was foo, it will now
map it to bar. And it will read foo's $arg1:u64 as bar's
+0($arg1):string, and will crash.

-- Steve
  
Steven Rostedt Nov. 23, 2022, 4:17 a.m. UTC | #4
On Tue, 22 Nov 2022 22:32:58 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> To explain this better, let's look at the following scenario:
> 
>  echo 'p:foo val=$arg1:u64' > kprobe_events
>  echo 1 > events/kprobes/foo/enable
>  sleep 1
>  echo 0 > events/kprobes/foo/enable
> 
>  echo 'p:bar val=+0($arg1):string' > kprobe_events
> 
>  # foo kprobe is deleted and bar is created and
>  # with IDA, bar has the same number for type as foo
> 
>  cat trace
> 
> When you read the trace, it will see a binary blob representing an
> event and marked with a type. Although the event was foo, it will now
> map it to bar. And it will read foo's $arg1:u64 as bar's
> +0($arg1):string, and will crash.

I just tested my theory, and it proved it:

 # cd /sys/kernel/tracing
 # echo 'p:kprobes/foo do_sys_openat2 $arg1:u32' > kprobe_events
 # echo 1 > events/kprobes/foo/enable
 # cat /etc/passwd
 # echo 0 > events/kprobes/foo/enable
 # echo 'p:kprobes/foo do_sys_openat2 +0($arg2):string' > kprobe_events
 # cat trace
# tracer: nop
#
# entries-in-buffer/entries-written: 14/14   #P:8
#
#                                _-----=> irqs-off/BH-disabled
#                               / _----=> need-resched
#                              | / _---=> hardirq/softirq
#                              || / _--=> preempt-depth
#                              ||| / _-=> migrate-disable
#                              |||| /     delay
#           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
#              | |         |   |||||     |         |
        sendmail-1942    [002] .....   530.136320: foo: (do_sys_openat2+0x0/0x240) arg1=             cat-2046    [004] .....   530.930817: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
             cat-2046    [004] .....   530.930961: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
             cat-2046    [004] .....   530.934278: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
             cat-2046    [004] .....   530.934563: foo: (do_sys_openat2+0x0/0x240) arg1="������������������������������������������������������������������������������������������������"
            bash-1515    [007] .....   534.299093: foo: (do_sys_openat2+0x0/0x240) arg1="kkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk���������@��4Z����;Y�����U

And dmesg has:

[  558.504698] ==================================================================
[  558.511925] BUG: KASAN: use-after-free in string+0xd4/0x1c0
[  558.517501] Read of size 1 at addr ffff88805fdbbfa0 by task cat/2049

[  558.525348] CPU: 0 PID: 2049 Comm: cat Not tainted 6.1.0-rc6-test+ #641
[  558.531959] Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
[  558.540909] Call Trace:
[  558.543360]  <TASK>
[  558.545465]  dump_stack_lvl+0x5b/0x77
[  558.549132]  print_report+0x17f/0x47b
[  558.552805]  ? __virt_addr_valid+0xd9/0x160
[  558.556998]  ? string+0xd4/0x1c0
[  558.560229]  ? string+0xd4/0x1c0
[  558.563462]  kasan_report+0xad/0x130
[  558.567042]  ? string+0xd4/0x1c0
[  558.570274]  string+0xd4/0x1c0
[  558.573332]  ? ip6_addr_string_sa+0x3e0/0x3e0
[  558.577690]  ? format_decode+0xa2/0x690
[  558.581531]  ? simple_strtoul+0x10/0x10
[  558.585378]  vsnprintf+0x500/0x840
[  558.588785]  ? pointer+0x740/0x740
[  558.592190]  ? pointer+0x740/0x740
[  558.595594]  seq_buf_vprintf+0x62/0xc0
[  558.599346]  trace_seq_printf+0x10e/0x1e0
[  558.603359]  ? trace_seq_bitmask+0x130/0x130
[  558.607632]  ? memcpy+0x38/0x60
[  558.610774]  ? seq_buf_putmem+0x6e/0xa0
[  558.614616]  print_type_string+0x90/0xa0
[  558.618539]  ? print_type_symbol+0x80/0x80
[  558.622640]  print_kprobe_event+0x16b/0x290
[  558.626830]  print_trace_line+0x451/0x8e0
[  558.630847]  ? tracing_buffers_read+0x3f0/0x3f0
[  558.635380]  ? preempt_count_sub+0xb7/0x100
[  558.639566]  ? _raw_spin_unlock_irqrestore+0x28/0x50
[  558.644532]  ? trace_find_next_entry_inc+0xa7/0xe0
[  558.649325]  s_show+0x72/0x1f0
[  558.652386]  seq_read_iter+0x58e/0x750
[  558.656147]  seq_read+0x115/0x160
[  558.659475]  ? seq_read_iter+0x750/0x750
[  558.663401]  ? __mod_lruvec_page_state+0x123/0x280
[  558.668199]  ? tracer_preempt_on+0x74/0x1d0
[  558.672386]  ? preempt_count_sub+0xb7/0x100
[  558.676573]  ? fsnotify_perm.part.0+0xa0/0x250
[  558.681025]  vfs_read+0x11d/0x460
[  558.684344]  ? kernel_read+0xc0/0xc0
[  558.687922]  ? __fget_light+0x1b0/0x200
[  558.691763]  ksys_read+0xa9/0x130
[  558.695082]  ? __ia32_sys_pwrite64+0x120/0x120
[  558.699528]  ? trace_hardirqs_on+0x2c/0x110
[  558.703715]  do_syscall_64+0x3a/0x90
[  558.707304]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[  558.712356] RIP: 0033:0x7fc2e972ade2
[  558.715934] Code: c0 e9 b2 fe ff ff 50 48 8d 3d b2 3f 0a 00 e8 05 f0 01 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24
[  558.734684] RSP: 002b:00007ffc64e687c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  558.742257] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fc2e972ade2
[  558.749388] RDX: 0000000000020000 RSI: 00007fc2e980d000 RDI: 0000000000000003
[  558.756520] RBP: 00007fc2e980d000 R08: 00007fc2e980c010 R09: 0000000000000000
[  558.763652] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000020f00
[  558.770785] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
[  558.777920]  </TASK>

[  558.781609] The buggy address belongs to the physical page:
[  558.787182] page:ffffea00017f6ec0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fdbb
[  558.796483] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff)
[  558.802925] raw: 000fffffc0000000 0000000000000000 ffffea00017f6ec8 0000000000000000
[  558.810663] raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
[  558.818400] page dumped because: kasan: bad access detected

[  558.825465] Memory state around the buggy address:
[  558.830258]  ffff88805fdbbe80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[  558.837479]  ffff88805fdbbf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[  558.844699] >ffff88805fdbbf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[  558.851917]                                ^
[  558.856190]  ffff88805fdbc000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[  558.863409]  ffff88805fdbc080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
[  558.870628] ==================================================================

Look familiar?

-- Steve
  
Zheng Yejian Nov. 23, 2022, 8:01 a.m. UTC | #5
On Tue, 22 Nov 2022 22:32:58 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 23 Nov 2022 11:18:06 +0800
> Zheng Yejian <zhengyejian1@huawei.com> wrote:
>
> > But Yujie Liu <yujie.liu@intel.com> reported a problem that highly
> > reproducible after applying this patch:
> > Link: https://lore.kernel.org/lkml/54f23c9c-97ae-e326-5873-bfa5d2c81f52@intel.com/
> >
> > So please DO NOT apply this patch before I find what happened about it.
>
> I know what the issue is.
>
> The current way of assigning types is to always increment. And not to
> reuse until it fills up. And even then, it looks for the next available
> number.
>
> I'm guessing the IDA will reuse a number as soon as it is freed. This

Yes, it is.

> may also have uncovered a bug, as in reality, we must actually clear
> the tracing buffers every time a number is reused.

Thanks for your explanation!

It seems almost the case, and with current way of assigning types, this
problem maybe also happend when reuse type id, am I right?

But instead of clear tracing buffers, would it be better to just mark
that record invalid if we had a way of knowing that the format had changed?

>
> What happens is that the type number is associated to a print format.
> That is, the raw data is tagged with the type. This type maps to how to
> parse the raw data. If you have a kprobe, it creates a new type number.
> If you free it, and create another one. With the IDA, it is likely to
> reassign the previously freed number to a new probe.
>
> To explain this better, let's look at the following scenario:
>
>  echo 'p:foo val=$arg1:u64' > kprobe_events
>  echo 1 > events/kprobes/foo/enable
>  sleep 1
>  echo 0 > events/kprobes/foo/enable
>
>  echo 'p:bar val=+0($arg1):string' > kprobe_events
>
>  # foo kprobe is deleted and bar is created and
>  # with IDA, bar has the same number for type as foo
>
>  cat trace
>
> When you read the trace, it will see a binary blob representing an
> event and marked with a type. Although the event was foo, it will now
> map it to bar. And it will read foo's $arg1:u64 as bar's
> +0($arg1):string, and will crash.
>
> -- Steve
  
Steven Rostedt Nov. 23, 2022, 1:37 p.m. UTC | #6
On Wed, 23 Nov 2022 16:01:33 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> But instead of clear tracing buffers, would it be better to just mark
> that record invalid if we had a way of knowing that the format had changed?

We'd have to scan all ring buffers for the records. The only way to know
what a binary blob in the ring buffer represents is this type.

But, I do the same for modules. And by hooking into that infrastructure, I
can do the above commands again with the IDA patch included, and it works
fine. If any dynamic event was enabled and then removed, it will reset the
corresponding buffer where it was enabled in. This is needed regardless of
your patch because once we get over 65536 types, the behavior of the type
assignment is the same as the IDA logic. The IDA code didn't trigger a bug,
it revealed an existing one.

Care to test with the below patch added. I'll add this first (breaking it
up a little) and then apply your patch.

-- Steve

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6a47fed2f473..93a75a97118f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2180,10 +2180,12 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
 }
 
 /* Must have trace_types_lock held */
-void tracing_reset_all_online_cpus(void)
+void tracing_reset_all_online_cpus_unlocked(void)
 {
 	struct trace_array *tr;
 
+	lockdep_assert_held(&trace_types_lock);
+
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (!tr->clear_trace)
 			continue;
@@ -2195,6 +2197,13 @@ void tracing_reset_all_online_cpus(void)
 	}
 }
 
+void tracing_reset_all_online_cpus(void)
+{
+	mutex_lock(&trace_types_lock);
+	tracing_reset_all_online_cpus_unlocked();
+	mutex_unlock(&trace_types_lock);
+}
+
 /*
  * The tgid_map array maps from pid to tgid; i.e. the value stored at index i
  * is the tgid last observed corresponding to pid=i.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d9b470a0adf2..48643f07bc01 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -580,6 +580,7 @@ int tracing_is_enabled(void);
 void tracing_reset_online_cpus(struct array_buffer *buf);
 void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
+void tracing_reset_all_online_cpus_unlocked(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 int tracing_open_generic_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 154996684fb5..4376887e0d8a 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -118,6 +118,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
 		if (ret)
 			break;
 	}
+	tracing_reset_all_online_cpus();
 	mutex_unlock(&event_mutex);
 out:
 	argv_free(argv);
@@ -214,6 +215,7 @@ int dyn_events_release_all(struct dyn_event_operations *type)
 			break;
 	}
 out:
+	tracing_reset_all_online_cpus();
 	mutex_unlock(&event_mutex);
 
 	return ret;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7372fffb8109..3bfaf560ecc4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2947,7 +2947,10 @@ static int probe_remove_event_call(struct trace_event_call *call)
 		 * TRACE_REG_UNREGISTER.
 		 */
 		if (file->flags & EVENT_FILE_FL_ENABLED)
-			return -EBUSY;
+			goto busy;
+
+		if (file->flags & EVENT_FILE_FL_WAS_ENABLED)
+			tr->clear_trace = true;
 		/*
 		 * The do_for_each_event_file_safe() is
 		 * a double loop. After finding the call for this
@@ -2960,6 +2963,12 @@ static int probe_remove_event_call(struct trace_event_call *call)
 	__trace_remove_event_call(call);
 
 	return 0;
+ busy:
+	/* No need to clear the trace now */
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		tr->clear_trace = false;
+	}
+	return -EBUSY;
 }
 
 /* Remove an event_call */
@@ -3039,7 +3048,7 @@ static void trace_module_remove_events(struct module *mod)
 	 * over from this module may be passed to the new module events and
 	 * unexpected results may occur.
 	 */
-	tracing_reset_all_online_cpus();
+	tracing_reset_all_online_cpus_unlocked();
 }
 
 static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 29fbfb27c2b2..c3b582d19b62 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1425,7 +1425,6 @@ int synth_event_delete(const char *event_name)
 	mutex_unlock(&event_mutex);
 
 	if (mod) {
-		mutex_lock(&trace_types_lock);
 		/*
 		 * It is safest to reset the ring buffer if the module
 		 * being unloaded registered any events that were
@@ -1437,7 +1436,6 @@ int synth_event_delete(const char *event_name)
 		 * occur.
 		 */
 		tracing_reset_all_online_cpus();
-		mutex_unlock(&trace_types_lock);
 	}
 
 	return ret;
  

Patch

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 20749bd9db71..bb2053246d6a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -136,7 +136,6 @@  struct trace_event_functions {
 
 struct trace_event {
 	struct hlist_node		node;
-	struct list_head		list;
 	int				type;
 	struct trace_event_functions	*funcs;
 };
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 67f47ea27921..f0ba97121345 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -11,6 +11,7 @@ 
 #include <linux/kprobes.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/mm.h>
+#include <linux/idr.h>
 
 #include "trace_output.h"
 
@@ -21,8 +22,6 @@  DECLARE_RWSEM(trace_event_sem);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
-static int next_event_type = __TRACE_LAST_TYPE;
-
 enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -688,38 +687,23 @@  struct trace_event *ftrace_find_event(int type)
 	return NULL;
 }
 
-static LIST_HEAD(ftrace_event_list);
+static DEFINE_IDA(trace_event_ida);
 
-static int trace_search_list(struct list_head **list)
+static void free_trace_event_type(int type)
 {
-	struct trace_event *e = NULL, *iter;
-	int next = __TRACE_LAST_TYPE;
-
-	if (list_empty(&ftrace_event_list)) {
-		*list = &ftrace_event_list;
-		return next;
-	}
+	if (type >= __TRACE_LAST_TYPE)
+		ida_free(&trace_event_ida, type);
+}
 
-	/*
-	 * We used up all possible max events,
-	 * lets see if somebody freed one.
-	 */
-	list_for_each_entry(iter, &ftrace_event_list, list) {
-		if (iter->type != next) {
-			e = iter;
-			break;
-		}
-		next++;
-	}
+static int alloc_trace_event_type(void)
+{
+	int next;
 
-	/* Did we used up all 65 thousand events??? */
-	if (next > TRACE_EVENT_TYPE_MAX)
+	/* Skip static defined type numbers */
+	next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE,
+			       TRACE_EVENT_TYPE_MAX, GFP_KERNEL);
+	if (next < 0)
 		return 0;
-
-	if (e)
-		*list = &e->list;
-	else
-		*list = &ftrace_event_list;
 	return next;
 }
 
@@ -761,28 +745,10 @@  int register_trace_event(struct trace_event *event)
 	if (WARN_ON(!event->funcs))
 		goto out;
 
-	INIT_LIST_HEAD(&event->list);
-
 	if (!event->type) {
-		struct list_head *list = NULL;
-
-		if (next_event_type > TRACE_EVENT_TYPE_MAX) {
-
-			event->type = trace_search_list(&list);
-			if (!event->type)
-				goto out;
-
-		} else {
-
-			event->type = next_event_type++;
-			list = &ftrace_event_list;
-		}
-
-		if (WARN_ON(ftrace_find_event(event->type)))
+		event->type = alloc_trace_event_type();
+		if (!event->type)
 			goto out;
-
-		list_add_tail(&event->list, list);
-
 	} else if (WARN(event->type > __TRACE_LAST_TYPE,
 			"Need to add type to trace.h")) {
 		goto out;
@@ -819,7 +785,7 @@  EXPORT_SYMBOL_GPL(register_trace_event);
 int __unregister_trace_event(struct trace_event *event)
 {
 	hlist_del(&event->node);
-	list_del(&event->list);
+	free_trace_event_type(event->type);
 	return 0;
 }