[RFC] tracing: Introduce pipe_cpumask to avoid race on trace_pipes

Message ID 20230817115057.1637676-1-zhengyejian1@huawei.com
State New
Headers
Series [RFC] tracing: Introduce pipe_cpumask to avoid race on trace_pipes |

Commit Message

Zheng Yejian Aug. 17, 2023, 11:50 a.m. UTC
  There is race issue when concurrently splice_read main trace_pipe and
per_cpu trace_pipes which will result in data read out being different
from what actually writen.

As suggested by Steven:
  > I believe we should add a ref count to trace_pipe and the per_cpu
  > trace_pipes, where if they are opened, nothing else can read it.
  >
  > Opening trace_pipe locks all per_cpu ref counts, if any of them are
  > open, then the trace_pipe open will fail (and releases any ref counts
  > it had taken).
  >
  > Opening a per_cpu trace_pipe will up the ref count for just that
  > CPU buffer. This will allow multiple tasks to read different per_cpu
  > trace_pipe files, but will prevent the main trace_pipe file from
  > being opened.

But because we only need to know whether per_cpu trace_pipe is open or
not, using a cpumask instead of using ref count may be easier.

After this patch, users will find that:
 - Main trace_pipe can be opened by only one user, and if it is
   opened, all per_cpu trace_pipes cannot be opened;
 - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
   trace_pipe can only be opened by one user. And if one of them is
   opened, main trace_pipe cannot be opened.

Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
---

> Does that work for this?

Hi, Steven, how about using a cpumask instead of ref count?
This patch will also prevent main trace_pipe or anyone per_cpu trace_pipe
from being opened by multiple people at the same time.

 kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
 kernel/trace/trace.h |  2 ++
 2 files changed, 51 insertions(+), 7 deletions(-)
  

Comments

Steven Rostedt Aug. 17, 2023, 2:13 p.m. UTC | #1
On Thu, 17 Aug 2023 19:50:57 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> There is race issue when concurrently splice_read main trace_pipe and
> per_cpu trace_pipes which will result in data read out being different
> from what actually writen.
> 
> As suggested by Steven:
>   > I believe we should add a ref count to trace_pipe and the per_cpu
>   > trace_pipes, where if they are opened, nothing else can read it.
>   >
>   > Opening trace_pipe locks all per_cpu ref counts, if any of them are
>   > open, then the trace_pipe open will fail (and releases any ref counts
>   > it had taken).
>   >
>   > Opening a per_cpu trace_pipe will up the ref count for just that
>   > CPU buffer. This will allow multiple tasks to read different per_cpu
>   > trace_pipe files, but will prevent the main trace_pipe file from
>   > being opened.  
> 
> But because we only need to know whether per_cpu trace_pipe is open or
> not, using a cpumask instead of using ref count may be easier.
> 
> After this patch, users will find that:
>  - Main trace_pipe can be opened by only one user, and if it is
>    opened, all per_cpu trace_pipes cannot be opened;
>  - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
>    trace_pipe can only be opened by one user. And if one of them is
>    opened, main trace_pipe cannot be opened.
> 
> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
> ---
> 
> > Does that work for this?  
> 
> Hi, Steven, how about using a cpumask instead of ref count?
> This patch will also prevent main trace_pipe or anyone per_cpu trace_pipe
> from being opened by multiple people at the same time.

I'm fine with the CPU mask.

> 
>  kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
>  kernel/trace/trace.h |  2 ++
>  2 files changed, 51 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index b8870078ef58..73f6f4d10d43 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
>  
>  #endif
>  
> +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> +	if (cpu == RING_BUFFER_ALL_CPUS) {
> +		if (cpumask_empty(tr->pipe_cpumask)) {
> +			cpumask_setall(tr->pipe_cpumask);
> +			return 0;
> +		}
> +	} else {
> +		if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
> +			cpumask_set_cpu(cpu, tr->pipe_cpumask);
> +			return 0;
> +		}
> +	}
> +	return -EBUSY;

The above would look nicer if you had that be } else if () {

	if (cpu == RING_BUFFER_ALL_CPUS) {
		if (cpumask_empty(tr->pipe_cpumask)) {
			cpumask_setall(tr->pipe_cpumask);
			return 0;
		}
	} else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
		cpumask_set_cpu(cpu, tr->pipe_cpumask);
		return 0;
	}
	return -EBUSY;


> +}
> +
> +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
> +{
> +	if (cpu == RING_BUFFER_ALL_CPUS) {
> +		WARN_ON(!cpumask_full(tr->pipe_cpumask));
> +		cpumask_clear(tr->pipe_cpumask);
> +	} else {
> +		WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
> +		cpumask_clear_cpu(cpu, tr->pipe_cpumask);
> +	}
> +}
> +
>  static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  {
>  	struct trace_array *tr = inode->i_private;
>  	struct trace_iterator *iter;
>  	int ret;
> +	int cpu = tracing_get_cpu(inode);

tracing_get_cpu() must be called after tracing_check_open_get_tr(),
otherwise it may not be reliable.

Also, keep "int ret;" as the last declaration.

-- Steve

>  
>  	ret = tracing_check_open_get_tr(tr);
>  	if (ret)
>  		return ret;
>  
>  	mutex_lock(&trace_types_lock);
> +	ret = open_pipe_on_cpu(tr, cpu);
> +	if (ret)
> +		goto fail_pipe_on_cpu;
>  
>  	/* create a buffer to store the information to pass to userspace */
>  	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
>  	if (!iter) {
>  		ret = -ENOMEM;
> -		__trace_array_put(tr);
> -		goto out;
> +		goto fail_alloc_iter;
>  	}
>  
>  	trace_seq_init(&iter->seq);
> @@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  
>  	iter->tr = tr;
>  	iter->array_buffer = &tr->array_buffer;
> -	iter->cpu_file = tracing_get_cpu(inode);
> +	iter->cpu_file = cpu;
>  	mutex_init(&iter->mutex);
>  	filp->private_data = iter;
>  
> @@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>  	nonseekable_open(inode, filp);
>  
>  	tr->trace_ref++;
> -out:
> +
>  	mutex_unlock(&trace_types_lock);
>  	return ret;
>  
>  fail:
>  	kfree(iter);
> +fail_alloc_iter:
> +	close_pipe_on_cpu(tr, cpu);
> +fail_pipe_on_cpu:
>  	__trace_array_put(tr);
>  	mutex_unlock(&trace_types_lock);
>  	return ret;
> @@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>  
>  	if (iter->trace->pipe_close)
>  		iter->trace->pipe_close(iter);
> -
> +	close_pipe_on_cpu(tr, iter->cpu_file);
>  	mutex_unlock(&trace_types_lock);
>  
>  	free_cpumask_var(iter->started);
> @@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
>  	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
>  		goto out_free_tr;
>  
> +	if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
> +		goto out_free_tr;
> +
>  	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
>  
>  	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
> @@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
>   out_free_tr:
>  	ftrace_free_ftrace_ops(tr);
>  	free_trace_buffers(tr);
> +	free_cpumask_var(tr->pipe_cpumask);
>  	free_cpumask_var(tr->tracing_cpumask);
>  	kfree(tr->name);
>  	kfree(tr);
> @@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
>  	}
>  	kfree(tr->topts);
>  
> +	free_cpumask_var(tr->pipe_cpumask);
>  	free_cpumask_var(tr->tracing_cpumask);
>  	kfree(tr->name);
>  	kfree(tr);
> @@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
>  	if (trace_create_savedcmd() < 0)
>  		goto out_free_temp_buffer;
>  
> +	if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
> +		goto out_free_savedcmd;
> +
>  	/* TODO: make the number of buffers hot pluggable with CPUS */
>  	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
>  		MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
> -		goto out_free_savedcmd;
> +		goto out_free_pipe_cpumask;
>  	}
> -
>  	if (global_trace.buffer_disabled)
>  		tracing_off();
>  
> @@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)
>  
>  	return 0;
>  
> +out_free_pipe_cpumask:
> +	free_cpumask_var(global_trace.pipe_cpumask);
>  out_free_savedcmd:
>  	free_saved_cmdlines_buffer(savedcmd);
>  out_free_temp_buffer:
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index e1edc2197fc8..53ac0f7780c2 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -377,6 +377,8 @@ struct trace_array {
>  	struct list_head	events;
>  	struct trace_event_file *trace_marker_file;
>  	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
> +	/* one per_cpu trace_pipe can be opened by only one user */
> +	cpumask_var_t		pipe_cpumask;
>  	int			ref;
>  	int			trace_ref;
>  #ifdef CONFIG_FUNCTION_TRACER
  
Zheng Yejian Aug. 18, 2023, 1:38 a.m. UTC | #2
On 2023/8/17 22:13, Steven Rostedt wrote:
> On Thu, 17 Aug 2023 19:50:57 +0800
> Zheng Yejian <zhengyejian1@huawei.com> wrote:
> 
>> There is race issue when concurrently splice_read main trace_pipe and
>> per_cpu trace_pipes which will result in data read out being different
>> from what actually writen.
>>
>> As suggested by Steven:
>>    > I believe we should add a ref count to trace_pipe and the per_cpu
>>    > trace_pipes, where if they are opened, nothing else can read it.
>>    >
>>    > Opening trace_pipe locks all per_cpu ref counts, if any of them are
>>    > open, then the trace_pipe open will fail (and releases any ref counts
>>    > it had taken).
>>    >
>>    > Opening a per_cpu trace_pipe will up the ref count for just that
>>    > CPU buffer. This will allow multiple tasks to read different per_cpu
>>    > trace_pipe files, but will prevent the main trace_pipe file from
>>    > being opened.
>>
>> But because we only need to know whether per_cpu trace_pipe is open or
>> not, using a cpumask instead of using ref count may be easier.
>>
>> After this patch, users will find that:
>>   - Main trace_pipe can be opened by only one user, and if it is
>>     opened, all per_cpu trace_pipes cannot be opened;
>>   - Per_cpu trace_pipes can be opened by multiple users, but each per_cpu
>>     trace_pipe can only be opened by one user. And if one of them is
>>     opened, main trace_pipe cannot be opened.
>>
>> Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
>> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
>> ---
>>
>>> Does that work for this?
>>
>> Hi, Steven, how about using a cpumask instead of ref count?
>> This patch will also prevent main trace_pipe or anyone per_cpu trace_pipe
>> from being opened by multiple people at the same time.
> 
> I'm fine with the CPU mask.

Hi, Steve, if a task open a trace_pipe file, then concurrently read it
with multiple threads, then the read race problem may also happen, this
patch will not prevent this case.

Do we need to consider this case? Or just tell user not to do like this
through some documents?

> 
>>
>>   kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++++++++++++------
>>   kernel/trace/trace.h |  2 ++
>>   2 files changed, 51 insertions(+), 7 deletions(-)
>>
>> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
>> index b8870078ef58..73f6f4d10d43 100644
>> --- a/kernel/trace/trace.c
>> +++ b/kernel/trace/trace.c
>> @@ -6705,24 +6705,54 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
>>   
>>   #endif
>>   
>> +static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
>> +{
>> +	if (cpu == RING_BUFFER_ALL_CPUS) {
>> +		if (cpumask_empty(tr->pipe_cpumask)) {
>> +			cpumask_setall(tr->pipe_cpumask);
>> +			return 0;
>> +		}
>> +	} else {
>> +		if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
>> +			cpumask_set_cpu(cpu, tr->pipe_cpumask);
>> +			return 0;
>> +		}
>> +	}
>> +	return -EBUSY;
> 
> The above would look nicer if you had that be } else if () {
> 
> 	if (cpu == RING_BUFFER_ALL_CPUS) {
> 		if (cpumask_empty(tr->pipe_cpumask)) {
> 			cpumask_setall(tr->pipe_cpumask);
> 			return 0;
> 		}
> 	} else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
> 		cpumask_set_cpu(cpu, tr->pipe_cpumask);
> 		return 0;
> 	}
> 	return -EBUSY;
> 
> 
>> +}
>> +
>> +static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
>> +{
>> +	if (cpu == RING_BUFFER_ALL_CPUS) {
>> +		WARN_ON(!cpumask_full(tr->pipe_cpumask));
>> +		cpumask_clear(tr->pipe_cpumask);
>> +	} else {
>> +		WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
>> +		cpumask_clear_cpu(cpu, tr->pipe_cpumask);
>> +	}
>> +}
>> +
>>   static int tracing_open_pipe(struct inode *inode, struct file *filp)
>>   {
>>   	struct trace_array *tr = inode->i_private;
>>   	struct trace_iterator *iter;
>>   	int ret;
>> +	int cpu = tracing_get_cpu(inode);
> 
> tracing_get_cpu() must be called after tracing_check_open_get_tr(),
> otherwise it may not be reliable.
> 
> Also, keep "int ret;" as the last declaration.
> 

Thanks, I'll fix them in v3.

-- Zheng Yejian

> -- Steve
> 
>>   
>>   	ret = tracing_check_open_get_tr(tr);
>>   	if (ret)
>>   		return ret;
>>   
>>   	mutex_lock(&trace_types_lock);
>> +	ret = open_pipe_on_cpu(tr, cpu);
>> +	if (ret)
>> +		goto fail_pipe_on_cpu;
>>   
>>   	/* create a buffer to store the information to pass to userspace */
>>   	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
>>   	if (!iter) {
>>   		ret = -ENOMEM;
>> -		__trace_array_put(tr);
>> -		goto out;
>> +		goto fail_alloc_iter;
>>   	}
>>   
>>   	trace_seq_init(&iter->seq);
>> @@ -6745,7 +6775,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>>   
>>   	iter->tr = tr;
>>   	iter->array_buffer = &tr->array_buffer;
>> -	iter->cpu_file = tracing_get_cpu(inode);
>> +	iter->cpu_file = cpu;
>>   	mutex_init(&iter->mutex);
>>   	filp->private_data = iter;
>>   
>> @@ -6755,12 +6785,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
>>   	nonseekable_open(inode, filp);
>>   
>>   	tr->trace_ref++;
>> -out:
>> +
>>   	mutex_unlock(&trace_types_lock);
>>   	return ret;
>>   
>>   fail:
>>   	kfree(iter);
>> +fail_alloc_iter:
>> +	close_pipe_on_cpu(tr, cpu);
>> +fail_pipe_on_cpu:
>>   	__trace_array_put(tr);
>>   	mutex_unlock(&trace_types_lock);
>>   	return ret;
>> @@ -6777,7 +6810,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
>>   
>>   	if (iter->trace->pipe_close)
>>   		iter->trace->pipe_close(iter);
>> -
>> +	close_pipe_on_cpu(tr, iter->cpu_file);
>>   	mutex_unlock(&trace_types_lock);
>>   
>>   	free_cpumask_var(iter->started);
>> @@ -9441,6 +9474,9 @@ static struct trace_array *trace_array_create(const char *name)
>>   	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
>>   		goto out_free_tr;
>>   
>> +	if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
>> +		goto out_free_tr;
>> +
>>   	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
>>   
>>   	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
>> @@ -9482,6 +9518,7 @@ static struct trace_array *trace_array_create(const char *name)
>>    out_free_tr:
>>   	ftrace_free_ftrace_ops(tr);
>>   	free_trace_buffers(tr);
>> +	free_cpumask_var(tr->pipe_cpumask);
>>   	free_cpumask_var(tr->tracing_cpumask);
>>   	kfree(tr->name);
>>   	kfree(tr);
>> @@ -9584,6 +9621,7 @@ static int __remove_instance(struct trace_array *tr)
>>   	}
>>   	kfree(tr->topts);
>>   
>> +	free_cpumask_var(tr->pipe_cpumask);
>>   	free_cpumask_var(tr->tracing_cpumask);
>>   	kfree(tr->name);
>>   	kfree(tr);
>> @@ -10381,12 +10419,14 @@ __init static int tracer_alloc_buffers(void)
>>   	if (trace_create_savedcmd() < 0)
>>   		goto out_free_temp_buffer;
>>   
>> +	if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
>> +		goto out_free_savedcmd;
>> +
>>   	/* TODO: make the number of buffers hot pluggable with CPUS */
>>   	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
>>   		MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
>> -		goto out_free_savedcmd;
>> +		goto out_free_pipe_cpumask;
>>   	}
>> -
>>   	if (global_trace.buffer_disabled)
>>   		tracing_off();
>>   
>> @@ -10439,6 +10479,8 @@ __init static int tracer_alloc_buffers(void)
>>   
>>   	return 0;
>>   
>> +out_free_pipe_cpumask:
>> +	free_cpumask_var(global_trace.pipe_cpumask);
>>   out_free_savedcmd:
>>   	free_saved_cmdlines_buffer(savedcmd);
>>   out_free_temp_buffer:
>> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
>> index e1edc2197fc8..53ac0f7780c2 100644
>> --- a/kernel/trace/trace.h
>> +++ b/kernel/trace/trace.h
>> @@ -377,6 +377,8 @@ struct trace_array {
>>   	struct list_head	events;
>>   	struct trace_event_file *trace_marker_file;
>>   	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
>> +	/* one per_cpu trace_pipe can be opened by only one user */
>> +	cpumask_var_t		pipe_cpumask;
>>   	int			ref;
>>   	int			trace_ref;
>>   #ifdef CONFIG_FUNCTION_TRACER
> 
>
  
Steven Rostedt Aug. 18, 2023, 1:43 a.m. UTC | #3
On Fri, 18 Aug 2023 09:38:27 +0800
Zheng Yejian <zhengyejian1@huawei.com> wrote:

> >> from being opened by multiple people at the same time.  
> > 
> > I'm fine with the CPU mask.  
> 
> Hi, Steve, if a task open a trace_pipe file, then concurrently read it
> with multiple threads, then the read race problem may also happen, this
> patch will not prevent this case.

I was thinking this too and started to look at a fix for that when I
realized, it shouldn't be a problem. The open creates the iterator and it's
attached to the file descriptor. All the threads will be using the same
file descriptor, which means they will be using the same iterator. That
iter->ent from one thread will be still valid for the next thread.

-- Steve


> 
> Do we need to consider this case? Or just tell user not to do like this
> through some documents?
  
Steven Rostedt Aug. 18, 2023, 1:41 p.m. UTC | #4
On Fri, 18 Aug 2023 14:03:09 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Looks good to me. 
> 
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> BTW, this reminds me that splice read support had been introduced for
> virtio-trace support. I've tested it with/without this patch, but I hit
> a different issue, which seems like a virtio-serial issue (per-cpu buffers
> on the guest are spliced correctly, but the data can not be read from host
> side). Let me investigate it.

Is virtio-trace using trace_pipe or trace_pipe_raw? Those two are handled
differently.

-- Steve
  
Masami Hiramatsu (Google) Aug. 19, 2023, 1:42 a.m. UTC | #5
On Fri, 18 Aug 2023 11:53:22 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Fri, 18 Aug 2023 23:23:01 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > returns -1 and errno == EAGAIN, the trace data will be lost?
> 
> It shouldn't. If it does, then there's likely a bug. The code will block
> and if an interrupt comes in it will return immediately without reading
> from the buffer.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> 
> I don't see where it would return -EINTR and consume data, but I may be
> missing something.

Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491

It seems not handling such case.

Anyway, I also think something wrong in virtio-serial (or misusing?), since
it can not read anything from the host sometimes. I just setup the virtio-trace
with below patch (ignore EAGAIN).


From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Thu, 17 Aug 2023 14:08:40 +0900
Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()

splice() can return EAGAIN error instead of returning 0 size read.
In that case, wait a while and try to call splice() again.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
index ddfe7875eb16..e8a4c4f0c499 100644
--- a/tools/virtio/virtio-trace/trace-agent-rw.c
+++ b/tools/virtio/virtio-trace/trace-agent-rw.c
@@ -8,6 +8,7 @@
  */
 
 #define _GNU_SOURCE
+#include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
 		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
 				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
 
-		if (rlen < 0) {
-			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
+		if (rlen < 0 && errno != EAGAIN) {
+			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
 			goto error;
-		} else if (rlen == 0) {
+		} else if (rlen == 0 || errno == EAGAIN) {
 			/*
 			 * If trace data do not exist or are unreadable not
 			 * for exceeding the page size, splice_read returns
  
Masami Hiramatsu (Google) Aug. 20, 2023, 1:18 p.m. UTC | #6
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 18 Aug 2023 11:53:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Fri, 18 Aug 2023 23:23:01 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > 
> > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > > returns -1 and errno == EAGAIN, the trace data will be lost?
> > 
> > It shouldn't. If it does, then there's likely a bug. The code will block
> > and if an interrupt comes in it will return immediately without reading
> > from the buffer.
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> > 
> > I don't see where it would return -EINTR and consume data, but I may be
> > missing something.
> 
> Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491
> 
> It seems not handling such case.
> 
> Anyway, I also think something wrong in virtio-serial (or misusing?), since
> it can not read anything from the host sometimes. I just setup the virtio-trace
> with below patch (ignore EAGAIN).

Hmm, I couldn't reproduce it. (maybe a host security update change something?)
Anyway, I confirmed that the ring buffer pages will not be consumed unless
splice_to_pipe() succeeded.

Thank you,

> 
> 
> From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> 
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
> index ddfe7875eb16..e8a4c4f0c499 100644
> --- a/tools/virtio/virtio-trace/trace-agent-rw.c
> +++ b/tools/virtio/virtio-trace/trace-agent-rw.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #define _GNU_SOURCE
> +#include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
>  		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
>  				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
>  
> -		if (rlen < 0) {
> -			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
> +		if (rlen < 0 && errno != EAGAIN) {
> +			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
>  			goto error;
> -		} else if (rlen == 0) {
> +		} else if (rlen == 0 || errno == EAGAIN) {
>  			/*
>  			 * If trace data do not exist or are unreadable not
>  			 * for exceeding the page size, splice_read returns
> -- 
> 2.34.1
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
  
Masami Hiramatsu (Google) Aug. 21, 2023, 2:19 a.m. UTC | #7
On Sat, 19 Aug 2023 10:42:57 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 18 Aug 2023 11:53:22 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Fri, 18 Aug 2023 23:23:01 +0900
> > Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > 
> > > It uses trace_pipe_raw. I guess if splice(from trace_pipe_raw to virtio-serial)
> > > returns -1 and errno == EAGAIN, the trace data will be lost?
> > 
> > It shouldn't. If it does, then there's likely a bug. The code will block
> > and if an interrupt comes in it will return immediately without reading
> > from the buffer.
> > 
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8262
> > 
> > I don't see where it would return -EINTR and consume data, but I may be
> > missing something.
> 
> Hmm, I suspect the case if the spilice_to_pipe() returns -EAGAIN.
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/trace/trace.c#n8491
> 
> It seems not handling such case.
> 
> Anyway, I also think something wrong in virtio-serial (or misusing?), since
> it can not read anything from the host sometimes. I just setup the virtio-trace
> with below patch (ignore EAGAIN).
> 
> 
> From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001
> From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> Date: Thu, 17 Aug 2023 14:08:40 +0900
> Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> 
> splice() can return EAGAIN error instead of returning 0 size read.
> In that case, wait a while and try to call splice() again.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/virtio/virtio-trace/trace-agent-rw.c b/tools/virtio/virtio-trace/trace-agent-rw.c
> index ddfe7875eb16..e8a4c4f0c499 100644
> --- a/tools/virtio/virtio-trace/trace-agent-rw.c
> +++ b/tools/virtio/virtio-trace/trace-agent-rw.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #define _GNU_SOURCE
> +#include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
>  #include <stdlib.h>
> @@ -127,10 +128,10 @@ static void *rw_thread_main(void *thread_info)
>  		rlen = splice(ts->in_fd, NULL, ts->read_pipe, NULL,
>  				ts->pipe_size, SPLICE_F_MOVE | SPLICE_F_MORE);
>  
> -		if (rlen < 0) {
> -			pr_err("Splice_read in rw-thread(%d)\n", ts->cpu_num);
> +		if (rlen < 0 && errno != EAGAIN) {
> +			pr_err("Splice_read error (%d) in rw-thread(%d)\n", errno, ts->cpu_num);
>  			goto error;
> -		} else if (rlen == 0) {
> +		} else if (rlen == 0 || errno == EAGAIN) {

Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
I've fixed this and that works.
BTW, I think this virtio-trace would be better to move under
tools/tracing because it is a tracing tool.

Thanks,

>  			/*
>  			 * If trace data do not exist or are unreadable not
>  			 * for exceeding the page size, splice_read returns
> -- 
> 2.34.1
> 
> 
> -- 
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
  
Steven Rostedt Aug. 21, 2023, 2:33 a.m. UTC | #8
On Mon, 21 Aug 2023 11:19:54 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
> I've fixed this and that works.
> BTW, I think this virtio-trace would be better to move under
> tools/tracing because it is a tracing tool.

I'm fine with that, as where it is, I'm very unfamiliar with this tool.
It is likely not taking advantage of all the tracing tooling we
have. I actually never even used it.

-- Steve
  
Masami Hiramatsu (Google) Aug. 21, 2023, 9:21 a.m. UTC | #9
On Sun, 20 Aug 2023 22:33:01 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 21 Aug 2023 11:19:54 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > Ah, this caused a drop. errno can be EAGAIN even if rlen > 0.
> > I've fixed this and that works.
> > BTW, I think this virtio-trace would be better to move under
> > tools/tracing because it is a tracing tool.
> 
> I'm fine with that, as where it is, I'm very unfamiliar with this tool.
> It is likely not taking advantage of all the tracing tooling we
> have. I actually never even used it.

Actually I also used this after a long time. :P
I think this is a kind of simplest splice support test tool.

What the tool does;

(guest side)
per_cpu/cpu*/trace_pipe_raw
     |
   (splice)
     |
  anon-pipe
     |
   (splice)
     |
  virtio-serial chardev
     |
= virtqueue ===
     |
  named-fifo
(host-side)

So that we can move the traced data (page) from the ring buffer to
virtqueue. Then host tool can read the trace data without copying.
(The host part needs a copy to read or write to file.)
Obviously, this requires some integration work with other tracing
tools, because this is just a "fastest trace-data dumper".

(I think Yoshihiro worked that integration, but it was not updated)

https://lkml.org/lkml/2013/9/12/788

Thank you,
  
Masami Hiramatsu (Google) Aug. 21, 2023, 10:32 p.m. UTC | #10
On Mon, 21 Aug 2023 11:17:25 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sat, 19 Aug 2023 10:42:57 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > >From 92242480285448360c9390a743ea7b3751bb3e61 Mon Sep 17 00:00:00 2001  
> > From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
> > Date: Thu, 17 Aug 2023 14:08:40 +0900
> > Subject: [PATCH 1/3] tools/virtio-trace: Ignore EAGAIN error on splice()
> > 
> > splice() can return EAGAIN error instead of returning 0 size read.
> > In that case, wait a while and try to call splice() again.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Bah,
> 
> When I pulled in Zheng's patch from patchwork, it included the above
> signed-off-by. I was confused to why it added your SoB.

Oops, I didn't noticed such patchwork behavior. 

> 
> Probably best if we had started this conversation on another thread and
> not hijack the discussion about this patch.

Agreed. Sorry for confusion.


> 
> -- Steve
> 
> 
> > ---
> >  tools/virtio/virtio-trace/trace-agent-rw.c | 7 ++++---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
>
  

Patch

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b8870078ef58..73f6f4d10d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6705,24 +6705,54 @@  tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 
 #endif
 
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+		if (cpumask_empty(tr->pipe_cpumask)) {
+			cpumask_setall(tr->pipe_cpumask);
+			return 0;
+		}
+	} else {
+		if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+			cpumask_set_cpu(cpu, tr->pipe_cpumask);
+			return 0;
+		}
+	}
+	return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+		WARN_ON(!cpumask_full(tr->pipe_cpumask));
+		cpumask_clear(tr->pipe_cpumask);
+	} else {
+		WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+		cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+	}
+}
+
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
 	struct trace_array *tr = inode->i_private;
 	struct trace_iterator *iter;
 	int ret;
+	int cpu = tracing_get_cpu(inode);
 
 	ret = tracing_check_open_get_tr(tr);
 	if (ret)
 		return ret;
 
 	mutex_lock(&trace_types_lock);
+	ret = open_pipe_on_cpu(tr, cpu);
+	if (ret)
+		goto fail_pipe_on_cpu;
 
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter) {
 		ret = -ENOMEM;
-		__trace_array_put(tr);
-		goto out;
+		goto fail_alloc_iter;
 	}
 
 	trace_seq_init(&iter->seq);
@@ -6745,7 +6775,7 @@  static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	iter->tr = tr;
 	iter->array_buffer = &tr->array_buffer;
-	iter->cpu_file = tracing_get_cpu(inode);
+	iter->cpu_file = cpu;
 	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
@@ -6755,12 +6785,15 @@  static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	nonseekable_open(inode, filp);
 
 	tr->trace_ref++;
-out:
+
 	mutex_unlock(&trace_types_lock);
 	return ret;
 
 fail:
 	kfree(iter);
+fail_alloc_iter:
+	close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
 	__trace_array_put(tr);
 	mutex_unlock(&trace_types_lock);
 	return ret;
@@ -6777,7 +6810,7 @@  static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	if (iter->trace->pipe_close)
 		iter->trace->pipe_close(iter);
-
+	close_pipe_on_cpu(tr, iter->cpu_file);
 	mutex_unlock(&trace_types_lock);
 
 	free_cpumask_var(iter->started);
@@ -9441,6 +9474,9 @@  static struct trace_array *trace_array_create(const char *name)
 	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
+	if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+		goto out_free_tr;
+
 	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9482,6 +9518,7 @@  static struct trace_array *trace_array_create(const char *name)
  out_free_tr:
 	ftrace_free_ftrace_ops(tr);
 	free_trace_buffers(tr);
+	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
 	kfree(tr->name);
 	kfree(tr);
@@ -9584,6 +9621,7 @@  static int __remove_instance(struct trace_array *tr)
 	}
 	kfree(tr->topts);
 
+	free_cpumask_var(tr->pipe_cpumask);
 	free_cpumask_var(tr->tracing_cpumask);
 	kfree(tr->name);
 	kfree(tr);
@@ -10381,12 +10419,14 @@  __init static int tracer_alloc_buffers(void)
 	if (trace_create_savedcmd() < 0)
 		goto out_free_temp_buffer;
 
+	if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+		goto out_free_savedcmd;
+
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
 		MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
-		goto out_free_savedcmd;
+		goto out_free_pipe_cpumask;
 	}
-
 	if (global_trace.buffer_disabled)
 		tracing_off();
 
@@ -10439,6 +10479,8 @@  __init static int tracer_alloc_buffers(void)
 
 	return 0;
 
+out_free_pipe_cpumask:
+	free_cpumask_var(global_trace.pipe_cpumask);
 out_free_savedcmd:
 	free_saved_cmdlines_buffer(savedcmd);
 out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..53ac0f7780c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@  struct trace_array {
 	struct list_head	events;
 	struct trace_event_file *trace_marker_file;
 	cpumask_var_t		tracing_cpumask; /* only trace on set CPUs */
+	/* one per_cpu trace_pipe can be opened by only one user */
+	cpumask_var_t		pipe_cpumask;
 	int			ref;
 	int			trace_ref;
 #ifdef CONFIG_FUNCTION_TRACER