[v2,01/11] splice: copy_splice_read: do the I/O with IOCB_NOWAIT

Message ID d87ac02081f2d698dde10da7da51336afc59b480.1703126594.git.nabijaczleweli@nabijaczleweli.xyz
State New
Headers
Series Avoid unprivileged splice(file->)/(->socket) pipe exclusion |

Commit Message

Ahelenia Ziemiańska Dec. 21, 2023, 3:08 a.m. UTC
  Otherwise we risk sleeping with the pipe locked for indeterminate
lengths of time ‒ given:
	cat > udp.c <<^D
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/udp.h>
	int main()
	{
		int s = socket(AF_INET, SOCK_DGRAM, 0);
		bind(s,
		     &(struct sockaddr_in){ .sin_family = AF_INET,
					    .sin_addr.s_addr = htonl(INADDR_ANY) },
		     sizeof(struct sockaddr_in));
		for (;;)
			splice(s, 0, 1, 0, 128 * 1024 * 1024, 0);
	}
	^D
	cc udp.c -o udp
	mkfifo fifo
	./udp > fifo &
	read -r _ < fifo &
	sleep 0.1
	echo zupa > fifo
udp used to sleep in splice and the shell used to enter an
uninterruptible sleep in open("fifo");
now the splice returns -EAGAIN and the whole program completes.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 fs/splice.c | 1 +
 1 file changed, 1 insertion(+)
  

Comments

Christoph Hellwig Dec. 21, 2023, 8:27 a.m. UTC | #1
On Thu, Dec 21, 2023 at 04:08:45AM +0100, Ahelenia Ziemiańska wrote:
> Otherwise we risk sleeping with the pipe locked for indeterminate

You can't just assume that any ->read_iter support IOCB_NOWAIT.
  
Ahelenia Ziemiańska Dec. 21, 2023, 4:30 p.m. UTC | #2
On Thu, Dec 21, 2023 at 12:27:12AM -0800, Christoph Hellwig wrote:
> On Thu, Dec 21, 2023 at 04:08:45AM +0100, Ahelenia Ziemiańska wrote:
> > Otherwise we risk sleeping with the pipe locked for indeterminate
> You can't just assume that any ->read_iter support IOCB_NOWAIT.
Let's see.

zero_fops	drivers/char/mem.c:     .splice_read    = copy_splice_read,
full_fops	drivers/char/mem.c:     .splice_read    = copy_splice_read,

random_fops	drivers/char/random.c:  .splice_read = copy_splice_read,
random_read_iter checks
urandom_fops	drivers/char/random.c:  .splice_read = copy_splice_read,
urandom_read_iter returns instantly

if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
fs/splice.c:            return copy_splice_read(in, ppos, pipe, len, flags);
FMODE_CAN_ODIRECT is set by filesystems and blockdevs, so trusted

fs/9p/vfs_file.c:               return copy_splice_read(in, ppos, pipe, len, flags);
fs/ceph/file.c:         return copy_splice_read(in, ppos, pipe, len, flags);
fs/ceph/file.c:         return copy_splice_read(in, ppos, pipe, len, flags);
fs/gfs2/file.c: .splice_read    = copy_splice_read,
fs/gfs2/file.c: .splice_read    = copy_splice_read,
fs/kernfs/file.c:       .splice_read    = copy_splice_read,
fs/smb/client/cifsfs.c: .splice_read = copy_splice_read,
fs/smb/client/cifsfs.c: .splice_read = copy_splice_read,
fs/proc/inode.c:        .splice_read    = copy_splice_read,
fs/proc/inode.c:        .splice_read    = copy_splice_read,
fs/proc/proc_sysctl.c:  .splice_read    = copy_splice_read,
fs/proc_namespace.c:    .splice_read    = copy_splice_read,
fs/proc_namespace.c:    .splice_read    = copy_splice_read,
fs/proc_namespace.c:    .splice_read    = copy_splice_read,
filesystems => trusted

tracing_fops
kernel/trace/trace.c:   .splice_read    = copy_splice_read,
used in /sys/kernel/debug/tracing/per_cpu/cpu*/trace
    and /sys/kernel/debug/tracing/trace
which are seq_read_iter and even if they did block,
it's in tracefs so same logic as tracing_buffers_splice_read applies.

net/socket.c:           return copy_splice_read(file, ppos, pipe, len, flags);
this is the default implementation for protocols without explicit
splice_reads, and sock_read_iter translates IOCB_NOWAIT into
MSG_DONTAIT.


So I think I can, because the ~three implementations that we want
to constrain do support it. If anything, this hints to me that to
yield a more consistent API that doesn't arbitrarily distinguish
between O_DIRECT files with and without IOCB_NOWAIT support, something
to the effect of the following diff may be used.

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..dc42837ee0af 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -392,7 +392,7 @@ static ssize_t v9fs_file_splice_read(struct file *in, loff_t *ppos,
 		 fid->fid, len, *ppos);
 
 	if (fid->mode & P9L_DIRECT)
-		return copy_splice_read(in, ppos, pipe, len, flags);
+		return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
 	return filemap_splice_read(in, ppos, pipe, len, flags);
 }
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..9a4679013135 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2209,7 +2209,7 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
 
 	if (ceph_has_inline_data(ci) ||
 	    (fi->flags & CEPH_F_SYNC))
-		return copy_splice_read(in, ppos, pipe, len, flags);
+		return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
 
 	ceph_start_io_read(inode);
 
@@ -2228,7 +2228,7 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos,
 
 		ceph_put_cap_refs(ci, got);
 		ceph_end_io_read(inode);
-		return copy_splice_read(in, ppos, pipe, len, flags);
+		return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
 	}
 
 	dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n",
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b66efc1a82a..5b0cbb6b95c4 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1581,7 +1581,7 @@ const struct file_operations gfs2_file_fops = {
 	.fsync		= gfs2_fsync,
 	.lock		= gfs2_lock,
 	.flock		= gfs2_flock,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.splice_write	= gfs2_file_splice_write,
 	.setlease	= simple_nosetlease,
 	.fallocate	= gfs2_fallocate,
@@ -1612,7 +1612,7 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.open		= gfs2_open,
 	.release	= gfs2_release,
 	.fsync		= gfs2_fsync,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.splice_write	= gfs2_file_splice_write,
 	.setlease	= generic_setlease,
 	.fallocate	= gfs2_fallocate,
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f0cb729e9a97..f0b6e85b2c5b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -989,7 +989,7 @@ const struct file_operations kernfs_file_fops = {
 	.release	= kernfs_fop_release,
 	.poll		= kernfs_fop_poll,
 	.fsync		= noop_fsync,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.splice_write	= iter_file_splice_write,
 };
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b33e490e3fd9..7ec2f4653299 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -588,7 +588,7 @@ static const struct file_operations proc_iter_file_ops = {
 	.llseek		= proc_reg_llseek,
 	.read_iter	= proc_reg_read_iter,
 	.write		= proc_reg_write,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.poll		= proc_reg_poll,
 	.unlocked_ioctl	= proc_reg_unlocked_ioctl,
 	.mmap		= proc_reg_mmap,
@@ -614,7 +614,7 @@ static const struct file_operations proc_reg_file_ops_compat = {
 static const struct file_operations proc_iter_file_ops_compat = {
 	.llseek		= proc_reg_llseek,
 	.read_iter	= proc_reg_read_iter,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.write		= proc_reg_write,
 	.poll		= proc_reg_poll,
 	.unlocked_ioctl	= proc_reg_unlocked_ioctl,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8064ea76f80b..11d26fd14e7d 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -864,7 +864,7 @@ static const struct file_operations proc_sys_file_operations = {
 	.poll		= proc_sys_poll,
 	.read_iter	= proc_sys_read,
 	.write_iter	= proc_sys_write,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.splice_write	= iter_file_splice_write,
 	.llseek		= default_llseek,
 };
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..e9d19a856dd7 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -324,7 +324,7 @@ static int mountstats_open(struct inode *inode, struct file *file)
 const struct file_operations proc_mounts_operations = {
 	.open		= mounts_open,
 	.read_iter	= seq_read_iter,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.llseek		= seq_lseek,
 	.release	= mounts_release,
 	.poll		= mounts_poll,
@@ -333,7 +333,7 @@ const struct file_operations proc_mounts_operations = {
 const struct file_operations proc_mountinfo_operations = {
 	.open		= mountinfo_open,
 	.read_iter	= seq_read_iter,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.llseek		= seq_lseek,
 	.release	= mounts_release,
 	.poll		= mounts_poll,
@@ -342,7 +342,7 @@ const struct file_operations proc_mountinfo_operations = {
 const struct file_operations proc_mountstats_operations = {
 	.open		= mountstats_open,
 	.read_iter	= seq_read_iter,
-	.splice_read	= copy_splice_read,
+	.splice_read	= copy_splice_read_sleepok,
 	.llseek		= seq_lseek,
 	.release	= mounts_release,
 };
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2131638f26d0..5f9fb3ce3bcb 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1561,7 +1561,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
 	.mmap = cifs_file_mmap,
-	.splice_read = copy_splice_read,
+	.splice_read = copy_splice_read_sleepok,
 	.splice_write = iter_file_splice_write,
 	.unlocked_ioctl  = cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
@@ -1615,7 +1615,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
 	.mmap = cifs_file_mmap,
-	.splice_read = copy_splice_read,
+	.splice_read = copy_splice_read_sleepok,
 	.splice_write = iter_file_splice_write,
 	.unlocked_ioctl  = cifs_ioctl,
 	.copy_file_range = cifs_copy_file_range,
diff --git a/fs/splice.c b/fs/splice.c
index 2871c6f9366f..90ebcf236c05 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -298,12 +298,14 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
 }
 
 /**
- * copy_splice_read -  Copy data from a file and splice the copy into a pipe
+ * __copy_splice_read -  Copy data from a file and splice the copy into a pipe
  * @in: The file to read from
  * @ppos: Pointer to the file position to read from
  * @pipe: The pipe to splice into
  * @len: The amount to splice
  * @flags: The SPLICE_F_* flags
+ * @sleepok: Set if splicing from a trusted filesystem,
+ *           don't set if splicing from an IPC mechanism
  *
  * This function allocates a bunch of pages sufficient to hold the requested
  * amount of data (but limited by the remaining pipe capacity), passes it to
@@ -317,10 +319,11 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
  * if the pipe has insufficient space, we reach the end of the data or we hit a
  * hole.
  */
-ssize_t copy_splice_read(struct file *in, loff_t *ppos,
-			 struct pipe_inode_info *pipe,
-			 size_t len, unsigned int flags)
+static ssize_t __copy_splice_read(struct file *in, loff_t *ppos,
+				  struct pipe_inode_info *pipe,
+				  size_t len, unsigned int flags, bool sleepok)
 {
+	printk("__copy_splice_read(%d)\n", sleepok);
 	struct iov_iter to;
 	struct bio_vec *bv;
 	struct kiocb kiocb;
@@ -361,7 +364,8 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_flags |= IOCB_NOWAIT;
+	if (!sleepok)
+		kiocb.ki_flags |= IOCB_NOWAIT;
 	ret = call_read_iter(in, &kiocb, &to);
 
 	if (ret > 0) {
@@ -399,8 +403,21 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 	kfree(bv);
 	return ret;
 }
+
+ssize_t copy_splice_read(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe,
+			 size_t len, unsigned int flags) {
+	return __copy_splice_read(in, ppos, pipe, len, flags, false);
+}
 EXPORT_SYMBOL(copy_splice_read);
 
+ssize_t copy_splice_read_sleepok(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe,
+				 size_t len, unsigned int flags) {
+	return __copy_splice_read(in, ppos, pipe, len, flags, true);
+}
+EXPORT_SYMBOL(copy_splice_read_sleepok);
+
 const struct pipe_buf_operations default_pipe_buf_ops = {
 	.release	= generic_pipe_buf_release,
 	.try_steal	= generic_pipe_buf_try_steal,
@@ -988,7 +1005,7 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
 	 * buffer, copy into it and splice that into the pipe.
 	 */
 	if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
-		return copy_splice_read(in, ppos, pipe, len, flags);
+		return copy_splice_read_sleepok(in, ppos, pipe, len, flags);
 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 EXPORT_SYMBOL_GPL(vfs_splice_read);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..0980bf6ba8fd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2989,6 +2989,9 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
 ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe,
 			 size_t len, unsigned int flags);
+ssize_t copy_splice_read_sleepok(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe,
+			 size_t len, unsigned int flags);
 extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
  

Patch

diff --git a/fs/splice.c b/fs/splice.c
index d983d375ff11..9d29664f23ee 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -361,6 +361,7 @@  ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
+	kiocb.ki_flags |= IOCB_NOWAIT;
 	ret = call_read_iter(in, &kiocb, &to);
 
 	if (ret > 0) {