[v2,1/6] fs: split off vfs_getdents function of getdents64 syscall

Message ID 20230422-uring-getdents-v2-1-2db1e37dc55e@codewreck.org
State New
Headers
Series io_uring: add getdents support, take 2 |

Commit Message

Dominique Martinet May 10, 2023, 10:52 a.m. UTC
  This splits off the vfs_getdents function from the getdents64 system
call.
This will allow io_uring to call the vfs_getdents function.

Co-authored-by: Stefan Roesch <shr@fb.com>
Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
---
 fs/internal.h |  8 ++++++++
 fs/readdir.c  | 34 ++++++++++++++++++++++++++--------
 2 files changed, 34 insertions(+), 8 deletions(-)
  

Comments

Christian Brauner May 23, 2023, 3:39 p.m. UTC | #1
On Wed, May 10, 2023 at 07:52:49PM +0900, Dominique Martinet wrote:
> This splits off the vfs_getdents function from the getdents64 system
> call.
> This will allow io_uring to call the vfs_getdents function.
> 
> Co-authored-by: Stefan Roesch <shr@fb.com>
> Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
> ---
>  fs/internal.h |  8 ++++++++
>  fs/readdir.c  | 34 ++++++++++++++++++++++++++--------
>  2 files changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index bd3b2810a36b..e8ca000e6613 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -260,3 +260,11 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
>  struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
>  struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
>  void mnt_idmap_put(struct mnt_idmap *idmap);
> +
> +/*
> + * fs/readdir.c
> + */
> +struct linux_dirent64;
> +
> +int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> +		 unsigned int count);
> diff --git a/fs/readdir.c b/fs/readdir.c
> index 9c53edb60c03..ed0803d0011e 100644
> --- a/fs/readdir.c
> +++ b/fs/readdir.c
> @@ -21,6 +21,7 @@
>  #include <linux/unistd.h>
>  #include <linux/compat.h>
>  #include <linux/uaccess.h>
> +#include "internal.h"
>  
>  #include <asm/unaligned.h>
>  
> @@ -351,10 +352,16 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
>  	return false;
>  }
>  
> -SYSCALL_DEFINE3(getdents64, unsigned int, fd,
> -		struct linux_dirent64 __user *, dirent, unsigned int, count)
> +
> +/**
> + * vfs_getdents - getdents without fdget
> + * @file    : pointer to file struct of directory
> + * @dirent  : pointer to user directory structure
> + * @count   : size of buffer
> + */
> +int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
> +		 unsigned int count)
>  {
> -	struct fd f;
>  	struct getdents_callback64 buf = {
>  		.ctx.actor = filldir64,
>  		.count = count,
> @@ -362,11 +369,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
>  	};
>  	int error;
>  
> -	f = fdget_pos(fd);
> -	if (!f.file)
> -		return -EBADF;
> -
> -	error = iterate_dir(f.file, &buf.ctx);
> +	error = iterate_dir(file, &buf.ctx);

So afaict this isn't enough.
If you look into iterate_shared() you should see that it uses and
updates f_pos. But that can't work for io_uring and also isn't how
io_uring handles read and write. You probably need to use a local pos
similar to what io_uring does in rw.c for rw->kiocb.ki_pos. But in
contrast simply disallow any offsets for getdents completely. Thus not
relying on f_pos anywhere at all.
  
Dominique Martinet May 23, 2023, 9:13 p.m. UTC | #2
Christian Brauner wrote on Tue, May 23, 2023 at 05:39:08PM +0200:
> > @@ -362,11 +369,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
> >  	};
> >  	int error;
> >  
> > -	f = fdget_pos(fd);
> > -	if (!f.file)
> > -		return -EBADF;
> > -
> > -	error = iterate_dir(f.file, &buf.ctx);
> > +	error = iterate_dir(file, &buf.ctx);
> 
> So afaict this isn't enough.
> If you look into iterate_shared() you should see that it uses and
> updates f_pos. But that can't work for io_uring and also isn't how
> io_uring handles read and write. You probably need to use a local pos
> similar to what io_uring does in rw.c for rw->kiocb.ki_pos. But in
> contrast simply disallow any offsets for getdents completely. Thus not
> relying on f_pos anywhere at all.

Using a private offset from the sqe was the previous implementation
discussed around here[1], and Al Viro pointed out that the iterate
filesystem implementations don't validate the offset makes sense as it's
either costly or for some filesystems downright impossible, so I went
into a don't let users modify it approach.

[1] https://lore.kernel.org/all/20211221164004.119663-1-shr@fb.com/T/#m517583f23502f32b040c819d930359313b3db00c


I agree it's not how io_uring usually works -- it dislikes global
states -- but it works perfectly well as long as you don't have multiple
users on the same file, which the application can take care of.

Not having any offsets would work for small directories but make reading
large directories impossible so some sort of continuation is required,
which means we need to keep the offset around; I also suggested keeping
the offset in argument as the previous version but only allowing the
last known offset (... so ultimately still updating f_pos anyway as we
don't have anywhere else to store it) or 0, but if we're going to do
that it looks much simpler to me to expose the same API as getdents.
  
Christian Brauner May 24, 2023, 1:52 p.m. UTC | #3
On Wed, May 24, 2023 at 06:13:57AM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Tue, May 23, 2023 at 05:39:08PM +0200:
> > > @@ -362,11 +369,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
> > >  	};
> > >  	int error;
> > >  
> > > -	f = fdget_pos(fd);
> > > -	if (!f.file)
> > > -		return -EBADF;
> > > -
> > > -	error = iterate_dir(f.file, &buf.ctx);
> > > +	error = iterate_dir(file, &buf.ctx);
> > 
> > So afaict this isn't enough.
> > If you look into iterate_shared() you should see that it uses and
> > updates f_pos. But that can't work for io_uring and also isn't how
> > io_uring handles read and write. You probably need to use a local pos
> > similar to what io_uring does in rw.c for rw->kiocb.ki_pos. But in
> > contrast simply disallow any offsets for getdents completely. Thus not
> > relying on f_pos anywhere at all.
> 
> Using a private offset from the sqe was the previous implementation
> discussed around here[1], and Al Viro pointed out that the iterate
> filesystem implementations don't validate the offset makes sense as it's
> either costly or for some filesystems downright impossible, so I went
> into a don't let users modify it approach.
> 
> [1] https://lore.kernel.org/all/20211221164004.119663-1-shr@fb.com/T/#m517583f23502f32b040c819d930359313b3db00c
> 
> 
> I agree it's not how io_uring usually works -- it dislikes global
> states -- but it works perfectly well as long as you don't have multiple
> users on the same file, which the application can take care of.
> 
> Not having any offsets would work for small directories but make reading
> large directories impossible so some sort of continuation is required,
> which means we need to keep the offset around; I also suggested keeping
> the offset in argument as the previous version but only allowing the
> last known offset (... so ultimately still updating f_pos anyway as we
> don't have anywhere else to store it) or 0, but if we're going to do
> that it looks much simpler to me to expose the same API as getdents.
> 
> -- 
> Dominique Martinet | Asmadeus

On Wed, May 24, 2023 at 06:05:06AM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Tue, May 23, 2023 at 04:30:14PM +0200:
> > > index b15ec81c1ed2..f6222b0148ef 100644
> > > --- a/io_uring/fs.c
> > > +++ b/io_uring/fs.c
> > > @@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > >  {
> > >  	struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents);
> > >  	unsigned long getdents_flags = 0;
> > > +	u32 cqe_flags = 0;
> > >  	int ret;
> > >  
> > >  	if (issue_flags & IO_URING_F_NONBLOCK) {
> > > @@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags)
> > >  			goto out;
> > >  	}
> > >  
> > > -	ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags);
> > > +	ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags);
> > 
> > I don't understand how synchronization and updating of f_pos works here.
> > For example, what happens if a concurrent seek happens on the fd while
> > io_uring is using vfs_getdents which calls into iterate_dir() and
> > updates f_pos?
> 
> I don't see how different that is from a user spawning two threads and
> calling getdents64 + lseek or two getdents64 in parallel?
> (or any two other users of iterate_dir)
> 
> As far as I understand you'll either get the old or new pos as
> obtained/updated by iterate_dir()?
> 
> That iterate_dir probably ought to be using READ_ONCE/WRITE_ONCE or some
> atomic read/update wrappers as the shared case only has a read lock
> around these, but that's not a new problem; and for all I care
> about I'm happy to let users shoot themselves in the foot.
> (although I guess that with filesystems not validating the offset as
> was pointed out in a previous version comment having non-atomic update
> might be a security issue at some point on architectures that don't
> guarantee atomic 64bit updates, but if someone manages to abuse it
> it's already possible to abuse it with the good old syscalls, so I'd
> rather leave that up to someone who understand how atomicity in the
> kernel works better than me...)

There's multiple issues here.

The main objection in [1] was to allow specifying an arbitrary offset
from userspace. What [3] did was to implement a pread() variant for
directories, i.e., pgetdents(). That can't work in principle/is
prohibitively complex. Which is what your series avoids by not allowing
any offsets to be specified.

However, there's still a problem here. Updates to f_pos happen under an
approriate lock to guarantee consistency of the position between calls
that move the cursor position. In the normal read-write path io_uring
doesn't concern itself with f_pos as it keeps local state in
kiocb->ki_pos.

But then it still does end up running into f_pos consistency problems
for read-write because it does allow operating on the current f_pos if
the offset if struct io_rw is set to -1.

In that case it does retrieve and update f_pos which should take
f_pos_lock and a patchset for this was posted but it didn't go anywhere.
It should probably hold that lock. See Jann's comments in the other
thread how that currently can lead to issues.

For getdents() not protecting f_pos is equally bad or worse. The patch
doesn't hold f_pos_lock and just updates f_pos prior _and_ post
iterate_dir() arguing that this race is fine. But again, f_version and
f_pos are consistent after each system call invocation.

But without that you can have a concurrent seek running and can end up
with an inconsistent f_pos position within the same system call. IOW,
you're breaking f_pos being in a well-known state. And you're not doing
that just for io_uring you're doing it for the regular system call
interface as well as both can be used on the same fd simultaneously.
So that's a no go imho.

> I don't see how different that is from a user spawning two threads and
> calling getdents64 + lseek or two getdents64 in parallel?
> (or any two other users of iterate_dir)

The difference is that in both cases f_pos_lock for both getdents and
lseek is held. So f_pos is in a good known state. You're not taking any
locks so now we're risking inconsistency within the same system call if
getdents and lseek run concurrently. Jens also mentioned that you could
even have this problem from within io_uring itself.

So tl;dr, there's no good reason to declare this an acceptable race
afaict. So either this is fixed properly or we're not doing it as far as
I'm concerned.

[1] https://lore.kernel.org/all/20211221164004.119663-1-shr@fb.com/T/#m517583f23502f32b040c819d930359313b3db00c
[2] https://lore.kernel.org/io-uring/20211221164004.119663-4-shr@fb.com
[3] https://lore.kernel.org/io-uring/20211221164004.119663-1-shr@fb.com
  
Dominique Martinet May 24, 2023, 9:36 p.m. UTC | #4
Christian Brauner wrote on Wed, May 24, 2023 at 03:52:45PM +0200:
> The main objection in [1] was to allow specifying an arbitrary offset
> from userspace. What [3] did was to implement a pread() variant for
> directories, i.e., pgetdents(). That can't work in principle/is
> prohibitively complex. Which is what your series avoids by not allowing
> any offsets to be specified.

Yes.

> However, there's still a problem here. Updates to f_pos happen under an
> approriate lock to guarantee consistency of the position between calls
> that move the cursor position. In the normal read-write path io_uring
> doesn't concern itself with f_pos as it keeps local state in
> kiocb->ki_pos.
> 
> But then it still does end up running into f_pos consistency problems
> for read-write because it does allow operating on the current f_pos if
> the offset if struct io_rw is set to -1.
> 
> In that case it does retrieve and update f_pos which should take
> f_pos_lock and a patchset for this was posted but it didn't go anywhere.
> It should probably hold that lock. See Jann's comments in the other
> thread how that currently can lead to issues.

Assuming that is this mail:
https://lore.kernel.org/io-uring/CAG48ez1O9VxSuWuLXBjke23YxUA8EhMP+6RCHo5PNQBf3B0pDQ@mail.gmail.com/

So, ok, I didn't realize fdget_pos() actually acted as a lock on the
file's f_pos_lock (apparently only if FMODE_ATMOIC_POS is set? but it
looks set on most if not all directories through finish_open(), that
looks called consistently enough)

What was confusing is that default_llseek updates f_pos under the
inode_lock (write), and getdents also takes that lock (for read only in
shared implem), so I assumed getdents also was just protected by this
read lock, but I guess that was a bad assumption (as I kept pointing
out, a shared read lock isn't good enough, we definitely agree there)


In practice, in the non-registered file case io_uring is also calling
fdget, so the lock is held exactly the same as the syscall and I wasn't
so far off -- but we need to figure something for the registered file
case.
openat variants don't allow working with the registered variant at all
on the parent fd, so as far as I'm concerned I'd be happy setting the
same limitation for getdents: just say it acts on fd and not files, and
call it a day...
It'd also be possible to check if REQ_F_FIXED_FILE flag was set and
manually take the lock somehow but we don't have any primitive that
takes f_pos_lock from a file (the only place that takes it is fdget
which requires a fd), so I'd rather not add such a new exception.
I assume the other patch you mentioned about adding that lock was this
one:
https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
and it just atkes the lock, but __fdget_pos also checks for
FMODE_ATOMIC_OPS and file_count and I'm not sure I understand how it
sets (f.flags & FDPUT_POS_UNLOCK) (for fdput_pos) so I'd rather not add
such a code path at this point..


So, ok, what do you think about just forbidding registered files?
I can't see where that wouldn't suffice but I might be missing something
else.

> For getdents() not protecting f_pos is equally bad or worse. The patch
> doesn't hold f_pos_lock and just updates f_pos prior _and_ post
> iterate_dir() arguing that this race is fine. But again, f_version and
> f_pos are consistent after each system call invocation.
> 
> But without that you can have a concurrent seek running and can end up
> with an inconsistent f_pos position within the same system call. IOW,
> you're breaking f_pos being in a well-known state. And you're not doing
> that just for io_uring you're doing it for the regular system call
> interface as well as both can be used on the same fd simultaneously.
> So that's a no go imho.

(so seek is fine, but I agree two concurrent getdents on registered
files won't have the required lock)
  
Christian Brauner May 25, 2023, 9:22 a.m. UTC | #5
On Thu, May 25, 2023 at 06:36:17AM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Wed, May 24, 2023 at 03:52:45PM +0200:
> > The main objection in [1] was to allow specifying an arbitrary offset
> > from userspace. What [3] did was to implement a pread() variant for
> > directories, i.e., pgetdents(). That can't work in principle/is
> > prohibitively complex. Which is what your series avoids by not allowing
> > any offsets to be specified.
> 
> Yes.
> 
> > However, there's still a problem here. Updates to f_pos happen under an
> > approriate lock to guarantee consistency of the position between calls
> > that move the cursor position. In the normal read-write path io_uring
> > doesn't concern itself with f_pos as it keeps local state in
> > kiocb->ki_pos.
> > 
> > But then it still does end up running into f_pos consistency problems
> > for read-write because it does allow operating on the current f_pos if
> > the offset if struct io_rw is set to -1.
> > 
> > In that case it does retrieve and update f_pos which should take
> > f_pos_lock and a patchset for this was posted but it didn't go anywhere.
> > It should probably hold that lock. See Jann's comments in the other
> > thread how that currently can lead to issues.
> 
> Assuming that is this mail:
> https://lore.kernel.org/io-uring/CAG48ez1O9VxSuWuLXBjke23YxUA8EhMP+6RCHo5PNQBf3B0pDQ@mail.gmail.com/
> 
> So, ok, I didn't realize fdget_pos() actually acted as a lock on the
> file's f_pos_lock (apparently only if FMODE_ATMOIC_POS is set? but it
> looks set on most if not all directories through finish_open(), that
> looks called consistently enough)

It's set for any regular file and directory.

> 
> What was confusing is that default_llseek updates f_pos under the
> inode_lock (write), and getdents also takes that lock (for read only in
> shared implem), so I assumed getdents also was just protected by this
> read lock, but I guess that was a bad assumption (as I kept pointing
> out, a shared read lock isn't good enough, we definitely agree there)
> 
> 
> In practice, in the non-registered file case io_uring is also calling
> fdget, so the lock is held exactly the same as the syscall and I wasn't

No, it really isn't. fdget() doesn't take f_pos_lock at all:

fdget()
-> __fdget()
   -> __fget_light()
      -> __fget()
         -> __fget_files()
            -> __fget_files_rcu()

If that were true then any system call that passes an fd and uses
fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
every *at based system call on f_pos_lock whenever we have multiple fds
referring to the same file trying to operate on it concurrently.

We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
a select group of system calls that require this synchronization.

> so far off -- but we need to figure something for the registered file
> case.
> openat variants don't allow working with the registered variant at all
> on the parent fd, so as far as I'm concerned I'd be happy setting the
> same limitation for getdents: just say it acts on fd and not files, and
> call it a day...

I don't follow. Also this is hacky so no.

The reason why io_uring *at implementations don't work with fixed files
is that the VFS interface expect regular fds. You could very well make
this work for fixed files but why. It would mean exposing a whole new
set of vfs helpers to io_uring and would probably involve nasty corner
cases.

Also the connection between regular and fixed files in io_uring is
pretty much fluent. While fixed files can only remain pinned in an
io_uring instance it requires that the caller explicitly gave up all
references in their fdtable to that struct file by closing all fds
referring to the same file.

But there's no guarantee. For example, if another thread dups the fd or
the caller sends the fd via SCM_RIGHTS to another process or the caller
simply doesn't close the fd or another thread gets an fd to the same
file from that task via pidfd_getfd before it closed it this doesn't hold.

So it's very well possible to have an fd and a fixed io_uring reference
referring to the same file. The first one can be used with the regular
system call interface and io_uring *at requests that forbid fixed files.
And the other one can be used for i_uring fixed file operations. Doesn't
matter if that shouldn't be done, it's possible afaict.

For regular and fixed files you also have the same problem from within
the same io_uring instance where you can have concurrent getdent
requests. You'd end up producing the exact same inconsistencies.

> It'd also be possible to check if REQ_F_FIXED_FILE flag was set and
> manually take the lock somehow but we don't have any primitive that
> takes f_pos_lock from a file (the only place that takes it is fdget
> which requires a fd), so I'd rather not add such a new exception.
> I assume the other patch you mentioned about adding that lock was this
> one:
> https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
> and it just atkes the lock, but __fdget_pos also checks for
> FMODE_ATOMIC_OPS and file_count and I'm not sure I understand how it
> sets (f.flags & FDPUT_POS_UNLOCK) (for fdput_pos) so I'd rather not add
> such a code path at this point..
> 
> 
> So, ok, what do you think about just forbidding registered files?
> I can't see where that wouldn't suffice but I might be missing something
> else.

It doesn't help.
  
Dominique Martinet May 25, 2023, 11 a.m. UTC | #6
Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
> > What was confusing is that default_llseek updates f_pos under the
> > inode_lock (write), and getdents also takes that lock (for read only in
> > shared implem), so I assumed getdents also was just protected by this
> > read lock, but I guess that was a bad assumption (as I kept pointing
> > out, a shared read lock isn't good enough, we definitely agree there)
> > 
> > 
> > In practice, in the non-registered file case io_uring is also calling
> > fdget, so the lock is held exactly the same as the syscall and I wasn't
> 
> No, it really isn't. fdget() doesn't take f_pos_lock at all:
> 
> fdget()
> -> __fdget()
>    -> __fget_light()
>       -> __fget()
>          -> __fget_files()
>             -> __fget_files_rcu()

Ugh, I managed to not notice that I was looking at fdget_pos and that
it's not the same as fdget by the time I wrote two paragraphs... These
functions all have too many wrappers and too similar names for a quick
look before work.

> If that were true then any system call that passes an fd and uses
> fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
> every *at based system call on f_pos_lock whenever we have multiple fds
> referring to the same file trying to operate on it concurrently.
> 
> We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
> a select group of system calls that require this synchronization.

Right, that makes sense, and invalidates everything I said after that
anyway but it's not like looking stupid ever killed anyone.

Ok so it would require adding a new wrapper from struct file to struct
fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
fdput_pos but another function for that stopping short of fdput...
Then just call that around both vfs_llseek and vfs_getdents calls; which
is the easy part.

(Or possibly call mutex_lock directly like Dylan did in [1]...)
[1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6



I'll be honest though I'm thankful for your explanations but I think
I'll just do like Stefan and stop trying for now: the only reason I've
started this was because I wanted to play with io_uring for a new toy
project and it felt awkward without a getdents for crawling a tree; and
I'm long past the point where I should have thrown the towel and just
make that a sequential walk.
There's too many "conditional patches" (NOWAIT, end of dir indicator)
that I don't care about and require additional work to rebase
continuously so I'll just leave it up to someone else who does care.

So to that someone: feel free to continue from these branches (I've
included the fix for kernfs_fop_readdir that Dan Carpenter reported):
https://github.com/martinetd/linux/commits/io_uring_getdents
https://github.com/martinetd/liburing/commits/getdents

Or just start over, there's not that much code now hopefully the
baseline requirements have gotten a little bit clearer.


Sorry for stirring the mess and leaving halfway, if nobody does continue
I might send a v3 when I have more time/energy in a few months, but it
won't be quick.
  
Christian Brauner May 25, 2023, 12:33 p.m. UTC | #7
On Thu, May 25, 2023 at 08:00:02PM +0900, Dominique Martinet wrote:
> Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
> > > What was confusing is that default_llseek updates f_pos under the
> > > inode_lock (write), and getdents also takes that lock (for read only in
> > > shared implem), so I assumed getdents also was just protected by this
> > > read lock, but I guess that was a bad assumption (as I kept pointing
> > > out, a shared read lock isn't good enough, we definitely agree there)
> > > 
> > > 
> > > In practice, in the non-registered file case io_uring is also calling
> > > fdget, so the lock is held exactly the same as the syscall and I wasn't
> > 
> > No, it really isn't. fdget() doesn't take f_pos_lock at all:
> > 
> > fdget()
> > -> __fdget()
> >    -> __fget_light()
> >       -> __fget()
> >          -> __fget_files()
> >             -> __fget_files_rcu()
> 
> Ugh, I managed to not notice that I was looking at fdget_pos and that
> it's not the same as fdget by the time I wrote two paragraphs... These
> functions all have too many wrappers and too similar names for a quick
> look before work.
> 
> > If that were true then any system call that passes an fd and uses
> > fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
> > every *at based system call on f_pos_lock whenever we have multiple fds
> > referring to the same file trying to operate on it concurrently.
> > 
> > We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
> > a select group of system calls that require this synchronization.
> 
> Right, that makes sense, and invalidates everything I said after that
> anyway but it's not like looking stupid ever killed anyone.

I strongly disagree with the looking stupid part. These callchains are
quite unwieldy and it's easy to get confused. Usually if you receive a
long mail about the semantics involved - as in the earlier thread - it
means there's landmines all over.

> 
> Ok so it would require adding a new wrapper from struct file to struct
> fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
> fdput_pos but another function for that stopping short of fdput...
> Then just call that around both vfs_llseek and vfs_getdents calls; which
> is the easy part.
> 
> (Or possibly call mutex_lock directly like Dylan did in [1]...)
> [1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6

We'd need a consistent story whatever it ends up being.

> I'll be honest though I'm thankful for your explanations but I think
> I'll just do like Stefan and stop trying for now: the only reason I've
> started this was because I wanted to play with io_uring for a new toy
> project and it felt awkward without a getdents for crawling a tree; and
> I'm long past the point where I should have thrown the towel and just
> make that a sequential walk.
> There's too many "conditional patches" (NOWAIT, end of dir indicator)
> that I don't care about and require additional work to rebase
> continuously so I'll just leave it up to someone else who does care.
> 
> So to that someone: feel free to continue from these branches (I've
> included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> https://github.com/martinetd/linux/commits/io_uring_getdents
> https://github.com/martinetd/liburing/commits/getdents
> 
> Or just start over, there's not that much code now hopefully the
> baseline requirements have gotten a little bit clearer.
> 
> 
> Sorry for stirring the mess and leaving halfway, if nobody does continue
> I might send a v3 when I have more time/energy in a few months, but it
> won't be quick.

It's fine.
  
Hao Xu July 11, 2023, 8:17 a.m. UTC | #8
On 5/25/23 19:00, Dominique Martinet wrote:
> Christian Brauner wrote on Thu, May 25, 2023 at 11:22:08AM +0200:
>>> What was confusing is that default_llseek updates f_pos under the
>>> inode_lock (write), and getdents also takes that lock (for read only in
>>> shared implem), so I assumed getdents also was just protected by this
>>> read lock, but I guess that was a bad assumption (as I kept pointing
>>> out, a shared read lock isn't good enough, we definitely agree there)
>>>
>>>
>>> In practice, in the non-registered file case io_uring is also calling
>>> fdget, so the lock is held exactly the same as the syscall and I wasn't
>>
>> No, it really isn't. fdget() doesn't take f_pos_lock at all:
>>
>> fdget()
>> -> __fdget()
>>     -> __fget_light()
>>        -> __fget()
>>           -> __fget_files()
>>              -> __fget_files_rcu()
> 
> Ugh, I managed to not notice that I was looking at fdget_pos and that
> it's not the same as fdget by the time I wrote two paragraphs... These
> functions all have too many wrappers and too similar names for a quick
> look before work.
> 
>> If that were true then any system call that passes an fd and uses
>> fdget() would try to acquire a mutex on f_pos_lock. We'd be serializing
>> every *at based system call on f_pos_lock whenever we have multiple fds
>> referring to the same file trying to operate on it concurrently.
>>
>> We do have fdget_pos() and fdput_pos() as a special purpose fdget() for
>> a select group of system calls that require this synchronization.
> 
> Right, that makes sense, and invalidates everything I said after that
> anyway but it's not like looking stupid ever killed anyone.
> 
> Ok so it would require adding a new wrapper from struct file to struct
> fd that'd eventually take the lock and set FDPUT_POS_UNLOCK for... not
> fdput_pos but another function for that stopping short of fdput...
> Then just call that around both vfs_llseek and vfs_getdents calls; which
> is the easy part.
> 
> (Or possibly call mutex_lock directly like Dylan did in [1]...)
> [1] https://lore.kernel.org/all/20220222105504.3331010-1-dylany@fb.com/T/#m3609dc8057d0bc8e41ceab643e4d630f7b91bde6
> 
> 
> 
> I'll be honest though I'm thankful for your explanations but I think
> I'll just do like Stefan and stop trying for now: the only reason I've
> started this was because I wanted to play with io_uring for a new toy
> project and it felt awkward without a getdents for crawling a tree; and
> I'm long past the point where I should have thrown the towel and just
> make that a sequential walk.
> There's too many "conditional patches" (NOWAIT, end of dir indicator)
> that I don't care about and require additional work to rebase
> continuously so I'll just leave it up to someone else who does care.
> 
> So to that someone: feel free to continue from these branches (I've
> included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> https://github.com/martinetd/linux/commits/io_uring_getdents
> https://github.com/martinetd/liburing/commits/getdents
> 
> Or just start over, there's not that much code now hopefully the
> baseline requirements have gotten a little bit clearer.
> 
> 
> Sorry for stirring the mess and leaving halfway, if nobody does continue
> I might send a v3 when I have more time/energy in a few months, but it
> won't be quick.
> 

Hi Dominique,

I'd like to take this if you don't mind.

Regards,
Hao
  
Dominique Martinet July 11, 2023, 8:24 a.m. UTC | #9
Hao Xu wrote on Tue, Jul 11, 2023 at 04:17:11PM +0800:
> > So to that someone: feel free to continue from these branches (I've
> > included the fix for kernfs_fop_readdir that Dan Carpenter reported):
> > https://github.com/martinetd/linux/commits/io_uring_getdents
> > https://github.com/martinetd/liburing/commits/getdents
> > 
> > Or just start over, there's not that much code now hopefully the
> > baseline requirements have gotten a little bit clearer.
> > 
> > 
> > Sorry for stirring the mess and leaving halfway, if nobody does continue
> > I might send a v3 when I have more time/energy in a few months, but it
> > won't be quick.
> 
> I'd like to take this if you don't mind.

Sure, I haven't been working on this lately, feel free to work on this.

Will be happy to review anything you send based on what came out of the
previous discussions to save Christian and others some time so you can
keep me in Cc if you'd like.
  

Patch

diff --git a/fs/internal.h b/fs/internal.h
index bd3b2810a36b..e8ca000e6613 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -260,3 +260,11 @@  ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
+
+/*
+ * fs/readdir.c
+ */
+struct linux_dirent64;
+
+int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
+		 unsigned int count);
diff --git a/fs/readdir.c b/fs/readdir.c
index 9c53edb60c03..ed0803d0011e 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -21,6 +21,7 @@ 
 #include <linux/unistd.h>
 #include <linux/compat.h>
 #include <linux/uaccess.h>
+#include "internal.h"
 
 #include <asm/unaligned.h>
 
@@ -351,10 +352,16 @@  static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
 	return false;
 }
 
-SYSCALL_DEFINE3(getdents64, unsigned int, fd,
-		struct linux_dirent64 __user *, dirent, unsigned int, count)
+
+/**
+ * vfs_getdents - getdents without fdget
+ * @file    : pointer to file struct of directory
+ * @dirent  : pointer to user directory structure
+ * @count   : size of buffer
+ */
+int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent,
+		 unsigned int count)
 {
-	struct fd f;
 	struct getdents_callback64 buf = {
 		.ctx.actor = filldir64,
 		.count = count,
@@ -362,11 +369,7 @@  SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	};
 	int error;
 
-	f = fdget_pos(fd);
-	if (!f.file)
-		return -EBADF;
-
-	error = iterate_dir(f.file, &buf.ctx);
+	error = iterate_dir(file, &buf.ctx);
 	if (error >= 0)
 		error = buf.error;
 	if (buf.prev_reclen) {
@@ -379,6 +382,21 @@  SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		else
 			error = count - buf.count;
 	}
+	return error;
+}
+
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	int error;
+
+	f = fdget_pos(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = vfs_getdents(f.file, dirent, count);
+
 	fdput_pos(f);
 	return error;
 }