[v2,(kindof)] fs: use __fput_sync in close(2)

Message ID CAGudoHGqRr_WNz86pmgK9Kmnwsox+_XXqqbp+rLW53e5t8higg@mail.gmail.com
State New
Headers
Series [v2,(kindof)] fs: use __fput_sync in close(2) |

Commit Message

Mateusz Guzik Aug. 8, 2023, 3:07 p.m. UTC
  I slapped the following variant just for illustration purposes.

- adds __close_fd which returns a struct file
- adds __filp_close with a flag whether to fput
- makes close(2) use both
- transparent to everyone else

Downside is that __fput_sync still loses the assert. Instead of
losing, it could perhaps be extended with a hack to check syscall
number -- pass if either this is close (or binary compat close) or a
kthread, BUG out otherwise. Alternatively perhaps deref could be
opencoded along with a comment about real fput that this is taking
place. Or maybe some other cosmetic choice.

I cannot compile-test right now, so down below is a rough copy make
sure it is clear what I mean.

I feel compelled to note that simple patches get microbenchmarked all
the time, with these results being the only justification provided.
I'm confused why this patch is supposed to be an exception given its
simplicity.

Serious justification should be expected from tough calls --
complicated, invasive changes, maybe with numerous tradeoffs.

In contrast close(2) doing __fput_sync looks a clear cut thing to do,
at worst one can argue which way to do it.

 extern struct filename *getname_uflags(const char __user *, int);
  

Comments

Christian Brauner Aug. 8, 2023, 4:30 p.m. UTC | #1
On Tue, Aug 08, 2023 at 05:07:22PM +0200, Mateusz Guzik wrote:
> I slapped the following variant just for illustration purposes.
> 
> - adds __close_fd which returns a struct file
> - adds __filp_close with a flag whether to fput
> - makes close(2) use both
> - transparent to everyone else
> 
> Downside is that __fput_sync still loses the assert. Instead of
> losing, it could perhaps be extended with a hack to check syscall
> number -- pass if either this is close (or binary compat close) or a
> kthread, BUG out otherwise. Alternatively perhaps deref could be
> opencoded along with a comment about real fput that this is taking
> place. Or maybe some other cosmetic choice.
> 
> I cannot compile-test right now, so down below is a rough copy make
> sure it is clear what I mean.
> 
> I feel compelled to note that simple patches get microbenchmarked all
> the time, with these results being the only justification provided.
> I'm confused why this patch is supposed to be an exception given its
> simplicity.
> 
> Serious justification should be expected from tough calls --
> complicated, invasive changes, maybe with numerous tradeoffs.
> 
> In contrast close(2) doing __fput_sync looks a clear cut thing to do,
> at worst one can argue which way to do it.
> 
> diff --git a/fs/file.c b/fs/file.c
> index 3fd003a8604f..c341b07533b0 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -651,20 +651,30 @@ static struct file *pick_file(struct
> files_struct *files, unsigned fd)
>         return file;
>  }
> 
> -int close_fd(unsigned fd)
> +struct file *__close_fd(unsigned fd, struct file_struct *files)
>  {
> -       struct files_struct *files = current->files;
>         struct file *file;
> 
>         spin_lock(&files->file_lock);
>         file = pick_file(files, fd);
>         spin_unlock(&files->file_lock);
> +
> +       return file;
> +}
> +EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> +
> +int close_fd(unsigned fd)
> +{
> +       struct files_struct *files = current->files;
> +       struct file *file;
> +
> +       file = __close_fd(fd, files);
>         if (!file)
>                 return -EBADF;
> 
>         return filp_close(file, files);
>  }
> -EXPORT_SYMBOL(close_fd); /* for ksys_close() */
> +EXPORT_SYMBOL(close_fd);
> 
>  /**
>   * last_fd - return last valid index into fd table
> diff --git a/fs/file_table.c b/fs/file_table.c
> index fc7d677ff5ad..b7461f0b73f4 100644
> --- a/fs/file_table.c
> +++ b/fs/file_table.c
> @@ -463,6 +463,11 @@ void __fput_sync(struct file *file)
>  {
>         if (atomic_long_dec_and_test(&file->f_count)) {
>                 struct task_struct *task = current;
> +               /*
> +                * I see 2 basic options
> +                * 1. just remove the assert
> +                * 2. demand the flag *or* that the caller is close(2)
> +                */
>                 BUG_ON(!(task->flags & PF_KTHREAD));
>                 __fput(file);
>         }
> diff --git a/fs/open.c b/fs/open.c
> index e6ead0f19964..b1602307c1c3 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1533,7 +1533,16 @@ EXPORT_SYMBOL(filp_close);
>   */
>  SYSCALL_DEFINE1(close, unsigned int, fd)
>  {
> -       int retval = close_fd(fd);
> +       struct files_struct *files = current->files;
> +       struct file *file;
> +       int retval;
> +
> +       file = __close_fd(fd);
> +       if (!file)
> +               return -EBADF;
> +
> +       retval = __filp_close(file, files, false);
> +       __fput_sync(file);
> 
>         /* can't restart close syscall because file table entry was cleared */
>         if (unlikely(retval == -ERESTARTSYS ||
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 562f2623c9c9..e64c0238a65f 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2388,7 +2388,11 @@ static inline struct file
> *file_clone_open(struct file *file)
>  {
>         return dentry_open(&file->f_path, file->f_flags, file->f_cred);
>  }
> -extern int filp_close(struct file *, fl_owner_t id);
> +extern int __filp_close(struct file *file, fl_owner_t id, bool dofput);
> +static inline int filp_close(struct file *file, fl_owner_t id)
> +{
> +       return __filp_close(file, id, true);
> +}
> 
>  extern struct filename *getname_flags(const char __user *, int, int *);
>  extern struct filename *getname_uflags(const char __user *, int);

At least make this really dumb and obvious and keep the ugliness to
internal.h and open.c

---
 fs/file_table.c | 10 ++++++++++
 fs/internal.h   |  1 +
 fs/open.c       | 21 +++++++++++++++------
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index fc7d677ff5ad..18f8adaba972 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -471,6 +471,16 @@ void __fput_sync(struct file *file)
 EXPORT_SYMBOL(fput);
 EXPORT_SYMBOL(__fput_sync);
 
+/*
+ * Same as __fput_sync() but for regular close.
+ * Not exported, not for general use.
+ */
+void fput_sync(struct file *file)
+{
+	if (atomic_long_dec_and_test(&file->f_count))
+		__fput(file);
+}
+
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
diff --git a/fs/internal.h b/fs/internal.h
index f7a3dc111026..1ad2a4ce728b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -297,6 +297,7 @@ static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
 #endif
 
 ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
+void fput_sync(struct file *file);
 
 /*
  * fs/attr.c
diff --git a/fs/open.c b/fs/open.c
index e6ead0f19964..2540b22fb114 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1499,11 +1499,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
 }
 #endif
 
-/*
- * "id" is the POSIX thread ID. We use the
- * files pointer for this..
- */
-int filp_close(struct file *filp, fl_owner_t id)
+static int __filp_close(struct file *filp, fl_owner_t id, bool may_delay)
 {
 	int retval = 0;
 
@@ -1520,10 +1516,23 @@ int filp_close(struct file *filp, fl_owner_t id)
 		dnotify_flush(filp, id);
 		locks_remove_posix(filp, id);
 	}
-	fput(filp);
+
+	if (may_delay)
+		fput(filp);
+	else
+		fput_sync(filp);
 	return retval;
 }
 
+/*
+ * "id" is the POSIX thread ID. We use the
+ * files pointer for this..
+ */
+int filp_close(struct file *filp, fl_owner_t id)
+{
+	return __filp_close(filp, id, true);
+}
+
 EXPORT_SYMBOL(filp_close);
 
 /*
  
Christian Brauner Aug. 8, 2023, 5 p.m. UTC | #2
On Tue, Aug 08, 2023 at 06:30:06PM +0200, Christian Brauner wrote:
> On Tue, Aug 08, 2023 at 05:07:22PM +0200, Mateusz Guzik wrote:
> > I slapped the following variant just for illustration purposes.
> > 
> > - adds __close_fd which returns a struct file
> > - adds __filp_close with a flag whether to fput
> > - makes close(2) use both
> > - transparent to everyone else
> > 
> > Downside is that __fput_sync still loses the assert. Instead of
> > losing, it could perhaps be extended with a hack to check syscall
> > number -- pass if either this is close (or binary compat close) or a
> > kthread, BUG out otherwise. Alternatively perhaps deref could be
> > opencoded along with a comment about real fput that this is taking
> > place. Or maybe some other cosmetic choice.
> > 
> > I cannot compile-test right now, so down below is a rough copy make
> > sure it is clear what I mean.
> > 
> > I feel compelled to note that simple patches get microbenchmarked all
> > the time, with these results being the only justification provided.
> > I'm confused why this patch is supposed to be an exception given its
> > simplicity.
> > 
> > Serious justification should be expected from tough calls --
> > complicated, invasive changes, maybe with numerous tradeoffs.
> > 
> > In contrast close(2) doing __fput_sync looks a clear cut thing to do,
> > at worst one can argue which way to do it.
> > 
> > diff --git a/fs/file.c b/fs/file.c
> > index 3fd003a8604f..c341b07533b0 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -651,20 +651,30 @@ static struct file *pick_file(struct
> > files_struct *files, unsigned fd)
> >         return file;
> >  }
> > 
> > -int close_fd(unsigned fd)
> > +struct file *__close_fd(unsigned fd, struct file_struct *files)
> >  {
> > -       struct files_struct *files = current->files;
> >         struct file *file;
> > 
> >         spin_lock(&files->file_lock);
> >         file = pick_file(files, fd);
> >         spin_unlock(&files->file_lock);
> > +
> > +       return file;
> > +}
> > +EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
> > +
> > +int close_fd(unsigned fd)
> > +{
> > +       struct files_struct *files = current->files;
> > +       struct file *file;
> > +
> > +       file = __close_fd(fd, files);
> >         if (!file)
> >                 return -EBADF;
> > 
> >         return filp_close(file, files);
> >  }
> > -EXPORT_SYMBOL(close_fd); /* for ksys_close() */
> > +EXPORT_SYMBOL(close_fd);
> > 
> >  /**
> >   * last_fd - return last valid index into fd table
> > diff --git a/fs/file_table.c b/fs/file_table.c
> > index fc7d677ff5ad..b7461f0b73f4 100644
> > --- a/fs/file_table.c
> > +++ b/fs/file_table.c
> > @@ -463,6 +463,11 @@ void __fput_sync(struct file *file)
> >  {
> >         if (atomic_long_dec_and_test(&file->f_count)) {
> >                 struct task_struct *task = current;
> > +               /*
> > +                * I see 2 basic options
> > +                * 1. just remove the assert
> > +                * 2. demand the flag *or* that the caller is close(2)
> > +                */
> >                 BUG_ON(!(task->flags & PF_KTHREAD));
> >                 __fput(file);
> >         }
> > diff --git a/fs/open.c b/fs/open.c
> > index e6ead0f19964..b1602307c1c3 100644
> > --- a/fs/open.c
> > +++ b/fs/open.c
> > @@ -1533,7 +1533,16 @@ EXPORT_SYMBOL(filp_close);
> >   */
> >  SYSCALL_DEFINE1(close, unsigned int, fd)
> >  {
> > -       int retval = close_fd(fd);
> > +       struct files_struct *files = current->files;
> > +       struct file *file;
> > +       int retval;
> > +
> > +       file = __close_fd(fd);
> > +       if (!file)
> > +               return -EBADF;
> > +
> > +       retval = __filp_close(file, files, false);
> > +       __fput_sync(file);
> > 
> >         /* can't restart close syscall because file table entry was cleared */
> >         if (unlikely(retval == -ERESTARTSYS ||
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 562f2623c9c9..e64c0238a65f 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2388,7 +2388,11 @@ static inline struct file
> > *file_clone_open(struct file *file)
> >  {
> >         return dentry_open(&file->f_path, file->f_flags, file->f_cred);
> >  }
> > -extern int filp_close(struct file *, fl_owner_t id);
> > +extern int __filp_close(struct file *file, fl_owner_t id, bool dofput);
> > +static inline int filp_close(struct file *file, fl_owner_t id)
> > +{
> > +       return __filp_close(file, id, true);
> > +}
> > 
> >  extern struct filename *getname_flags(const char __user *, int, int *);
> >  extern struct filename *getname_uflags(const char __user *, int);
> 
> At least make this really dumb and obvious and keep the ugliness to
> internal.h and open.c

Sorry, I only sent a portion of the patch...
  
Linus Torvalds Aug. 8, 2023, 5:05 p.m. UTC | #3
On Tue, 8 Aug 2023 at 09:30, Christian Brauner <brauner@kernel.org> wrote:
>
> At least make this really dumb and obvious and keep the ugliness to
> internal.h and open.c

See the patch I just sent out.

I hate yours too, for that nasty "bool may_delay".

I hate those "bool flag" things that change behavior of a function. It
may be obvious when you look at the function itself, and know the
code, but then it causes things like this:

        return __filp_close(filp, id, true);

and there is zero clue about what the heck 'true' means.

At least then the "behavior flags" are named bitmasks, things make
*sense*. But we have too many of these boolean arguments.

And yes, I realize that we have tons of extant ones, and this would be
only one more in a sea of others. That doesn't make it ok. So please
keep it to when it *has* to be done to avoid major problems.

             Linus
  
Christian Brauner Aug. 8, 2023, 5:06 p.m. UTC | #4
On Tue, Aug 08, 2023 at 10:05:20AM -0700, Linus Torvalds wrote:
> On Tue, 8 Aug 2023 at 09:30, Christian Brauner <brauner@kernel.org> wrote:
> >
> > At least make this really dumb and obvious and keep the ugliness to
> > internal.h and open.c
> 
> See the patch I just sent out.

Yeah, I saw.
  
David Laight Aug. 9, 2023, 9:03 a.m. UTC | #5
From: Linus Torvalds
> Sent: 08 August 2023 18:05
...
>         return __filp_close(filp, id, true);
> 
> and there is zero clue about what the heck 'true' means.
> 
> At least then the "behavior flags" are named bitmasks, things make
> *sense*. But we have too many of these boolean arguments.

And make the usual case 0.

I was chasing through some code that has a flag for a
conditional lock.
However is was 'NEED_TO_LOCK' not 'ALREADY_LOCKED'.
(bpf calling in with the socket locked).

As well as making the code harder to read it is an accident
just waiting to happen.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
  

Patch

diff --git a/fs/file.c b/fs/file.c
index 3fd003a8604f..c341b07533b0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -651,20 +651,30 @@  static struct file *pick_file(struct
files_struct *files, unsigned fd)
        return file;
 }

-int close_fd(unsigned fd)
+struct file *__close_fd(unsigned fd, struct file_struct *files)
 {
-       struct files_struct *files = current->files;
        struct file *file;

        spin_lock(&files->file_lock);
        file = pick_file(files, fd);
        spin_unlock(&files->file_lock);
+
+       return file;
+}
+EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
+
+int close_fd(unsigned fd)
+{
+       struct files_struct *files = current->files;
+       struct file *file;
+
+       file = __close_fd(fd, files);
        if (!file)
                return -EBADF;

        return filp_close(file, files);
 }
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);

 /**
  * last_fd - return last valid index into fd table
diff --git a/fs/file_table.c b/fs/file_table.c
index fc7d677ff5ad..b7461f0b73f4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -463,6 +463,11 @@  void __fput_sync(struct file *file)
 {
        if (atomic_long_dec_and_test(&file->f_count)) {
                struct task_struct *task = current;
+               /*
+                * I see 2 basic options
+                * 1. just remove the assert
+                * 2. demand the flag *or* that the caller is close(2)
+                */
                BUG_ON(!(task->flags & PF_KTHREAD));
                __fput(file);
        }
diff --git a/fs/open.c b/fs/open.c
index e6ead0f19964..b1602307c1c3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1533,7 +1533,16 @@  EXPORT_SYMBOL(filp_close);
  */
 SYSCALL_DEFINE1(close, unsigned int, fd)
 {
-       int retval = close_fd(fd);
+       struct files_struct *files = current->files;
+       struct file *file;
+       int retval;
+
+       file = __close_fd(fd);
+       if (!file)
+               return -EBADF;
+
+       retval = __filp_close(file, files, false);
+       __fput_sync(file);

        /* can't restart close syscall because file table entry was cleared */
        if (unlikely(retval == -ERESTARTSYS ||
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 562f2623c9c9..e64c0238a65f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2388,7 +2388,11 @@  static inline struct file
*file_clone_open(struct file *file)
 {
        return dentry_open(&file->f_path, file->f_flags, file->f_cred);
 }
-extern int filp_close(struct file *, fl_owner_t id);
+extern int __filp_close(struct file *file, fl_owner_t id, bool dofput);
+static inline int filp_close(struct file *file, fl_owner_t id)
+{
+       return __filp_close(file, id, true);
+}

 extern struct filename *getname_flags(const char __user *, int, int *);