[v3] vfs: avoid delegating to task_work when cleaning up failed open

Message ID 20230928102516.186008-1-mjguzik@gmail.com
State New
Headers
Series [v3] vfs: avoid delegating to task_work when cleaning up failed open |

Commit Message

Mateusz Guzik Sept. 28, 2023, 10:25 a.m. UTC
  I rebased my patch on top of the one shipped by Linus, then benched both.

My patch now depends on it going in first, inlined here for reference:
  

Comments

Matthew Wilcox Sept. 29, 2023, 12:59 p.m. UTC | #1
On Thu, Sep 28, 2023 at 12:25:16PM +0200, Mateusz Guzik wrote:
> Below is my rebased patch + rewritten commit message with updated bench
> results. I decided to stick to fput_badopen name because with your patch
> it legitimately has to unref. Naming that "release_empty_file" or
> whatever would be rather misleading imho.

Do we still need fput_badopen()?  Couldn't we just make this part of
regular fput() at this point?  ie:

+++ b/fs/file_table.c
@@ -435,6 +435,10 @@ void fput(struct file *file)
        if (atomic_long_dec_and_test(&file->f_count)) {
                struct task_struct *task = current;
 
+               if (!(file->f_mode & FMODE_OPENED)) {
+                       file_free(file);
+                       return;
+               }
                if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                        init_task_work(&file->f_rcuhead, ____fput);
                        if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
  

Patch

diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..7b38ff7385cc 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -65,21 +65,21 @@  static void file_free_rcu(struct rcu_head *head)
 {
 	struct file *f = container_of(head, struct file, f_rcuhead);

-	put_cred(f->f_cred);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		kfree(backing_file(f));
-	else
-		kmem_cache_free(filp_cachep, f);
+	kfree(backing_file(f));
 }

 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		path_put(backing_file_real_path(f));
 	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	put_cred(f->f_cred);
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		path_put(backing_file_real_path(f));
+		call_rcu(&f->f_rcuhead, file_free_rcu);
+	} else {
+		kmem_cache_free(filp_cachep, f);
+	}
 }

 /*
@@ -471,7 +471,8 @@  EXPORT_SYMBOL(__fput_sync);
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+			SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN
+			| SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }

Sapphire Rapids, open1_processes -t 1 from will-it-scale + tmpfs on
/tmp (ops/s):
before:	1539109
after:	1785908 (+16%)

There was also a speedup for negative entries, but the above should be
enough for the commit message and I don't want to duplicate the testcase
between them.

Below is my rebased patch + rewritten commit message with updated bench
results. I decided to stick to fput_badopen name because with your patch
it legitimately has to unref. Naming that "release_empty_file" or
whatever would be rather misleading imho.

===================== cut here =====================
vfs: avoid delegating to task_work when cleaning up failed open

Failed opens (mostly ENOENT) legitimately happen a lot; for example, here
are stats from stracing a kernel build for a few seconds (strace -fc make):

  % time     seconds  usecs/call     calls    errors syscall
  ------ ----------- ----------- --------- --------- ------------------
    0.76    0.076233           5     15040      3688 openat

(this is tons of header files tried in different paths)

Normally these are closed from task_work machinery, but getting there is
very expensive (see 021a160abf62 ("fs: use __fput_sync in close(2)")) and,
in the common case, trivially avoidable.

Benchmarked with will-it-scale with a custom testcase based on
tests/open1.c, stuffed into tests/openneg.c:
[snip]
        while (1) {
                int fd = open("/tmp/nonexistent", O_RDONLY);
                assert(fd == -1);

                (*iterations)++;
        }
[/snip]

Sapphire Rapids, openneg_processes -t 1 (ops/s):
before:	2299006
after:	2986226 (+29%)

v3:
- rebase on top of the patch which dodges RCU freeing altogether. The
  patch is no longer applicable on top of a stock kernel.

v2:
- unexport fput_badopen and move to fs/internal.h
- handle the refcount with cmpxchg, adjust commentary accordingly
- tweak the commit message

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 fs/file_table.c | 22 ++++++++++++++++++++++
 fs/internal.h   |  2 ++
 fs/namei.c      |  2 +-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 7b38ff7385cc..8909737e1872 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -468,6 +468,28 @@  void __fput_sync(struct file *file)
 EXPORT_SYMBOL(fput);
 EXPORT_SYMBOL(__fput_sync);
 
+/*
+ * Clean up after failing to open (e.g., open(2) returns with -ENOENT).
+ *
+ * In the common case this avoids delegating the free to task_work.
+ */
+void fput_badopen(struct file *file)
+{
+	if (unlikely(file->f_mode & FMODE_OPENED)) {
+		fput(file);
+		return;
+	}
+
+	/*
+	 * While we did not expose the file to anyone, we may be racing against
+	 * __fget_files_rcu refing a stale object. Should this happen it is
+	 * going to backpedal with fput, but it means we have to unref with an
+	 * atomic to synchronize against it.
+	 */
+	if (atomic_long_dec_and_test(&file->f_count))
+		file_free(file);
+}
+
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..93da6d815e90 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -95,6 +95,8 @@  struct file *alloc_empty_file(int flags, const struct cred *cred);
 struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
 struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
 
+void fput_badopen(struct file *);
+
 static inline void put_file_access(struct file *file)
 {
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..67579fe30b28 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3802,7 +3802,7 @@  static struct file *path_openat(struct nameidata *nd,
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	fput(file);
+	fput_badopen(file);
 	if (error == -EOPENSTALE) {
 		if (flags & LOOKUP_RCU)
 			error = -ECHILD;