[RFC,2/6] debugfs: annotate debugfs handlers vs. removal with lockdep
Commit Message
From: Johannes Berg <johannes.berg@intel.com>
When you take a lock in a debugfs handler but also try
to remove the debugfs file under that lock, things can
deadlock since the removal has to wait for all users
to finish.
Add lockdep annotations in debugfs_file_get()/_put()
to catch such issues.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
fs/debugfs/file.c | 10 ++++++++++
fs/debugfs/inode.c | 12 ++++++++++++
fs/debugfs/internal.h | 6 ++++++
3 files changed, 28 insertions(+)
Comments
On (23/11/09 22:22), Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
>
> When you take a lock in a debugfs handler but also try
> to remove the debugfs file under that lock, things can
> deadlock since the removal has to wait for all users
> to finish.
>
> Add lockdep annotations in debugfs_file_get()/_put()
> to catch such issues.
So this triggers when I reset zram device (zsmalloc compiled with
CONFIG_ZSMALLOC_STAT).
debugfs_create_file() and debugfs_remove_recursive() are called
under zram->init_lock, and zsmalloc never calls into zram code.
What I don't really get is where does the
`debugfs::classes -> zram->init_lock`
dependency come from?
[ 47.283364] ======================================================
[ 47.284790] WARNING: possible circular locking dependency detected
[ 47.286217] 6.7.0-rc3-next-20231201+ #239 Tainted: G N
[ 47.287723] ------------------------------------------------------
[ 47.289145] zram-test.sh/727 is trying to acquire lock:
[ 47.290350] ffff88814b824070 (debugfs:classes){++++}-{0:0}, at: remove_one+0x65/0x210
[ 47.292202]
[ 47.292202] but task is already holding lock:
[ 47.293554] ffff88812fe7ee48 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: simple_recursive_removal+0x217/0x3a0
[ 47.295895]
[ 47.295895] which lock already depends on the new lock.
[ 47.295895]
[ 47.297757]
[ 47.297757] the existing dependency chain (in reverse order) is:
[ 47.299498]
[ 47.299498] -> #4 (&sb->s_type->i_mutex_key#2){++++}-{3:3}:
[ 47.301129] down_write+0x40/0x80
[ 47.302028] start_creating+0xa5/0x1b0
[ 47.303024] debugfs_create_dir+0x16/0x240
[ 47.304104] zs_create_pool+0x5da/0x6f0
[ 47.305123] disksize_store+0xce/0x320
[ 47.306129] kernfs_fop_write_iter+0x1cb/0x270
[ 47.307279] vfs_write+0x42f/0x4c0
[ 47.308205] ksys_write+0x8f/0x110
[ 47.309129] do_syscall_64+0x40/0xe0
[ 47.310100] entry_SYSCALL_64_after_hwframe+0x62/0x6a
[ 47.311403]
[ 47.311403] -> #3 (&zram->init_lock){++++}-{3:3}:
[ 47.312856] down_write+0x40/0x80
[ 47.313755] zram_reset_device+0x22/0x2b0
[ 47.314814] reset_store+0x15b/0x190
[ 47.315788] kernfs_fop_write_iter+0x1cb/0x270
[ 47.316946] vfs_write+0x42f/0x4c0
[ 47.317869] ksys_write+0x8f/0x110
[ 47.318787] do_syscall_64+0x40/0xe0
[ 47.319754] entry_SYSCALL_64_after_hwframe+0x62/0x6a
[ 47.321051]
[ 47.321051] -> #2 (&of->mutex){+.+.}-{3:3}:
[ 47.322374] __mutex_lock+0x97/0x810
[ 47.323338] kernfs_seq_start+0x34/0x190
[ 47.324387] seq_read_iter+0x1e1/0x6c0
[ 47.325389] vfs_read+0x38f/0x420
[ 47.326295] ksys_read+0x8f/0x110
[ 47.327200] do_syscall_64+0x40/0xe0
[ 47.328164] entry_SYSCALL_64_after_hwframe+0x62/0x6a
[ 47.329460]
[ 47.329460] -> #1 (&p->lock){+.+.}-{3:3}:
[ 47.330740] __mutex_lock+0x97/0x810
[ 47.331717] seq_read_iter+0x5c/0x6c0
[ 47.332696] seq_read+0xfe/0x140
[ 47.333577] full_proxy_read+0x90/0x110
[ 47.334598] vfs_read+0xfb/0x420
[ 47.335482] ksys_read+0x8f/0x110
[ 47.336382] do_syscall_64+0x40/0xe0
[ 47.337344] entry_SYSCALL_64_after_hwframe+0x62/0x6a
[ 47.338636]
[ 47.338636] -> #0 (debugfs:classes){++++}-{0:0}:
[ 47.340056] __lock_acquire+0x20b1/0x3b50
[ 47.341117] lock_acquire+0xe3/0x210
[ 47.342072] remove_one+0x7d/0x210
[ 47.342991] simple_recursive_removal+0x325/0x3a0
[ 47.344210] debugfs_remove+0x40/0x60
[ 47.345181] zs_destroy_pool+0x4e/0x3d0
[ 47.346199] zram_reset_device+0x151/0x2b0
[ 47.347272] reset_store+0x15b/0x190
[ 47.348257] kernfs_fop_write_iter+0x1cb/0x270
[ 47.349426] vfs_write+0x42f/0x4c0
[ 47.350350] ksys_write+0x8f/0x110
[ 47.351273] do_syscall_64+0x40/0xe0
[ 47.352239] entry_SYSCALL_64_after_hwframe+0x62/0x6a
[ 47.353536]
[ 47.353536] other info that might help us debug this:
[ 47.353536]
[ 47.355381] Chain exists of:
[ 47.355381] debugfs:classes --> &zram->init_lock --> &sb->s_type->i_mutex_key#2
[ 47.355381]
[ 47.358105] Possible unsafe locking scenario:
[ 47.358105]
[ 47.359484] CPU0 CPU1
[ 47.360545] ---- ----
[ 47.361599] lock(&sb->s_type->i_mutex_key#2);
[ 47.362665] lock(&zram->init_lock);
[ 47.364146] lock(&sb->s_type->i_mutex_key#2);
[ 47.365781] lock(debugfs:classes);
[ 47.366626]
[ 47.366626] *** DEADLOCK ***
[ 47.366626]
[ 47.368005] 5 locks held by zram-test.sh/727:
[ 47.369028] #0: ffff88811420c420 (sb_writers#5){.+.+}-{0:0}, at: vfs_write+0xff/0x4c0
[ 47.370873] #1: ffff8881346e1090 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x126/0x270
[ 47.372932] #2: ffff88810b7bfb38 (kn->active#50){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x137/0x270
[ 47.375052] #3: ffff88810b3e60b0 (&zram->init_lock){++++}-{3:3}, at: zram_reset_device+0x22/0x2b0
[ 47.377131] #4: ffff88812fe7ee48 (&sb->s_type->i_mutex_key#2){++++}-{3:3}, at: simple_recursive_removal+0x217/0x3a0
[ 47.379599]
[ 47.379599] stack backtrace:
[ 47.380619] CPU: 39 PID: 727 Comm: zram-test.sh Tainted: G N 6.7.0-rc3-next-20231201+ #239
[ 47.382836] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[ 47.384976] Call Trace:
[ 47.385565] <TASK>
[ 47.386074] dump_stack_lvl+0x6e/0xa0
[ 47.386942] check_noncircular+0x1b6/0x1e0
[ 47.387910] ? lockdep_unlock+0x96/0x130
[ 47.388838] __lock_acquire+0x20b1/0x3b50
[ 47.389789] ? d_walk+0x3c3/0x410
[ 47.390589] lock_acquire+0xe3/0x210
[ 47.391449] ? remove_one+0x65/0x210
[ 47.392301] ? preempt_count_sub+0x14/0xc0
[ 47.393264] ? _raw_spin_unlock+0x29/0x40
[ 47.394214] ? d_walk+0x3c3/0x410
[ 47.395005] ? remove_one+0x65/0x210
[ 47.395866] remove_one+0x7d/0x210
[ 47.396675] ? remove_one+0x65/0x210
[ 47.397523] ? d_invalidate+0xbe/0x170
[ 47.398412] simple_recursive_removal+0x325/0x3a0
[ 47.399521] ? debugfs_remove+0x60/0x60
[ 47.400430] debugfs_remove+0x40/0x60
[ 47.401302] zs_destroy_pool+0x4e/0x3d0
[ 47.402214] zram_reset_device+0x151/0x2b0
[ 47.403187] reset_store+0x15b/0x190
[ 47.404043] ? sysfs_kf_read+0x170/0x170
[ 47.404968] kernfs_fop_write_iter+0x1cb/0x270
[ 47.406015] vfs_write+0x42f/0x4c0
[ 47.406829] ksys_write+0x8f/0x110
[ 47.407642] do_syscall_64+0x40/0xe0
[ 47.408491] entry_SYSCALL_64_after_hwframe+0x62/0x6a
On Sat, 2023-12-02 at 15:37 +0900, Sergey Senozhatsky wrote:
> On (23/11/09 22:22), Johannes Berg wrote:
> > From: Johannes Berg <johannes.berg@intel.com>
> >
> > When you take a lock in a debugfs handler but also try
> > to remove the debugfs file under that lock, things can
> > deadlock since the removal has to wait for all users
> > to finish.
> >
> > Add lockdep annotations in debugfs_file_get()/_put()
> > to catch such issues.
>
> So this triggers when I reset zram device (zsmalloc compiled with
> CONFIG_ZSMALLOC_STAT).
I shouldn't have put that into the rc, that was more or less an
accident. I think I'll do a revert.
> debugfs_create_file() and debugfs_remove_recursive() are called
> under zram->init_lock, and zsmalloc never calls into zram code.
> What I don't really get is where does the
> `debugfs::classes -> zram->init_lock`
> dependency come from?
"debugfs:classes" means a file is being accessed and "classes" is the
name, so that's
static int zs_stats_size_show(struct seq_file *s, void *v)
which uses seq_file, so there we have a seq_file lock.
I think eventually the issue is that lockdep isn't telling the various
seq_file instances apart, and you have one that's removed under lock
(classes) and another one that's taking the lock (reset).
Anyway, I'll send a revert, don't think this is ready yet. I was fixing
the wireless debugfs lockdep and had used that to debug it.
johannes
@@ -103,6 +103,12 @@ int debugfs_file_get(struct dentry *dentry)
kfree(fsd);
fsd = READ_ONCE(dentry->d_fsdata);
}
+#ifdef CONFIG_LOCKDEP
+ fsd->lock_name = kasprintf(GFP_KERNEL, "debugfs:%pd", dentry);
+ lockdep_register_key(&fsd->key);
+ lockdep_init_map(&fsd->lockdep_map, fsd->lock_name ?: "debugfs",
+ &fsd->key, 0);
+#endif
}
/*
@@ -119,6 +125,8 @@ int debugfs_file_get(struct dentry *dentry)
if (!refcount_inc_not_zero(&fsd->active_users))
return -EIO;
+ lock_map_acquire_read(&fsd->lockdep_map);
+
return 0;
}
EXPORT_SYMBOL_GPL(debugfs_file_get);
@@ -136,6 +144,8 @@ void debugfs_file_put(struct dentry *dentry)
{
struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata);
+ lock_map_release(&fsd->lockdep_map);
+
if (refcount_dec_and_test(&fsd->active_users))
complete(&fsd->active_users_drained);
}
@@ -241,6 +241,14 @@ static void debugfs_release_dentry(struct dentry *dentry)
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
+ /* check it wasn't an automount or dir */
+ if (fsd && fsd->real_fops) {
+#ifdef CONFIG_LOCKDEP
+ lockdep_unregister_key(&fsd->key);
+ kfree(fsd->lock_name);
+#endif
+ }
+
kfree(fsd);
}
@@ -742,6 +750,10 @@ static void __debugfs_file_removed(struct dentry *dentry)
fsd = READ_ONCE(dentry->d_fsdata);
if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
return;
+
+ lock_map_acquire(&fsd->lockdep_map);
+ lock_map_release(&fsd->lockdep_map);
+
if (!refcount_dec_and_test(&fsd->active_users))
wait_for_completion(&fsd->active_users_drained);
}
@@ -7,6 +7,7 @@
#ifndef _DEBUGFS_INTERNAL_H_
#define _DEBUGFS_INTERNAL_H_
+#include <linux/lockdep.h>
struct file_operations;
@@ -23,6 +24,11 @@ struct debugfs_fsdata {
struct {
refcount_t active_users;
struct completion active_users_drained;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map lockdep_map;
+ struct lock_class_key key;
+ char *lock_name;
+#endif
};
};
};