[2/2] md: create symlink with disk holder after mddev resume
Commit Message
From: Li Nan <linan122@huawei.com>
There is a risk of deadlock when a process gets disk->open_mutex after
suspending mddev, because other processes may hold open_mutex while
submitting io. For example:
T1                                T2
blkdev_open
 bdev_open_by_dev
  mutex_lock(&disk->open_mutex)
                                  md_ioctl
                                   mddev_suspend_and_lock
                                    mddev_suspend
                                   md_add_new_disk
                                    bind_rdev_to_array
                                     bd_link_disk_holder
                                      //wait open_mutex
  blkdev_get_whole
   bdev_disk_changed
    efi_partition
     read_lba
      ...
       md_submit_bio
        md_handle_request
         //wait resume
Fix it by taking disk->open_mutex only after the mddev has been
resumed: iterate over mddev->disks and create the holder symlink for
each rdev that does not have one yet. Also move bd_unlink_disk_holder()
into mddev_unlock(), where the rdev has already been removed from
mddev->disks, so concurrent bind and unbind cannot race.
Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls involed array reconfiguration")
Signed-off-by: Li Nan <linan122@huawei.com>
---
drivers/md/md.c | 39 +++++++++++++++++++++++++++++----------
1 file changed, 29 insertions(+), 10 deletions(-)
Comments
Hi,
On 2023/12/21 15:11, linan666@huaweicloud.com wrote:
> From: Li Nan <linan122@huawei.com>
>
> There is a risk of deadlock when a process gets disk->open_mutex after
> suspending mddev, because other processes may hold open_mutex while
> submitting io. For example:
>
> T1                                T2
> blkdev_open
>  bdev_open_by_dev
>   mutex_lock(&disk->open_mutex)
>                                   md_ioctl
>                                    mddev_suspend_and_lock
>                                     mddev_suspend
>                                    md_add_new_disk
>                                     bind_rdev_to_array
>                                      bd_link_disk_holder
>                                       //wait open_mutex
>   blkdev_get_whole
>    bdev_disk_changed
>     efi_partition
>      read_lba
>       ...
>        md_submit_bio
>         md_handle_request
>          //wait resume
>
Nice catch! This patch looks good except that the new flag
'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
will make more sense.
Thanks,
Kuai
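For reference, the rename Kuai suggests would be a mechanical change. A
hypothetical sketch, assuming patch 1/2 of this series defines
SymlinkCreated in enum flag_bits in drivers/md/md.h (the comment text
below is illustrative, not taken from the patch):

-	SymlinkCreated,		/* holder symlink created by bd_link_disk_holder() */
+	HolderLinked,		/* bd_link_disk_holder() succeeded for this rdev */

plus an s/SymlinkCreated/HolderLinked/ over its users in drivers/md/md.c.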
Hi, Thorsten here, the Linux kernel's regression tracker.
On 21.12.23 09:49, Yu Kuai wrote:
> On 2023/12/21 15:11, linan666@huaweicloud.com wrote:
>> From: Li Nan <linan122@huawei.com>
>>
>> There is a risk of deadlock when a process gets disk->open_mutex after
>> suspending mddev, because other processes may hold open_mutex while
>> submitting io. For example:
>> [...]
> Nice catch! This patch looks good except that the new flag
> 'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
> will make more sense.
>
>> Fix it by taking disk->open_mutex only after the mddev has been
>> resumed: iterate over mddev->disks and create the holder symlink for
>> each rdev that does not have one yet. Also move bd_unlink_disk_holder()
>> into mddev_unlock(), where the rdev has already been removed from
>> mddev->disks, so concurrent bind and unbind cannot race.
>>
>> Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls
>> involed array reconfiguration")
Hey, what happened to that patch? It looks a lot like things stalled
here. I'm asking because there is a regression report that claims
1b0a2d950ee2 is the culprit; it might or might not be caused by the
problem this patch tries to fix:
https://bugzilla.kernel.org/show_bug.cgi?id=218459
Ciao, Thorsten
On Tue, Feb 6, 2024 at 6:46 AM Linux regression tracking (Thorsten
Leemhuis) <regressions@leemhuis.info> wrote:
>
> Hi, Thorsten here, the Linux kernel's regression tracker.
>
> On 21.12.23 09:49, Yu Kuai wrote:
> > On 2023/12/21 15:11, linan666@huaweicloud.com wrote:
> >> From: Li Nan <linan122@huawei.com>
> >>
> >> There is a risk of deadlock when a process gets disk->open_mutex after
> >> suspending mddev, because other processes may hold open_mutex while
> >> submitting io. For example:
> >> [...]
> > Nice catch! This patch looks good except that the new flag
> > 'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
> > will make more sense.
> >
> >> Fix it by taking disk->open_mutex only after the mddev has been
> >> resumed: iterate over mddev->disks and create the holder symlink for
> >> each rdev that does not have one yet. Also move bd_unlink_disk_holder()
> >> into mddev_unlock(), where the rdev has already been removed from
> >> mddev->disks, so concurrent bind and unbind cannot race.
> >>
> >> Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls
> >> involed array reconfiguration")
>
> Hey, what happened to that patch? It looks a lot like things stalled
> here. I'm asking because there is a regression report that claims
> 1b0a2d950ee2 is the culprit; it might or might not be caused by the
> problem this patch tries to fix:
> https://bugzilla.kernel.org/show_bug.cgi?id=218459
Thanks for the heads-up. Replied to the thread.
Song
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d6612b922c76..c128570f2a5d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -521,6 +521,20 @@ void mddev_resume(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
+static void md_link_disk_holder(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
+		if (test_bit(SymlinkCreated, &rdev->flags))
+			continue;
+		if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
+			set_bit(SymlinkCreated, &rdev->flags);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Generic flush handling for md
  */
@@ -902,6 +916,11 @@ void mddev_unlock(struct mddev *mddev)
 
 	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
 		list_del_init(&rdev->same_set);
+		if (test_bit(SymlinkCreated, &rdev->flags)) {
+			bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
+			clear_bit(SymlinkCreated, &rdev->flags);
+		}
+		rdev->mddev = NULL;
 		kobject_del(&rdev->kobj);
 		export_rdev(rdev, mddev);
 	}
@@ -2526,8 +2545,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
-	if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
-		set_bit(SymlinkCreated, &rdev->flags);
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled++;
@@ -2562,14 +2579,9 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
 {
 	struct mddev *mddev = rdev->mddev;
 
-	if (test_bit(SymlinkCreated, &rdev->flags)) {
-		bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
-		clear_bit(SymlinkCreated, &rdev->flags);
-	}
 	list_del_rcu(&rdev->same_set);
 	pr_debug("md: unbind<%pg>\n", rdev->bdev);
 	mddev_destroy_serial_pool(rdev->mddev, rdev);
-	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
 	sysfs_put(rdev->sysfs_unack_badblocks);
@@ -4667,8 +4679,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		export_rdev(rdev, mddev);
 	mddev_unlock_and_resume(mddev);
-	if (!err)
+	if (!err) {
+		md_link_disk_holder(mddev);
 		md_new_event();
+	}
 	return err ? err : len;
 }
 
@@ -6606,6 +6620,7 @@ static void autorun_devices(int part)
 			}
 			autorun_array(mddev);
 			mddev_unlock_and_resume(mddev);
+			md_link_disk_holder(mddev);
 		}
 		/* on success, candidates will be empty, on error
 		 * it won't...
@@ -7832,8 +7847,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
 	    err != -EINVAL)
 		mddev->hold_active = 0;
 
-	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
-				     mddev_unlock(mddev);
+	if (md_ioctl_need_suspend(cmd)) {
+		mddev_unlock_and_resume(mddev);
+		md_link_disk_holder(mddev);
+	} else {
+		mddev_unlock(mddev);
+	}
 
 out:
 	if(did_set_md_closing)
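
For quick reference, the ordering this patch establishes in the
reconfiguration paths, condensed from the new_dev_store() and md_ioctl()
hunks above (error handling and surrounding code omitted; the comments
are added here for explanation and are not part of the patch):

	/*
	 * Reconfigure while suspended: bind_rdev_to_array() now only adds
	 * the rdev to mddev->disks and no longer calls bd_link_disk_holder().
	 */
	mddev_suspend_and_lock(mddev);
	...
	mddev_unlock_and_resume(mddev);
	/*
	 * Only after resume take disk->open_mutex, linking every rdev that
	 * has no holder symlink yet, so we never wait on open_mutex while
	 * its owner is stuck submitting IO to the suspended array.
	 */
	md_link_disk_holder(mddev);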