block: fix deadlock between bd_link_disk_holder and partition scan

Message ID 20240207092756.2087888-1-linan666@huaweicloud.com
State New
Headers
Series block: fix deadlock between bd_link_disk_holder and partition scan |

Commit Message

Li Nan Feb. 7, 2024, 9:27 a.m. UTC
  From: Li Nan <linan122@huawei.com>

'open_mutex' of gendisk is used to protect open/close block devices. But
in bd_link_disk_holder(), it is used to protect the creation of symlink
between holding disk and slave bdev, which introduces some issues.

When bd_link_disk_holder() is called, the driver is usually in the process
of initialization/modification and may have suspended io submission. At
this time, any io issued while holding 'open_mutex', such as scanning
partitions, can cause a deadlock. For example, in raid:

T1                              T2
bdev_open_by_dev
 lock open_mutex [1]
 ...
  efi_partition
  ...
   md_submit_bio
				md_ioctl mddev_suspend
				  -> suspend all io
				 md_add_new_disk
				  bind_rdev_to_array
				   bd_link_disk_holder
				    try lock open_mutex [2]
    md_handle_request
     -> wait mddev_resume

T1 scans partitions while T2 adds a new device to the raid. T1 waits for T2
to resume mddev, but T2 waits for the open_mutex held by T1. Deadlock occurs.

Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 block/holder.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
  

Comments

Song Liu Feb. 8, 2024, 6:50 a.m. UTC | #1
On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
>
> From: Li Nan <linan122@huawei.com>
>
> 'open_mutex' of gendisk is used to protect open/close block devices. But
> in bd_link_disk_holder(), it is used to protect the creation of symlink
> between holding disk and slave bdev, which introduces some issues.
>
> When bd_link_disk_holder() is called, the driver is usually in the process
> of initialization/modification and may suspend submitting io. At this
> time, any io hold 'open_mutex', such as scanning partitions, can cause
> deadlocks. For example, in raid:
>
> T1                              T2
> bdev_open_by_dev
>  lock open_mutex [1]
>  ...
>   efi_partition
>   ...
>    md_submit_bio
>                                 md_ioctl mddev_syspend
>                                   -> suspend all io
>                                  md_add_new_disk
>                                   bind_rdev_to_array
>                                    bd_link_disk_holder
>                                     try lock open_mutex [2]
>     md_handle_request
>      -> wait mddev_resume
>
> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
>
> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.

Is this to fix [1]? Do we need some Fixes and/or Closes tags?

Could you please add steps to reproduce this issue?

Thanks,
Song

[1] https://bugzilla.kernel.org/show_bug.cgi?id=218459

>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>  block/holder.c | 12 +++++++-----
>  1 file changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/block/holder.c b/block/holder.c
> index 37d18c13d958..5bfb0a674cc7 100644
> --- a/block/holder.c
> +++ b/block/holder.c
> @@ -8,6 +8,8 @@ struct bd_holder_disk {
>         int                     refcnt;
>  };
>
> +static DEFINE_MUTEX(holder_mutex);
> +
>  static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
>                                                   struct gendisk *disk)
>  {
> @@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>         kobject_get(bdev->bd_holder_dir);
>         mutex_unlock(&bdev->bd_disk->open_mutex);
>
> -       mutex_lock(&disk->open_mutex);
> +       mutex_lock(&holder_mutex);
>         WARN_ON_ONCE(!bdev->bd_holder);
>
>         holder = bd_find_holder_disk(bdev, disk);
> @@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>                 goto out_del_symlink;
>         list_add(&holder->list, &disk->slave_bdevs);
>
> -       mutex_unlock(&disk->open_mutex);
> +       mutex_unlock(&holder_mutex);
>         return 0;
>
>  out_del_symlink:
> @@ -116,7 +118,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>  out_free_holder:
>         kfree(holder);
>  out_unlock:
> -       mutex_unlock(&disk->open_mutex);
> +       mutex_unlock(&holder_mutex);
>         if (ret)
>                 kobject_put(bdev->bd_holder_dir);
>         return ret;
> @@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
>         if (WARN_ON_ONCE(!disk->slave_dir))
>                 return;
>
> -       mutex_lock(&disk->open_mutex);
> +       mutex_lock(&holder_mutex);
>         holder = bd_find_holder_disk(bdev, disk);
>         if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
>                 del_symlink(disk->slave_dir, bdev_kobj(bdev));
> @@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
>                 list_del_init(&holder->list);
>                 kfree(holder);
>         }
> -       mutex_unlock(&disk->open_mutex);
> +       mutex_unlock(&holder_mutex);
>  }
>  EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
> --
> 2.39.2
>
  
Li Nan Feb. 8, 2024, 8:44 a.m. UTC | #2
在 2024/2/8 14:50, Song Liu 写道:
> On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
>>
>> From: Li Nan <linan122@huawei.com>
>>
>> 'open_mutex' of gendisk is used to protect open/close block devices. But
>> in bd_link_disk_holder(), it is used to protect the creation of symlink
>> between holding disk and slave bdev, which introduces some issues.
>>
>> When bd_link_disk_holder() is called, the driver is usually in the process
>> of initialization/modification and may suspend submitting io. At this
>> time, any io hold 'open_mutex', such as scanning partitions, can cause
>> deadlocks. For example, in raid:
>>
>> T1                              T2
>> bdev_open_by_dev
>>   lock open_mutex [1]
>>   ...
>>    efi_partition
>>    ...
>>     md_submit_bio
>>                                  md_ioctl mddev_syspend
>>                                    -> suspend all io
>>                                   md_add_new_disk
>>                                    bind_rdev_to_array
>>                                     bd_link_disk_holder
>>                                      try lock open_mutex [2]
>>      md_handle_request
>>       -> wait mddev_resume
>>
>> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
>> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
>>
>> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.
> 
> Is this to fix [1]? Do we need some Fixes and/or Closes tags?
> 

No. This is just another way to fix the issue addressed by [2]; either [2]
or this patch can fix it. I am not sure about the root cause of [1] yet.

[2] https://patchwork.kernel.org/project/linux-raid/list/?series=812045

> Could you please add steps to reproduce this issue?

We need to modify the kernel to add sleeps in the md_submit_bio() path
(in md_handle_request()) and in md_ioctl(), as below, and then:
   1. mdadm -CR /dev/md0 -l1 -n2 /dev/sd[bc]  #create a raid
   2. echo 1 > /sys/module/md_mod/parameters/error_inject  #enable sleep
   3. 'mdadm --add /dev/md0 /dev/sda'  #add a disk to raid
   4. submit ioctl BLKRRPART to raid within 10s.


Changes of kernel:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 350f5b22ba6f..ce16d319edf2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -76,6 +76,8 @@ static DEFINE_SPINLOCK(pers_lock);

  static const struct kobj_type md_ktype;

+static bool error_inject = false;
+
  struct md_cluster_operations *md_cluster_ops;
  EXPORT_SYMBOL(md_cluster_ops);
  static struct module *md_cluster_mod;
@@ -372,6 +374,8 @@ static bool is_suspended(struct mddev *mddev, struct 
bio *bio)

  void md_handle_request(struct mddev *mddev, struct bio *bio)
  {
+       if (error_inject)
+               ssleep(10);
  check_suspended:
         if (is_suspended(mddev, bio)) {
                 DEFINE_WAIT(__wait);
@@ -7752,6 +7756,8 @@ static int md_ioctl(struct block_device *bdev, 
blk_mode_t mode,
                  */
                 if (mddev->pers) {
                         mdu_disk_info_t info;
+                       if (error_inject)
+                               ssleep(10);
                         if (copy_from_user(&info, argp, sizeof(info)))
                                 err = -EFAULT;
                         else if (!(info.state & (1<<MD_DISK_SYNC)))
@@ -10120,6 +10126,7 @@ module_param_call(start_ro, set_ro, get_ro, NULL, 
S_IRUSR|S_IWUSR);
  module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
  module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
  module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
+module_param(error_inject, bool, S_IRUSR|S_IWUSR);

  MODULE_LICENSE("GPL");
  MODULE_DESCRIPTION("MD RAID framework");
  
Song Liu Feb. 9, 2024, 12:49 a.m. UTC | #3
On Thu, Feb 8, 2024 at 12:44 AM Li Nan <linan666@huaweicloud.com> wrote:
>
>
>
> 在 2024/2/8 14:50, Song Liu 写道:
> > On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
> >>
> >> From: Li Nan <linan122@huawei.com>
> >>
> >> 'open_mutex' of gendisk is used to protect open/close block devices. But
> >> in bd_link_disk_holder(), it is used to protect the creation of symlink
> >> between holding disk and slave bdev, which introduces some issues.
> >>
> >> When bd_link_disk_holder() is called, the driver is usually in the process
> >> of initialization/modification and may suspend submitting io. At this
> >> time, any io hold 'open_mutex', such as scanning partitions, can cause
> >> deadlocks. For example, in raid:
> >>
> >> T1                              T2
> >> bdev_open_by_dev
> >>   lock open_mutex [1]
> >>   ...
> >>    efi_partition
> >>    ...
> >>     md_submit_bio
> >>                                  md_ioctl mddev_syspend
> >>                                    -> suspend all io
> >>                                   md_add_new_disk
> >>                                    bind_rdev_to_array
> >>                                     bd_link_disk_holder
> >>                                      try lock open_mutex [2]
> >>      md_handle_request
> >>       -> wait mddev_resume
> >>
> >> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
> >> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
> >>
> >> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.
> >
> > Is this to fix [1]? Do we need some Fixes and/or Closes tags?
> >
>
> No. Just use another way to fix [2], and both [2] and this patch can fix
> the issue. I am not sure about the root cause of [1] yet.
>
> [2] https://patchwork.kernel.org/project/linux-raid/list/?series=812045
>
> > Could you please add steps to reproduce this issue?
>
> We need to modify the kernel, add sleep in md_submit_bio() and md_ioctl()
> as below, and then:
>    1. mdadm -CR /dev/md0 -l1 -n2 /dev/sd[bc]  #create a raid
>    2. echo 1 > /sys/module/md_mod/parameters/error_inject  #enable sleep
>    3. 'mdadm --add /dev/md0 /dev/sda'  #add a disk to raid
>    4. submit ioctl BLKRRPART to raid within 10s.

The analysis makes sense. I also hit the issue a couple times without adding
extra delays. But I am not sure whether this is the best fix (I didn't find real
issues with it either).

Maybe we don't need to suspend the array for ADD_NEW_DISK? So that
something like the following might just work?

Thanks,
Song

@@ -7573,7 +7577,6 @@ static inline bool md_ioctl_valid(unsigned int cmd)
 static bool md_ioctl_need_suspend(unsigned int cmd)
 {
        switch (cmd) {
-       case ADD_NEW_DISK:
        case HOT_ADD_DISK:
        case HOT_REMOVE_DISK:
        case SET_BITMAP_FILE:
  
Song Liu Feb. 16, 2024, 7:03 p.m. UTC | #4
On Thu, Feb 8, 2024 at 4:49 PM Song Liu <song@kernel.org> wrote:
>
> On Thu, Feb 8, 2024 at 12:44 AM Li Nan <linan666@huaweicloud.com> wrote:
> >
> >
> >
> > 在 2024/2/8 14:50, Song Liu 写道:
> > > On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
> > >>
> > >> From: Li Nan <linan122@huawei.com>
> > >>
> > >> 'open_mutex' of gendisk is used to protect open/close block devices. But
> > >> in bd_link_disk_holder(), it is used to protect the creation of symlink
> > >> between holding disk and slave bdev, which introduces some issues.
> > >>
> > >> When bd_link_disk_holder() is called, the driver is usually in the process
> > >> of initialization/modification and may suspend submitting io. At this
> > >> time, any io hold 'open_mutex', such as scanning partitions, can cause
> > >> deadlocks. For example, in raid:
> > >>
> > >> T1                              T2
> > >> bdev_open_by_dev
> > >>   lock open_mutex [1]
> > >>   ...
> > >>    efi_partition
> > >>    ...
> > >>     md_submit_bio
> > >>                                  md_ioctl mddev_syspend
> > >>                                    -> suspend all io
> > >>                                   md_add_new_disk
> > >>                                    bind_rdev_to_array
> > >>                                     bd_link_disk_holder
> > >>                                      try lock open_mutex [2]
> > >>      md_handle_request
> > >>       -> wait mddev_resume
> > >>
> > >> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
> > >> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
> > >>
> > >> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.
> > >
> > > Is this to fix [1]? Do we need some Fixes and/or Closes tags?
> > >
> >
> > No. Just use another way to fix [2], and both [2] and this patch can fix
> > the issue. I am not sure about the root cause of [1] yet.
> >
> > [2] https://patchwork.kernel.org/project/linux-raid/list/?series=812045
> >
> > > Could you please add steps to reproduce this issue?
> >
> > We need to modify the kernel, add sleep in md_submit_bio() and md_ioctl()
> > as below, and then:
> >    1. mdadm -CR /dev/md0 -l1 -n2 /dev/sd[bc]  #create a raid
> >    2. echo 1 > /sys/module/md_mod/parameters/error_inject  #enable sleep
> >    3. 'mdadm --add /dev/md0 /dev/sda'  #add a disk to raid
> >    4. submit ioctl BLKRRPART to raid within 10s.
>
> The analysis makes sense. I also hit the issue a couple times without adding
> extra delays. But I am not sure whether this is the best fix (I didn't find real
> issues with it either).

To be extra safe and future proof, we can do something like the
following to only
suspend the array for ADD_NEW_DISK on not-running arrays.

This appears to solve the problem reported in

https://bugzilla.kernel.org/show_bug.cgi?id=218459

Thanks,
Song

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e41a9aaba8b..395911d5f4d6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7570,10 +7570,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
        }
 }

-static bool md_ioctl_need_suspend(unsigned int cmd)
+static bool md_ioctl_need_suspend(struct mddev *mddev, unsigned int cmd)
 {
        switch (cmd) {
        case ADD_NEW_DISK:
+               return mddev->pers != NULL;
        case HOT_ADD_DISK:
        case HOT_REMOVE_DISK:
        case SET_BITMAP_FILE:
@@ -7625,6 +7626,7 @@ static int md_ioctl(struct block_device *bdev,
blk_mode_t mode,
        void __user *argp = (void __user *)arg;
        struct mddev *mddev = NULL;
        bool did_set_md_closing = false;
+       bool need_suspend;

        if (!md_ioctl_valid(cmd))
                return -ENOTTY;
@@ -7716,8 +7718,11 @@ static int md_ioctl(struct block_device *bdev,
blk_mode_t mode,
        if (!md_is_rdwr(mddev))
                flush_work(&mddev->sync_work);

-       err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
-                                          mddev_lock(mddev);
+       need_suspend = md_ioctl_need_suspend(mddev, cmd);
+       if (need_suspend)
+               err = mddev_suspend_and_lock(mddev);
+       else
+               err = mddev_lock(mddev);
        if (err) {
                pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
                         err, cmd);
@@ -7846,8 +7851,10 @@ static int md_ioctl(struct block_device *bdev,
blk_mode_t mode,
            err != -EINVAL)
                mddev->hold_active = 0;

-       md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
-                                    mddev_unlock(mddev);
+       if (need_suspend)
+               mddev_unlock_and_resume(mddev);
+       else
+               mddev_unlock(mddev);

 out:
        if(did_set_md_closing)
  
Yu Kuai Feb. 18, 2024, 7:47 a.m. UTC | #5
Hi,

在 2024/02/17 3:03, Song Liu 写道:
> On Thu, Feb 8, 2024 at 4:49 PM Song Liu <song@kernel.org> wrote:
>>
>> On Thu, Feb 8, 2024 at 12:44 AM Li Nan <linan666@huaweicloud.com> wrote:
>>>
>>>
>>>
>>> 在 2024/2/8 14:50, Song Liu 写道:
>>>> On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
>>>>>
>>>>> From: Li Nan <linan122@huawei.com>
>>>>>
>>>>> 'open_mutex' of gendisk is used to protect open/close block devices. But
>>>>> in bd_link_disk_holder(), it is used to protect the creation of symlink
>>>>> between holding disk and slave bdev, which introduces some issues.
>>>>>
>>>>> When bd_link_disk_holder() is called, the driver is usually in the process
>>>>> of initialization/modification and may suspend submitting io. At this
>>>>> time, any io hold 'open_mutex', such as scanning partitions, can cause
>>>>> deadlocks. For example, in raid:
>>>>>
>>>>> T1                              T2
>>>>> bdev_open_by_dev
>>>>>    lock open_mutex [1]
>>>>>    ...
>>>>>     efi_partition
>>>>>     ...
>>>>>      md_submit_bio
>>>>>                                   md_ioctl mddev_syspend
>>>>>                                     -> suspend all io
>>>>>                                    md_add_new_disk
>>>>>                                     bind_rdev_to_array
>>>>>                                      bd_link_disk_holder
>>>>>                                       try lock open_mutex [2]
>>>>>       md_handle_request
>>>>>        -> wait mddev_resume
>>>>>
>>>>> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
>>>>> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
>>>>>
>>>>> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.
>>>>
>>>> Is this to fix [1]? Do we need some Fixes and/or Closes tags?
>>>>
>>>
>>> No. Just use another way to fix [2], and both [2] and this patch can fix
>>> the issue. I am not sure about the root cause of [1] yet.
>>>
>>> [2] https://patchwork.kernel.org/project/linux-raid/list/?series=812045
>>>
>>>> Could you please add steps to reproduce this issue?
>>>
>>> We need to modify the kernel, add sleep in md_submit_bio() and md_ioctl()
>>> as below, and then:
>>>     1. mdadm -CR /dev/md0 -l1 -n2 /dev/sd[bc]  #create a raid
>>>     2. echo 1 > /sys/module/md_mod/parameters/error_inject  #enable sleep
>>>     3. 'mdadm --add /dev/md0 /dev/sda'  #add a disk to raid
>>>     4. submit ioctl BLKRRPART to raid within 10s.
>>
>> The analysis makes sense. I also hit the issue a couple times without adding
>> extra delays. But I am not sure whether this is the best fix (I didn't find real
>> issues with it either).
> 
> To be extra safe and future proof, we can do something like the
> following to only
> suspend the array for ADD_NEW_DISK on not-running arrays.
> 
> This appear to solve the problem reported in
> 
> https://bugzilla.kernel.org/show_bug.cgi?id=218459
> 
> Thanks,
> Song
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 9e41a9aaba8b..395911d5f4d6 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7570,10 +7570,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
>          }
>   }
> 
> -static bool md_ioctl_need_suspend(unsigned int cmd)
> +static bool md_ioctl_need_suspend(struct mddev *mddev, unsigned int cmd)
>   {
>          switch (cmd) {
>          case ADD_NEW_DISK:
> +               return mddev->pers != NULL;

Did you already check that this problem is not related to 'active_io'
being leaked for flush IO?

I don't understand the problem reported yet. If 'mddev->pers' is not set
yet, md_submit_bio() will return directly, and 'active_io' should not be
grabbed in the first place.

md_run() is the only place to convert 'mddev->pers' from NULL to a real
personality, and it's protected by 'reconfig_mutex', however,
md_ioctl_need_suspend() is called without 'reconfig_mutex', hence there
is a race condition:

md_ioctl_need_suspend		array_state_store
  // mddev->pers is NULL, return false
				 mddev_lock
				 do_md_run
				  mddev->pers = xxx
				 mddev_unlock

  // mddev_suspend is not called
  mddev_lock
  md_add_new_disk
   if (mddev->pers)
    md_import_device
    bind_rdev_to_array
    add_bound_rdev
     mddev->pers->hot_add_disk
     -> hot add disk without suspending

Thanks,
Kuai

>          case HOT_ADD_DISK:
>          case HOT_REMOVE_DISK:
>          case SET_BITMAP_FILE:
> @@ -7625,6 +7626,7 @@ static int md_ioctl(struct block_device *bdev,
> blk_mode_t mode,
>          void __user *argp = (void __user *)arg;
>          struct mddev *mddev = NULL;
>          bool did_set_md_closing = false;
> +       bool need_suspend;
> 
>          if (!md_ioctl_valid(cmd))
>                  return -ENOTTY;
> @@ -7716,8 +7718,11 @@ static int md_ioctl(struct block_device *bdev,
> blk_mode_t mode,
>          if (!md_is_rdwr(mddev))
>                  flush_work(&mddev->sync_work);
> 
> -       err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
> -                                          mddev_lock(mddev);
> +       need_suspend = md_ioctl_need_suspend(mddev, cmd);
> +       if (need_suspend)
> +               err = mddev_suspend_and_lock(mddev);
> +       else
> +               err = mddev_lock(mddev);
>          if (err) {
>                  pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
>                           err, cmd);
> @@ -7846,8 +7851,10 @@ static int md_ioctl(struct block_device *bdev,
> blk_mode_t mode,
>              err != -EINVAL)
>                  mddev->hold_active = 0;
> 
> -       md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
> -                                    mddev_unlock(mddev);
> +       if (need_suspend)
> +               mddev_unlock_and_resume(mddev);
> +       else
> +               mddev_unlock(mddev);
> 
>   out:
>          if(did_set_md_closing)
> .
>
  
Song Liu Feb. 19, 2024, 5:14 a.m. UTC | #6
On Sat, Feb 17, 2024 at 11:47 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2024/02/17 3:03, Song Liu 写道:
> > On Thu, Feb 8, 2024 at 4:49 PM Song Liu <song@kernel.org> wrote:
> >>
> >> On Thu, Feb 8, 2024 at 12:44 AM Li Nan <linan666@huaweicloud.com> wrote:
> >>>
> >>>
> >>>
> >>> 在 2024/2/8 14:50, Song Liu 写道:
> >>>> On Wed, Feb 7, 2024 at 1:32 AM <linan666@huaweicloud.com> wrote:
> >>>>>
> >>>>> From: Li Nan <linan122@huawei.com>
> >>>>>
> >>>>> 'open_mutex' of gendisk is used to protect open/close block devices. But
> >>>>> in bd_link_disk_holder(), it is used to protect the creation of symlink
> >>>>> between holding disk and slave bdev, which introduces some issues.
> >>>>>
> >>>>> When bd_link_disk_holder() is called, the driver is usually in the process
> >>>>> of initialization/modification and may suspend submitting io. At this
> >>>>> time, any io hold 'open_mutex', such as scanning partitions, can cause
> >>>>> deadlocks. For example, in raid:
> >>>>>
> >>>>> T1                              T2
> >>>>> bdev_open_by_dev
> >>>>>    lock open_mutex [1]
> >>>>>    ...
> >>>>>     efi_partition
> >>>>>     ...
> >>>>>      md_submit_bio
> >>>>>                                   md_ioctl mddev_syspend
> >>>>>                                     -> suspend all io
> >>>>>                                    md_add_new_disk
> >>>>>                                     bind_rdev_to_array
> >>>>>                                      bd_link_disk_holder
> >>>>>                                       try lock open_mutex [2]
> >>>>>       md_handle_request
> >>>>>        -> wait mddev_resume
> >>>>>
> >>>>> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
> >>>>> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
> >>>>>
> >>>>> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.
> >>>>
> >>>> Is this to fix [1]? Do we need some Fixes and/or Closes tags?
> >>>>
> >>>
> >>> No. Just use another way to fix [2], and both [2] and this patch can fix
> >>> the issue. I am not sure about the root cause of [1] yet.
> >>>
> >>> [2] https://patchwork.kernel.org/project/linux-raid/list/?series=812045
> >>>
> >>>> Could you please add steps to reproduce this issue?
> >>>
> >>> We need to modify the kernel, add sleep in md_submit_bio() and md_ioctl()
> >>> as below, and then:
> >>>     1. mdadm -CR /dev/md0 -l1 -n2 /dev/sd[bc]  #create a raid
> >>>     2. echo 1 > /sys/module/md_mod/parameters/error_inject  #enable sleep
> >>>     3. 'mdadm --add /dev/md0 /dev/sda'  #add a disk to raid
> >>>     4. submit ioctl BLKRRPART to raid within 10s.
> >>
> >> The analysis makes sense. I also hit the issue a couple times without adding
> >> extra delays. But I am not sure whether this is the best fix (I didn't find real
> >> issues with it either).
> >
> > To be extra safe and future proof, we can do something like the
> > following to only
> > suspend the array for ADD_NEW_DISK on not-running arrays.
> >
> > This appear to solve the problem reported in
> >
> > https://bugzilla.kernel.org/show_bug.cgi?id=218459
> >
> > Thanks,
> > Song
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 9e41a9aaba8b..395911d5f4d6 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -7570,10 +7570,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
> >          }
> >   }
> >
> > -static bool md_ioctl_need_suspend(unsigned int cmd)
> > +static bool md_ioctl_need_suspend(struct mddev *mddev, unsigned int cmd)
> >   {
> >          switch (cmd) {
> >          case ADD_NEW_DISK:
> > +               return mddev->pers != NULL;
>
> Did you check already that this problem is not related that 'active_io'
> is leaked for flush IO?
>
> I don't understand the problem reported yet. If 'mddev->pers' is not set
> yet, md_submit_bio() will return directly, and 'active_io' should not be
> grabbed in the first place.

AFAICT, this is not related to the active_io issue.

>
> md_run() is the only place to convert 'mddev->pers' from NULL to a real
> personality, and it's protected by 'reconfig_mutex', however,
> md_ioctl_need_suspend() is called without 'reconfig_mutex', hence there
> is a race condition:
>
> md_ioctl_need_suspend           array_state_store
>   // mddev->pers is NULL, return false
>                                  mddev_lock
>                                  do_md_run
>                                   mddev->pers = xxx
>                                  mddev_unlock
>
>   // mddev_suspend is not called
>   mddev_lock
>   md_add_new_disk
>    if (mddev->pers)
>     md_import_device
>     bind_rdev_to_array
>     add_bound_rdev
>      mddev->pers->hot_add_disk
>      -> hot add disk without suspending

Yeah, this race condition exists. We probably need some
trick with suspend and lock here.

Thanks,
Song
  
Yu Kuai Feb. 19, 2024, 8:53 a.m. UTC | #7
Hi, Christoph

在 2024/02/07 17:27, linan666@huaweicloud.com 写道:
> From: Li Nan <linan122@huawei.com>
> 
> 'open_mutex' of gendisk is used to protect open/close block devices. But
> in bd_link_disk_holder(), it is used to protect the creation of symlink
> between holding disk and slave bdev, which introduces some issues.
> 
> When bd_link_disk_holder() is called, the driver is usually in the process
> of initialization/modification and may suspend submitting io. At this
> time, any io hold 'open_mutex', such as scanning partitions, can cause
> deadlocks. For example, in raid:
> 
> T1                              T2
> bdev_open_by_dev
>   lock open_mutex [1]
>   ...
>    efi_partition
>    ...
>     md_submit_bio
> 				md_ioctl mddev_syspend
> 				  -> suspend all io
> 				 md_add_new_disk
> 				  bind_rdev_to_array
> 				   bd_link_disk_holder
> 				    try lock open_mutex [2]
>      md_handle_request
>       -> wait mddev_resume
> 
> T1 scan partition, T2 add a new device to raid. T1 waits for T2 to resume
> mddev, but T2 waits for open_mutex held by T1. Deadlock occurs.
> 
> Fix it by introducing a local mutex 'holder_mutex' to replace 'open_mutex'.

Can you take a look at this patch? I think for raid (and perhaps dm and
other drivers), it's reasonable to suspend IO while hot adding new
underlying disks. And I think adding new slaves to a holder is not related
to opening the holder disk, because the caller should already have opened
the holder disk in order to hot add slaves, hence 'open_mutex' for the
holder is not necessary here.

Actually bd_link_disk_holder() is protected by 'reconfig_mutex' for
raid, and 'table_devices_lock' for dm (I'm not sure yet if other drivers
have a similar lock).

For raid, we can also fix this problem in raid itself by delaying
bd_link_disk_holder() until the array is not suspended; however, we'll
consider this fix later if you think this patch is not acceptable.

Thanks,
Kuai

> 
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>   block/holder.c | 12 +++++++-----
>   1 file changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/block/holder.c b/block/holder.c
> index 37d18c13d958..5bfb0a674cc7 100644
> --- a/block/holder.c
> +++ b/block/holder.c
> @@ -8,6 +8,8 @@ struct bd_holder_disk {
>   	int			refcnt;
>   };
>   
> +static DEFINE_MUTEX(holder_mutex);
> +
>   static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
>   						  struct gendisk *disk)
>   {
> @@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>   	kobject_get(bdev->bd_holder_dir);
>   	mutex_unlock(&bdev->bd_disk->open_mutex);
>   
> -	mutex_lock(&disk->open_mutex);
> +	mutex_lock(&holder_mutex);
>   	WARN_ON_ONCE(!bdev->bd_holder);
>   
>   	holder = bd_find_holder_disk(bdev, disk);
> @@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>   		goto out_del_symlink;
>   	list_add(&holder->list, &disk->slave_bdevs);
>   
> -	mutex_unlock(&disk->open_mutex);
> +	mutex_unlock(&holder_mutex);
>   	return 0;
>   
>   out_del_symlink:
> @@ -116,7 +118,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
>   out_free_holder:
>   	kfree(holder);
>   out_unlock:
> -	mutex_unlock(&disk->open_mutex);
> +	mutex_unlock(&holder_mutex);
>   	if (ret)
>   		kobject_put(bdev->bd_holder_dir);
>   	return ret;
> @@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
>   	if (WARN_ON_ONCE(!disk->slave_dir))
>   		return;
>   
> -	mutex_lock(&disk->open_mutex);
> +	mutex_lock(&holder_mutex);
>   	holder = bd_find_holder_disk(bdev, disk);
>   	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
>   		del_symlink(disk->slave_dir, bdev_kobj(bdev));
> @@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
>   		list_del_init(&holder->list);
>   		kfree(holder);
>   	}
> -	mutex_unlock(&disk->open_mutex);
> +	mutex_unlock(&holder_mutex);
>   }
>   EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
>
  
Christoph Hellwig Feb. 20, 2024, 8:09 a.m. UTC | #8
On Mon, Feb 19, 2024 at 04:53:36PM +0800, Yu Kuai wrote:
> Can you take a look at this patch? I think for raid(perhaps and dm and
> other drivers), it's reasonable to suspend IO while hot adding new
> underlying disks. And I think add new slaves to holder is not related to
> open the holder disk, because caller should already open the holder disk
> to hot add slaves, hence 'open_mutex' for holder is not necessary here.
>
> Actually bd_link_disk_holder() is protected by 'reconfig_mutex' for
> raid, and 'table_devices_lock' for dm(I'm not sure yet if other drivers
> have similiar lock).
>
> For raid, we do can fix this problem in raid by delay
> bd_link_disk_holder() while the array is not suspended, however, we'll
> consider this fix later if you think this patch is not acceptable.

Yes, not taking open_lock here seems reasonable, open_lock or its
previous name has always been a bit of a catchall without very well
defined semantics.  I'd give the symbol a blk_ prefix, though.
  

Patch

diff --git a/block/holder.c b/block/holder.c
index 37d18c13d958..5bfb0a674cc7 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -8,6 +8,8 @@  struct bd_holder_disk {
 	int			refcnt;
 };
 
+static DEFINE_MUTEX(holder_mutex);
+
 static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
 						  struct gendisk *disk)
 {
@@ -80,7 +82,7 @@  int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	kobject_get(bdev->bd_holder_dir);
 	mutex_unlock(&bdev->bd_disk->open_mutex);
 
-	mutex_lock(&disk->open_mutex);
+	mutex_lock(&holder_mutex);
 	WARN_ON_ONCE(!bdev->bd_holder);
 
 	holder = bd_find_holder_disk(bdev, disk);
@@ -108,7 +110,7 @@  int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		goto out_del_symlink;
 	list_add(&holder->list, &disk->slave_bdevs);
 
-	mutex_unlock(&disk->open_mutex);
+	mutex_unlock(&holder_mutex);
 	return 0;
 
 out_del_symlink:
@@ -116,7 +118,7 @@  int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 out_free_holder:
 	kfree(holder);
 out_unlock:
-	mutex_unlock(&disk->open_mutex);
+	mutex_unlock(&holder_mutex);
 	if (ret)
 		kobject_put(bdev->bd_holder_dir);
 	return ret;
@@ -140,7 +142,7 @@  void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	if (WARN_ON_ONCE(!disk->slave_dir))
 		return;
 
-	mutex_lock(&disk->open_mutex);
+	mutex_lock(&holder_mutex);
 	holder = bd_find_holder_disk(bdev, disk);
 	if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
 		del_symlink(disk->slave_dir, bdev_kobj(bdev));
@@ -149,6 +151,6 @@  void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		list_del_init(&holder->list);
 		kfree(holder);
 	}
-	mutex_unlock(&disk->open_mutex);
+	mutex_unlock(&holder_mutex);
 }
 EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);