[2/2] md: create symlink with disk holder after mddev resume

Message ID 20231221071109.1562530-3-linan666@huaweicloud.com
State New
Headers
Series [1/2] md: fix WARN_ON if create symlink fail in bind_rdev_to_array() |

Commit Message

Li Nan Dec. 21, 2023, 7:11 a.m. UTC
  From: Li Nan <linan122@huawei.com>

There is a risk of deadlock when a process gets disk->open_mutex after
suspending mddev, because other processes may hold open_mutex while
submitting io. For example:

  T1				T2
  blkdev_open
   bdev_open_by_dev
    mutex_lock(&disk->open_mutex)
  				md_ioctl
  				 mddev_suspend_and_lock
  				  mddev_suspend
  				 md_add_new_disk
  				  bind_rdev_to_array
  				   bd_link_disk_holder
  				    //wait open_mutex
    blkdev_get_whole
     bdev_disk_changed
      efi_partition
       read_lba
        ...
         md_submit_bio
          md_handle_request
           //wait resume

Fix it by getting disk->open_mutex after mddev resume, iterating each
mddev->disk to create symlink for rdev which has not been created yet.
and moving bd_unlink_disk_holder() to mddev_unlock(), rdev has been
deleted from mddev->disks here, which can avoid concurrent bind and unbind,

Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls involed array reconfiguration")
Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/md.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)
  

Comments

Yu Kuai Dec. 21, 2023, 8:49 a.m. UTC | #1
Hi,

在 2023/12/21 15:11, linan666@huaweicloud.com 写道:
> From: Li Nan <linan122@huawei.com>
> 
> There is a risk of deadlock when a process gets disk->open_mutex after
> suspending mddev, because other processes may hold open_mutex while
> submitting io. For example:
> 
>    T1				T2
>    blkdev_open
>     bdev_open_by_dev
>      mutex_lock(&disk->open_mutex)
>    				md_ioctl
>    				 mddev_suspend_and_lock
>    				  mddev_suspend
>    				 md_add_new_disk
>    				  bind_rdev_to_array
>    				   bd_link_disk_holder
>    				    //wait open_mutex
>      blkdev_get_whole
>       bdev_disk_changed
>        efi_partition
>         read_lba
>          ...
>           md_submit_bio
>            md_handle_request
>             //wait resume
> 

Nice catch! This patch looks good except that the new flag
'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
will make more sense.

Thanks,
Kuai

> Fix it by getting disk->open_mutex after mddev resume, iterating each
> mddev->disk to create symlink for rdev which has not been created yet.
> and moving bd_unlink_disk_holder() to mddev_unlock(), rdev has been
> deleted from mddev->disks here, which can avoid concurrent bind and unbind,
> 
> Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls involed array reconfiguration")
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>   drivers/md/md.c | 39 +++++++++++++++++++++++++++++----------
>   1 file changed, 29 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index d6612b922c76..c128570f2a5d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -521,6 +521,20 @@ void mddev_resume(struct mddev *mddev)
>   }
>   EXPORT_SYMBOL_GPL(mddev_resume);
>   
> +static void md_link_disk_holder(struct mddev *mddev)
> +{
> +	struct md_rdev *rdev;
> +
> +	rcu_read_lock();
> +	rdev_for_each_rcu(rdev, mddev) {
> +		if (test_bit(SymlinkCreated, &rdev->flags))
> +			continue;
> +		if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
> +			set_bit(SymlinkCreated, &rdev->flags);
> +	}
> +	rcu_read_unlock();
> +}
> +
>   /*
>    * Generic flush handling for md
>    */
> @@ -902,6 +916,11 @@ void mddev_unlock(struct mddev *mddev)
>   
>   	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
>   		list_del_init(&rdev->same_set);
> +		if (test_bit(SymlinkCreated, &rdev->flags)) {
> +			bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
> +			clear_bit(SymlinkCreated, &rdev->flags);
> +		}
> +		rdev->mddev = NULL;
>   		kobject_del(&rdev->kobj);
>   		export_rdev(rdev, mddev);
>   	}
> @@ -2526,8 +2545,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
>   		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
>   
>   	list_add_rcu(&rdev->same_set, &mddev->disks);
> -	if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
> -		set_bit(SymlinkCreated, &rdev->flags);
>   
>   	/* May as well allow recovery to be retried once */
>   	mddev->recovery_disabled++;
> @@ -2562,14 +2579,9 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
>   {
>   	struct mddev *mddev = rdev->mddev;
>   
> -	if (test_bit(SymlinkCreated, &rdev->flags)) {
> -		bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
> -		clear_bit(SymlinkCreated, &rdev->flags);
> -	}
>   	list_del_rcu(&rdev->same_set);
>   	pr_debug("md: unbind<%pg>\n", rdev->bdev);
>   	mddev_destroy_serial_pool(rdev->mddev, rdev);
> -	rdev->mddev = NULL;
>   	sysfs_remove_link(&rdev->kobj, "block");
>   	sysfs_put(rdev->sysfs_state);
>   	sysfs_put(rdev->sysfs_unack_badblocks);
> @@ -4667,8 +4679,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
>   	if (err)
>   		export_rdev(rdev, mddev);
>   	mddev_unlock_and_resume(mddev);
> -	if (!err)
> +	if (!err) {
> +		md_link_disk_holder(mddev);
>   		md_new_event();
> +	}
>   	return err ? err : len;
>   }
>   
> @@ -6606,6 +6620,7 @@ static void autorun_devices(int part)
>   			}
>   			autorun_array(mddev);
>   			mddev_unlock_and_resume(mddev);
> +			md_link_disk_holder(mddev);
>   		}
>   		/* on success, candidates will be empty, on error
>   		 * it won't...
> @@ -7832,8 +7847,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
>   	    err != -EINVAL)
>   		mddev->hold_active = 0;
>   
> -	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
> -				     mddev_unlock(mddev);
> +	if (md_ioctl_need_suspend(cmd)) {
> +		mddev_unlock_and_resume(mddev);
> +		md_link_disk_holder(mddev);
> +	} else {
> +		mddev_unlock(mddev);
> +	}
>   
>   out:
>   	if(did_set_md_closing)
>
  
Linux regression tracking (Thorsten Leemhuis) Feb. 6, 2024, 2:46 p.m. UTC | #2
Hi, Thorsten here, the Linux kernel's regression tracker.

On 21.12.23 09:49, Yu Kuai wrote:
> 在 2023/12/21 15:11, linan666@huaweicloud.com 写道:
>> From: Li Nan <linan122@huawei.com>
>>
>> There is a risk of deadlock when a process gets disk->open_mutex after
>> suspending mddev, because other processes may hold open_mutex while
>> submitting io. For example:
>> [...]
> Nice catch! This patch looks good except that the new flag
> 'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
> will make more sense.
> 
>> Fix it by getting disk->open_mutex after mddev resume, iterating each
>> mddev->disk to create symlink for rdev which has not been created yet.
>> and moving bd_unlink_disk_holder() to mddev_unlock(), rdev has been
>> deleted from mddev->disks here, which can avoid concurrent bind and
>> unbind,
>>
>> Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls
>> involed array reconfiguration")

Hey, what happened to that patch? It looks a lot like things stalled
here. I'm asking, because there is a regression report that claims
1b0a2d950ee2 to be the culprit that might or might not be causes by the
problem this patch tries to fix:
https://bugzilla.kernel.org/show_bug.cgi?id=218459

Ciao, Thorsten

>> Signed-off-by: Li Nan <linan122@huawei.com>
>> ---
>>   drivers/md/md.c | 39 +++++++++++++++++++++++++++++----------
>>   1 file changed, 29 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index d6612b922c76..c128570f2a5d 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -521,6 +521,20 @@ void mddev_resume(struct mddev *mddev)
>>   }
>>   EXPORT_SYMBOL_GPL(mddev_resume);
>>   +static void md_link_disk_holder(struct mddev *mddev)
>> +{
>> +    struct md_rdev *rdev;
>> +
>> +    rcu_read_lock();
>> +    rdev_for_each_rcu(rdev, mddev) {
>> +        if (test_bit(SymlinkCreated, &rdev->flags))
>> +            continue;
>> +        if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
>> +            set_bit(SymlinkCreated, &rdev->flags);
>> +    }
>> +    rcu_read_unlock();
>> +}
>> +
>>   /*
>>    * Generic flush handling for md
>>    */
>> @@ -902,6 +916,11 @@ void mddev_unlock(struct mddev *mddev)
>>         list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
>>           list_del_init(&rdev->same_set);
>> +        if (test_bit(SymlinkCreated, &rdev->flags)) {
>> +            bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
>> +            clear_bit(SymlinkCreated, &rdev->flags);
>> +        }
>> +        rdev->mddev = NULL;
>>           kobject_del(&rdev->kobj);
>>           export_rdev(rdev, mddev);
>>       }
>> @@ -2526,8 +2545,6 @@ static int bind_rdev_to_array(struct md_rdev
>> *rdev, struct mddev *mddev)
>>           sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
>>         list_add_rcu(&rdev->same_set, &mddev->disks);
>> -    if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
>> -        set_bit(SymlinkCreated, &rdev->flags);
>>         /* May as well allow recovery to be retried once */
>>       mddev->recovery_disabled++;
>> @@ -2562,14 +2579,9 @@ static void md_kick_rdev_from_array(struct
>> md_rdev *rdev)
>>   {
>>       struct mddev *mddev = rdev->mddev;
>>   -    if (test_bit(SymlinkCreated, &rdev->flags)) {
>> -        bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
>> -        clear_bit(SymlinkCreated, &rdev->flags);
>> -    }
>>       list_del_rcu(&rdev->same_set);
>>       pr_debug("md: unbind<%pg>\n", rdev->bdev);
>>       mddev_destroy_serial_pool(rdev->mddev, rdev);
>> -    rdev->mddev = NULL;
>>       sysfs_remove_link(&rdev->kobj, "block");
>>       sysfs_put(rdev->sysfs_state);
>>       sysfs_put(rdev->sysfs_unack_badblocks);
>> @@ -4667,8 +4679,10 @@ new_dev_store(struct mddev *mddev, const char
>> *buf, size_t len)
>>       if (err)
>>           export_rdev(rdev, mddev);
>>       mddev_unlock_and_resume(mddev);
>> -    if (!err)
>> +    if (!err) {
>> +        md_link_disk_holder(mddev);
>>           md_new_event();
>> +    }
>>       return err ? err : len;
>>   }
>>   @@ -6606,6 +6620,7 @@ static void autorun_devices(int part)
>>               }
>>               autorun_array(mddev);
>>               mddev_unlock_and_resume(mddev);
>> +            md_link_disk_holder(mddev);
>>           }
>>           /* on success, candidates will be empty, on error
>>            * it won't...
>> @@ -7832,8 +7847,12 @@ static int md_ioctl(struct block_device *bdev,
>> blk_mode_t mode,
>>           err != -EINVAL)
>>           mddev->hold_active = 0;
>>   -    md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
>> -                     mddev_unlock(mddev);
>> +    if (md_ioctl_need_suspend(cmd)) {
>> +        mddev_unlock_and_resume(mddev);
>> +        md_link_disk_holder(mddev);
>> +    } else {
>> +        mddev_unlock(mddev);
>> +    }
>>     out:
>>       if(did_set_md_closing)
>>
>
  
Song Liu Feb. 6, 2024, 5 p.m. UTC | #3
On Tue, Feb 6, 2024 at 6:46 AM Linux regression tracking (Thorsten
Leemhuis) <regressions@leemhuis.info> wrote:
>
> Hi, Thorsten here, the Linux kernel's regression tracker.
>
> On 21.12.23 09:49, Yu Kuai wrote:
> > 在 2023/12/21 15:11, linan666@huaweicloud.com 写道:
> >> From: Li Nan <linan122@huawei.com>
> >>
> >> There is a risk of deadlock when a process gets disk->open_mutex after
> >> suspending mddev, because other processes may hold open_mutex while
> >> submitting io. For example:
> >> [...]
> > Nice catch! This patch looks good except that the new flag
> > 'SymlinkCreated' doesn't look accurate, perhaps 'HolderLinked'
> > will make more sense.
> >
> >> Fix it by getting disk->open_mutex after mddev resume, iterating each
> >> mddev->disk to create symlink for rdev which has not been created yet.
> >> and moving bd_unlink_disk_holder() to mddev_unlock(), rdev has been
> >> deleted from mddev->disks here, which can avoid concurrent bind and
> >> unbind,
> >>
> >> Fixes: 1b0a2d950ee2 ("md: use new apis to suspend array for ioctls
> >> involed array reconfiguration")
>
> Hey, what happened to that patch? It looks a lot like things stalled
> here. I'm asking, because there is a regression report that claims
> 1b0a2d950ee2 to be the culprit that might or might not be causes by the
> problem this patch tries to fix:
> https://bugzilla.kernel.org/show_bug.cgi?id=218459

Thanks for the heads-up. Replied to the thread.

Song
  

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d6612b922c76..c128570f2a5d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -521,6 +521,20 @@  void mddev_resume(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_resume);
 
+static void md_link_disk_holder(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev) {
+		if (test_bit(SymlinkCreated, &rdev->flags))
+			continue;
+		if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
+			set_bit(SymlinkCreated, &rdev->flags);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Generic flush handling for md
  */
@@ -902,6 +916,11 @@  void mddev_unlock(struct mddev *mddev)
 
 	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
 		list_del_init(&rdev->same_set);
+		if (test_bit(SymlinkCreated, &rdev->flags)) {
+			bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
+			clear_bit(SymlinkCreated, &rdev->flags);
+		}
+		rdev->mddev = NULL;
 		kobject_del(&rdev->kobj);
 		export_rdev(rdev, mddev);
 	}
@@ -2526,8 +2545,6 @@  static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks");
 
 	list_add_rcu(&rdev->same_set, &mddev->disks);
-	if (!bd_link_disk_holder(rdev->bdev, mddev->gendisk))
-		set_bit(SymlinkCreated, &rdev->flags);
 
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled++;
@@ -2562,14 +2579,9 @@  static void md_kick_rdev_from_array(struct md_rdev *rdev)
 {
 	struct mddev *mddev = rdev->mddev;
 
-	if (test_bit(SymlinkCreated, &rdev->flags)) {
-		bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
-		clear_bit(SymlinkCreated, &rdev->flags);
-	}
 	list_del_rcu(&rdev->same_set);
 	pr_debug("md: unbind<%pg>\n", rdev->bdev);
 	mddev_destroy_serial_pool(rdev->mddev, rdev);
-	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
 	sysfs_put(rdev->sysfs_unack_badblocks);
@@ -4667,8 +4679,10 @@  new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		export_rdev(rdev, mddev);
 	mddev_unlock_and_resume(mddev);
-	if (!err)
+	if (!err) {
+		md_link_disk_holder(mddev);
 		md_new_event();
+	}
 	return err ? err : len;
 }
 
@@ -6606,6 +6620,7 @@  static void autorun_devices(int part)
 			}
 			autorun_array(mddev);
 			mddev_unlock_and_resume(mddev);
+			md_link_disk_holder(mddev);
 		}
 		/* on success, candidates will be empty, on error
 		 * it won't...
@@ -7832,8 +7847,12 @@  static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
 	    err != -EINVAL)
 		mddev->hold_active = 0;
 
-	md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
-				     mddev_unlock(mddev);
+	if (md_ioctl_need_suspend(cmd)) {
+		mddev_unlock_and_resume(mddev);
+		md_link_disk_holder(mddev);
+	} else {
+		mddev_unlock(mddev);
+	}
 
 out:
 	if(did_set_md_closing)