md/raid10: prioritize adding disk to 'removed' mirror

Message ID 20230527092007.3008856-1-linan666@huaweicloud.com
State New
Headers
Series md/raid10: prioritize adding disk to 'removed' mirror |

Commit Message

Li Nan May 27, 2023, 9:20 a.m. UTC
  From: Li Nan <linan122@huawei.com>

When adding a new disk to raid10, it will traverse conf->mirror from the
start and find one of the following mirrors to add to:
  1. mirror->rdev is set to WantReplacement and it has no replacement,
     set the new disk to mirror->replacement.
  2. no mirror->rdev, set the new disk to mirror->rdev.

There is an array as below (sda is set to WantReplacement):

    Number   Major   Minor   RaidDevice State
       0       8        0        0      active sync set-A   /dev/sda
       -       0        0        1      removed
       2       8       32        2      active sync set-A   /dev/sdc
       3       8       48        3      active sync set-B   /dev/sdd

Use 'mdadm --add' to add a new disk to this array, and the new disk will
become sda's replacement instead of being added to the removed position,
which is confusing for users. Meanwhile, after the new disk's recovery
succeeds, sda will be set to Faulty.

Prioritizing adding the disk to a 'removed' mirror is a better choice.
In the above scenario, the behavior is the same as before, except sda
will not be deleted. Before other disks are added, continued use of sda
is more reliable.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/raid10.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)
  

Comments

Yu Kuai May 29, 2023, 1 p.m. UTC | #1
Hi,

在 2023/05/27 17:20, linan666@huaweicloud.com 写道:
> From: Li Nan <linan122@huawei.com>
> 
> When add a new disk to raid10, it will traverse conf->mirror from start
> and find one of the following mirror to add:
>    1. mirror->rdev is set to WantReplacement and it have no replacement,
>       set new disk to mirror->replacement.
>    2. no mirror->rdev, set new disk to mirror->rdev.
> 
> There is a array as below (sda is set to WantReplacement):
> 
>      Number   Major   Minor   RaidDevice State
>         0       8        0        0      active sync set-A   /dev/sda
>         -       0        0        1      removed
>         2       8       32        2      active sync set-A   /dev/sdc
>         3       8       48        3      active sync set-B   /dev/sdd
> 
> Use 'mdadm --add' to add a new disk to this array, the new disk will
> become sda's replacement instead of add to removed position, which is
> confusing for users. Meanwhile, after new disk recovery success, sda
> will be set to Faulty.
> 
> Prioritize adding disk to 'removed' mirror is a better choice. In the
> above scenario, the behavior is the same as before, except sda will not
> be deleted. Before other disks are added, continued use sda is more
> reliable.
> 

I think this change makes sense; however, it's better to do this for all
personalities instead of just raid10.

Thanks,
Kuai
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>   drivers/md/raid10.c | 35 ++++++++++++++++++++---------------
>   1 file changed, 20 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 4fcfcb350d2b..d90eb830ca1a 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -2148,9 +2148,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>   {
>   	struct r10conf *conf = mddev->private;
>   	int err = -EEXIST;
> -	int mirror;
> +	int mirror, repl_slot = -1;
>   	int first = 0;
>   	int last = conf->geo.raid_disks - 1;
> +	struct raid10_info *p;
>   
>   	if (mddev->recovery_cp < MaxSector)
>   		/* only hot-add to in-sync arrays, as recovery is
> @@ -2173,23 +2174,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>   	else
>   		mirror = first;
>   	for ( ; mirror <= last ; mirror++) {
> -		struct raid10_info *p = &conf->mirrors[mirror];
> +		p = &conf->mirrors[mirror];
>   		if (p->recovery_disabled == mddev->recovery_disabled)
>   			continue;
>   		if (p->rdev) {
> -			if (!test_bit(WantReplacement, &p->rdev->flags) ||
> -			    p->replacement != NULL)
> -				continue;
> -			clear_bit(In_sync, &rdev->flags);
> -			set_bit(Replacement, &rdev->flags);
> -			rdev->raid_disk = mirror;
> -			err = 0;
> -			if (mddev->gendisk)
> -				disk_stack_limits(mddev->gendisk, rdev->bdev,
> -						  rdev->data_offset << 9);
> -			conf->fullsync = 1;
> -			rcu_assign_pointer(p->replacement, rdev);
> -			break;
> +			if (test_bit(WantReplacement, &p->rdev->flags) &&
> +			    p->replacement == NULL && repl_slot < 0)
> +				repl_slot = mirror;
> +			continue;
>   		}
>   
>   		if (mddev->gendisk)
> @@ -2206,6 +2198,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
>   		break;
>   	}
>   
> +	if (err && repl_slot >= 0) {
> +		p = &conf->mirrors[repl_slot];
> +		clear_bit(In_sync, &rdev->flags);
> +		set_bit(Replacement, &rdev->flags);
> +		rdev->raid_disk = repl_slot;
> +		err = 0;
> +		if (mddev->gendisk)
> +			disk_stack_limits(mddev->gendisk, rdev->bdev,
> +					  rdev->data_offset << 9);
> +		conf->fullsync = 1;
> +		rcu_assign_pointer(p->replacement, rdev);
> +	}
> +
>   	print_conf(conf);
>   	return err;
>   }
>
  
Li Nan May 29, 2023, 1:14 p.m. UTC | #2
在 2023/5/29 21:00, Yu Kuai 写道:
> Hi,
> 
> 在 2023/05/27 17:20, linan666@huaweicloud.com 写道:
>> From: Li Nan <linan122@huawei.com>
>>
>> When add a new disk to raid10, it will traverse conf->mirror from start
>> and find one of the following mirror to add:
>>    1. mirror->rdev is set to WantReplacement and it have no replacement,
>>       set new disk to mirror->replacement.
>>    2. no mirror->rdev, set new disk to mirror->rdev.
>>
>> There is a array as below (sda is set to WantReplacement):
>>
>>      Number   Major   Minor   RaidDevice State
>>         0       8        0        0      active sync set-A   /dev/sda
>>         -       0        0        1      removed
>>         2       8       32        2      active sync set-A   /dev/sdc
>>         3       8       48        3      active sync set-B   /dev/sdd
>>
>> Use 'mdadm --add' to add a new disk to this array, the new disk will
>> become sda's replacement instead of add to removed position, which is
>> confusing for users. Meanwhile, after new disk recovery success, sda
>> will be set to Faulty.
>>
>> Prioritize adding disk to 'removed' mirror is a better choice. In the
>> above scenario, the behavior is the same as before, except sda will not
>> be deleted. Before other disks are added, continued use sda is more
>> reliable.
>>
> 
> I think this change make sense, however, it's better to do this for all
> personality instead of just for raid10.
> 
> Thanks,
> Kuai

raid5 currently behaves like this. If others are OK with these changes
to raid10, I will modify raid1 later.
  
Song Liu May 30, 2023, 10:17 p.m. UTC | #3
On Mon, May 29, 2023 at 6:14 AM Li Nan <linan666@huaweicloud.com> wrote:
>
>
>
> 在 2023/5/29 21:00, Yu Kuai 写道:
> > Hi,
> >
> > 在 2023/05/27 17:20, linan666@huaweicloud.com 写道:
> >> From: Li Nan <linan122@huawei.com>
> >>
> >> When add a new disk to raid10, it will traverse conf->mirror from start
> >> and find one of the following mirror to add:
> >>    1. mirror->rdev is set to WantReplacement and it have no replacement,
> >>       set new disk to mirror->replacement.
> >>    2. no mirror->rdev, set new disk to mirror->rdev.
> >>
> >> There is a array as below (sda is set to WantReplacement):
> >>
> >>      Number   Major   Minor   RaidDevice State
> >>         0       8        0        0      active sync set-A   /dev/sda
> >>         -       0        0        1      removed
> >>         2       8       32        2      active sync set-A   /dev/sdc
> >>         3       8       48        3      active sync set-B   /dev/sdd
> >>
> >> Use 'mdadm --add' to add a new disk to this array, the new disk will
> >> become sda's replacement instead of add to removed position, which is
> >> confusing for users. Meanwhile, after new disk recovery success, sda
> >> will be set to Faulty.
> >>
> >> Prioritize adding disk to 'removed' mirror is a better choice. In the
> >> above scenario, the behavior is the same as before, except sda will not
> >> be deleted. Before other disks are added, continued use sda is more
> >> reliable.
> >>
> >
> > I think this change make sense, however, it's better to do this for all
> > personality instead of just for raid10.
> >
> > Thanks,
> > Kuai
>
> raid5 is currently like this. If others are OK with this changes to
> raid10, I will modify raid1 later.

This change looks reasonable. Could you please add an mdadm test
to cover this case?

Applied to md-next.

Thanks,
Song
  

Patch

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4fcfcb350d2b..d90eb830ca1a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2148,9 +2148,10 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = -EEXIST;
-	int mirror;
+	int mirror, repl_slot = -1;
 	int first = 0;
 	int last = conf->geo.raid_disks - 1;
+	struct raid10_info *p;
 
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
@@ -2173,23 +2174,14 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		struct raid10_info *p = &conf->mirrors[mirror];
+		p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev) {
-			if (!test_bit(WantReplacement, &p->rdev->flags) ||
-			    p->replacement != NULL)
-				continue;
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Replacement, &rdev->flags);
-			rdev->raid_disk = mirror;
-			err = 0;
-			if (mddev->gendisk)
-				disk_stack_limits(mddev->gendisk, rdev->bdev,
-						  rdev->data_offset << 9);
-			conf->fullsync = 1;
-			rcu_assign_pointer(p->replacement, rdev);
-			break;
+			if (test_bit(WantReplacement, &p->rdev->flags) &&
+			    p->replacement == NULL && repl_slot < 0)
+				repl_slot = mirror;
+			continue;
 		}
 
 		if (mddev->gendisk)
@@ -2206,6 +2198,19 @@  static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		break;
 	}
 
+	if (err && repl_slot >= 0) {
+		p = &conf->mirrors[repl_slot];
+		clear_bit(In_sync, &rdev->flags);
+		set_bit(Replacement, &rdev->flags);
+		rdev->raid_disk = repl_slot;
+		err = 0;
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+		conf->fullsync = 1;
+		rcu_assign_pointer(p->replacement, rdev);
+	}
+
 	print_conf(conf);
 	return err;
 }