[v5,04/14] md: don't register sync_thread for reshape directly

Message ID 20240201092559.910982-5-yukuai1@huaweicloud.com
State New
Headers
Series [v5,01/14] md: don't ignore suspended array in md_check_recovery() |

Commit Message

Yu Kuai Feb. 1, 2024, 9:25 a.m. UTC
  From: Yu Kuai <yukuai3@huawei.com>

Currently, if reshape is interrupted, then reassemble the array will
register sync_thread directly from pers->run(), in this case
'MD_RECOVERY_RUNNING' is set directly, however, there is no guarantee
that md_do_sync() will be executed, hence stop_sync_thread() will hang
because 'MD_RECOVERY_RUNNING' can't be cleared.

Last patch make sure that md_do_sync() will set MD_RECOVERY_DONE,
however, following hang can still be triggered by dm-raid test
shell/lvconvert-raid-reshape.sh occasionally:

[root@fedora ~]# cat /proc/1982/stack
[<0>] stop_sync_thread+0x1ab/0x270 [md_mod]
[<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod]
[<0>] raid_presuspend+0x1e/0x70 [dm_raid]
[<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod]
[<0>] __dm_destroy+0x2a5/0x310 [dm_mod]
[<0>] dm_destroy+0x16/0x30 [dm_mod]
[<0>] dev_remove+0x165/0x290 [dm_mod]
[<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod]
[<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod]
[<0>] vfs_ioctl+0x21/0x60
[<0>] __x64_sys_ioctl+0xb9/0xe0
[<0>] do_syscall_64+0xc6/0x230
[<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74

Meanwhile mddev->recovery is:
MD_RECOVERY_RUNNING |
MD_RECOVERY_INTR |
MD_RECOVERY_RESHAPE |
MD_RECOVERY_FROZEN

Fix this problem by remove the code to register sync_thread directly
from raid10 and raid5. And let md_check_recovery() to register
sync_thread.

Fixes: f67055780caa ("[PATCH] md: Checkpoint and allow restart of raid5 reshape")
Fixes: f52f5c71f3d4 ("md: fix stopping sync thread")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md.c     |  5 ++++-
 drivers/md/raid10.c | 16 ++--------------
 drivers/md/raid5.c  | 29 ++---------------------------
 3 files changed, 8 insertions(+), 42 deletions(-)
  

Comments

Xiao Ni Feb. 28, 2024, 12:07 p.m. UTC | #1
在 2024/2/1 下午5:25, Yu Kuai 写道:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently, if reshape is interrupted, then reassemble the array will
> register sync_thread directly from pers->run(), in this case
> 'MD_RECOVERY_RUNNING' is set directly, however, there is no guarantee
> that md_do_sync() will be executed, hence stop_sync_thread() will hang
> because 'MD_RECOVERY_RUNNING' can't be cleared.


Hi Kuai

I have a question here. Is it the reason sync_thread can't run 
md_do_sync because kthread_should_stop, so it doesn't have the chance to 
set MD_RECOVERY_DONE? Why creating sync thread in md_check_recovery 
doesn't have this problem? Could you explain more about this?

Best Regards

Xiao

>
> Last patch make sure that md_do_sync() will set MD_RECOVERY_DONE,
> however, following hang can still be triggered by dm-raid test
> shell/lvconvert-raid-reshape.sh occasionally:
>
> [root@fedora ~]# cat /proc/1982/stack
> [<0>] stop_sync_thread+0x1ab/0x270 [md_mod]
> [<0>] md_frozen_sync_thread+0x5c/0xa0 [md_mod]
> [<0>] raid_presuspend+0x1e/0x70 [dm_raid]
> [<0>] dm_table_presuspend_targets+0x40/0xb0 [dm_mod]
> [<0>] __dm_destroy+0x2a5/0x310 [dm_mod]
> [<0>] dm_destroy+0x16/0x30 [dm_mod]
> [<0>] dev_remove+0x165/0x290 [dm_mod]
> [<0>] ctl_ioctl+0x4bb/0x7b0 [dm_mod]
> [<0>] dm_ctl_ioctl+0x11/0x20 [dm_mod]
> [<0>] vfs_ioctl+0x21/0x60
> [<0>] __x64_sys_ioctl+0xb9/0xe0
> [<0>] do_syscall_64+0xc6/0x230
> [<0>] entry_SYSCALL_64_after_hwframe+0x6c/0x74
>
> Meanwhile mddev->recovery is:
> MD_RECOVERY_RUNNING |
> MD_RECOVERY_INTR |
> MD_RECOVERY_RESHAPE |
> MD_RECOVERY_FROZEN
>
> Fix this problem by remove the code to register sync_thread directly
> from raid10 and raid5. And let md_check_recovery() to register
> sync_thread.
>
> Fixes: f67055780caa ("[PATCH] md: Checkpoint and allow restart of raid5 reshape")
> Fixes: f52f5c71f3d4 ("md: fix stopping sync thread")
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>   drivers/md/md.c     |  5 ++++-
>   drivers/md/raid10.c | 16 ++--------------
>   drivers/md/raid5.c  | 29 ++---------------------------
>   3 files changed, 8 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index c65dfd156090..6c5d0a372927 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -9372,6 +9372,7 @@ static void md_start_sync(struct work_struct *ws)
>   	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
>   	int spares = 0;
>   	bool suspend = false;
> +	char *name;
>   
>   	if (md_spares_need_change(mddev))
>   		suspend = true;
> @@ -9404,8 +9405,10 @@ static void md_start_sync(struct work_struct *ws)
>   	if (spares)
>   		md_bitmap_write_all(mddev->bitmap);
>   
> +	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
> +			"reshape" : "resync";
>   	rcu_assign_pointer(mddev->sync_thread,
> -			   md_register_thread(md_do_sync, mddev, "resync"));
> +			   md_register_thread(md_do_sync, mddev, name));
>   	if (!mddev->sync_thread) {
>   		pr_warn("%s: could not start resync thread...\n",
>   			mdname(mddev));
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 7412066ea22c..a5f8419e2df1 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -4175,11 +4175,7 @@ static int raid10_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -		rcu_assign_pointer(mddev->sync_thread,
> -			md_register_thread(md_do_sync, mddev, "reshape"));
> -		if (!mddev->sync_thread)
> -			goto out_free_conf;
> +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>   	}
>   
>   	return 0;
> @@ -4573,16 +4569,8 @@ static int raid10_start_reshape(struct mddev *mddev)
>   	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -
> -	rcu_assign_pointer(mddev->sync_thread,
> -			   md_register_thread(md_do_sync, mddev, "reshape"));
> -	if (!mddev->sync_thread) {
> -		ret = -EAGAIN;
> -		goto abort;
> -	}
> +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>   	conf->reshape_checkpoint = jiffies;
> -	md_wakeup_thread(mddev->sync_thread);
>   	md_new_event();
>   	return 0;
>   
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 8497880135ee..6a7a32f7fb91 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7936,11 +7936,7 @@ static int raid5_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -		rcu_assign_pointer(mddev->sync_thread,
> -			md_register_thread(md_do_sync, mddev, "reshape"));
> -		if (!mddev->sync_thread)
> -			goto abort;
> +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>   	}
>   
>   	/* Ok, everything is just fine now */
> @@ -8506,29 +8502,8 @@ static int raid5_start_reshape(struct mddev *mddev)
>   	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> -	rcu_assign_pointer(mddev->sync_thread,
> -			   md_register_thread(md_do_sync, mddev, "reshape"));
> -	if (!mddev->sync_thread) {
> -		mddev->recovery = 0;
> -		spin_lock_irq(&conf->device_lock);
> -		write_seqcount_begin(&conf->gen_lock);
> -		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
> -		mddev->new_chunk_sectors =
> -			conf->chunk_sectors = conf->prev_chunk_sectors;
> -		mddev->new_layout = conf->algorithm = conf->prev_algo;
> -		rdev_for_each(rdev, mddev)
> -			rdev->new_data_offset = rdev->data_offset;
> -		smp_wmb();
> -		conf->generation --;
> -		conf->reshape_progress = MaxSector;
> -		mddev->reshape_position = MaxSector;
> -		write_seqcount_end(&conf->gen_lock);
> -		spin_unlock_irq(&conf->device_lock);
> -		return -EAGAIN;
> -	}
> +	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>   	conf->reshape_checkpoint = jiffies;
> -	md_wakeup_thread(mddev->sync_thread);
>   	md_new_event();
>   	return 0;
>   }
  
Yu Kuai Feb. 28, 2024, 12:44 p.m. UTC | #2
Hi,

在 2024/02/28 20:07, Xiao Ni 写道:
> I have a question here. Is it the reason sync_thread can't run 
> md_do_sync because kthread_should_stop, so it doesn't have the chance to 
> set MD_RECOVERY_DONE? Why creating sync thread in md_check_recovery 
> doesn't have this problem? Could you explain more about this?

raid10_run() only register sync_thread, without calling
md_wakeup_thread() to set the bit 'THREAD_WAKEUP', md_do_sync() will not
be executed.

raid5 defines 'pers->start' hence md_start() will call
md_wakeup_thread().

md_start_sync() will always call md_wakeup_thread() hence there is no
such problem.

BTW, this patch fix the same problem as you mentioned in your other
thread:

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2266358d8074..54790261254d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4904,6 +4904,7 @@ static void stop_sync_thread(struct mddev *mddev, 
bool locked, bool check_seq)
  	 * never happen
  	 */
  	md_wakeup_thread_directly(mddev->sync_thread);
+	md_wakeup_thread(mddev->sync_thread);
  	if (work_pending(&mddev->sync_work))
  		flush_work(&mddev->sync_work);

However, I think the one to register sync_thread is responsible to wake
it up.

Thanks,
Kuai
  
Xiao Ni Feb. 28, 2024, 12:57 p.m. UTC | #3
On Wed, Feb 28, 2024 at 8:44 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2024/02/28 20:07, Xiao Ni 写道:
> > I have a question here. Is it the reason sync_thread can't run
> > md_do_sync because kthread_should_stop, so it doesn't have the chance to
> > set MD_RECOVERY_DONE? Why creating sync thread in md_check_recovery
> > doesn't have this problem? Could you explain more about this?
>
> raid10_run() only register sync_thread, without calling
> md_wakeup_thread() to set the bit 'THREAD_WAKEUP', md_do_sync() will not
> be executed.

I c. The user is responsible to wake up the thread. If raid10 wakes up
the thread in the right way, we don't need to move register reshape
thread to md_check_recovery, right?

>
> raid5 defines 'pers->start' hence md_start() will call
> md_wakeup_thread().
>
> md_start_sync() will always call md_wakeup_thread() hence there is no
> such problem.
>
> BTW, this patch fix the same problem as you mentioned in your other
> thread:
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2266358d8074..54790261254d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -4904,6 +4904,7 @@ static void stop_sync_thread(struct mddev *mddev,
> bool locked, bool check_seq)
>          * never happen
>          */
>         md_wakeup_thread_directly(mddev->sync_thread);
> +       md_wakeup_thread(mddev->sync_thread);
>         if (work_pending(&mddev->sync_work))
>                 flush_work(&mddev->sync_work);

The first patch of my patch set has this already. Maybe it's the
reason that my patch01 can fix this similar problem.

>
> However, I think the one to register sync_thread is responsible to wake
> it up.

Agree, the user that registers thread should wake it up. So start/stop
sync thread apis are common. And they can be called by many users.

Best Regards
Xiao
>
> Thanks,
> Kuai
>
  

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c65dfd156090..6c5d0a372927 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9372,6 +9372,7 @@  static void md_start_sync(struct work_struct *ws)
 	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
 	int spares = 0;
 	bool suspend = false;
+	char *name;
 
 	if (md_spares_need_change(mddev))
 		suspend = true;
@@ -9404,8 +9405,10 @@  static void md_start_sync(struct work_struct *ws)
 	if (spares)
 		md_bitmap_write_all(mddev->bitmap);
 
+	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
+			"reshape" : "resync";
 	rcu_assign_pointer(mddev->sync_thread,
-			   md_register_thread(md_do_sync, mddev, "resync"));
+			   md_register_thread(md_do_sync, mddev, name));
 	if (!mddev->sync_thread) {
 		pr_warn("%s: could not start resync thread...\n",
 			mdname(mddev));
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7412066ea22c..a5f8419e2df1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4175,11 +4175,7 @@  static int raid10_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		rcu_assign_pointer(mddev->sync_thread,
-			md_register_thread(md_do_sync, mddev, "reshape"));
-		if (!mddev->sync_thread)
-			goto out_free_conf;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
 
 	return 0;
@@ -4573,16 +4569,8 @@  static int raid10_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-
-	rcu_assign_pointer(mddev->sync_thread,
-			   md_register_thread(md_do_sync, mddev, "reshape"));
-	if (!mddev->sync_thread) {
-		ret = -EAGAIN;
-		goto abort;
-	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	conf->reshape_checkpoint = jiffies;
-	md_wakeup_thread(mddev->sync_thread);
 	md_new_event();
 	return 0;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8497880135ee..6a7a32f7fb91 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7936,11 +7936,7 @@  static int raid5_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-		rcu_assign_pointer(mddev->sync_thread,
-			md_register_thread(md_do_sync, mddev, "reshape"));
-		if (!mddev->sync_thread)
-			goto abort;
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
 
 	/* Ok, everything is just fine now */
@@ -8506,29 +8502,8 @@  static int raid5_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-	rcu_assign_pointer(mddev->sync_thread,
-			   md_register_thread(md_do_sync, mddev, "reshape"));
-	if (!mddev->sync_thread) {
-		mddev->recovery = 0;
-		spin_lock_irq(&conf->device_lock);
-		write_seqcount_begin(&conf->gen_lock);
-		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-		mddev->new_chunk_sectors =
-			conf->chunk_sectors = conf->prev_chunk_sectors;
-		mddev->new_layout = conf->algorithm = conf->prev_algo;
-		rdev_for_each(rdev, mddev)
-			rdev->new_data_offset = rdev->data_offset;
-		smp_wmb();
-		conf->generation --;
-		conf->reshape_progress = MaxSector;
-		mddev->reshape_position = MaxSector;
-		write_seqcount_end(&conf->gen_lock);
-		spin_unlock_irq(&conf->device_lock);
-		return -EAGAIN;
-	}
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	conf->reshape_checkpoint = jiffies;
-	md_wakeup_thread(mddev->sync_thread);
 	md_new_event();
 	return 0;
 }