[-next] md: synchronize flush io with array reconfiguration

Message ID 20231108180210.3657203-1-yukuai1@huaweicloud.com
State New
Headers
Series [-next] md: synchronize flush io with array reconfiguration |

Commit Message

Yu Kuai Nov. 8, 2023, 6:02 p.m. UTC
  From: Yu Kuai <yukuai3@huawei.com>

Currently RCU is used to protect iterating rdev from submit_flushes(),
but that does not prevent the following race with array reconfiguration:

submit_flushes			remove_and_add_spares
				 synchronize_rcu
				 pers->hot_remove_disk()
 rcu_read_lock()
 rdev_for_each_rcu
  if (rdev->raid_disk >= 0)
				 rdev->raid_disk = -1;
   atomic_inc(&rdev->nr_pending)
   rcu_read_unlock()
   bi = bio_alloc_bioset()
   bi->bi_end_io = md_end_flush
   bi->private = rdev
   submit_bio
   // issue io for removed rdev

Fix this problem by grabbing 'active_io' before iterating rdev, to make sure
that remove_and_add_spares() won't run concurrently with submit_flushes().

Fixes: a2826aa92e2e ("md: support barrier requests on all personalities.")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/md.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)
  

Comments

Song Liu Nov. 24, 2023, 5:36 p.m. UTC | #1
On Wed, Nov 8, 2023 at 2:07 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently rcu is used to protect iterating rdev from submit_flushes():
>
> submit_flushes                  remove_and_add_spares
>                                  synchronize_rcu
>                                  pers->hot_remove_disk()
>  rcu_read_lock()
>  rdev_for_each_rcu
>   if (rdev->raid_disk >= 0)
>                                  rdev->radi_disk = -1;
>    atomic_inc(&rdev->nr_pending)
>    rcu_read_unlock()
>    bi = bio_alloc_bioset()
>    bi->bi_end_io = md_end_flush
>    bi->private = rdev
>    submit_bio
>    // issue io for removed rdev
>
> Fix this problem by grabbing 'acive_io' before iterating rdev, make sure
> that remove_and_add_spares() won't concurrent with submit_flushes().
>
> Fixes: a2826aa92e2e ("md: support barrier requests on all personalities.")
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

LGTM.

> ---
>  drivers/md/md.c | 21 +++++++++++++++------
>  1 file changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4ee4593c874a..eb3e455bcbae 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -529,6 +529,9 @@ static void md_end_flush(struct bio *bio)
>         rdev_dec_pending(rdev, mddev);
>
>         if (atomic_dec_and_test(&mddev->flush_pending)) {
> +               /* The pair is percpu_ref_tryget() from md_flush_request() */
> +               percpu_ref_put(&mddev->active_io);
> +
>                 /* The pre-request flush has finished */
>                 queue_work(md_wq, &mddev->flush_work);
>         }
> @@ -548,12 +551,8 @@ static void submit_flushes(struct work_struct *ws)
>         rdev_for_each_rcu(rdev, mddev)
>                 if (rdev->raid_disk >= 0 &&
>                     !test_bit(Faulty, &rdev->flags)) {
> -                       /* Take two references, one is dropped
> -                        * when request finishes, one after
> -                        * we reclaim rcu_read_lock
> -                        */
>                         struct bio *bi;
> -                       atomic_inc(&rdev->nr_pending);
> +
>                         atomic_inc(&rdev->nr_pending);
>                         rcu_read_unlock();
>                         bi = bio_alloc_bioset(rdev->bdev, 0,
> @@ -564,7 +563,6 @@ static void submit_flushes(struct work_struct *ws)
>                         atomic_inc(&mddev->flush_pending);
>                         submit_bio(bi);
>                         rcu_read_lock();
> -                       rdev_dec_pending(rdev, mddev);
>                 }
>         rcu_read_unlock();
>         if (atomic_dec_and_test(&mddev->flush_pending))
> @@ -617,6 +615,17 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
>         /* new request after previous flush is completed */
>         if (ktime_after(req_start, mddev->prev_flush_start)) {
>                 WARN_ON(mddev->flush_bio);
> +               /*
> +                * Grab a reference to make sure mddev_suspend() will wait for
> +                * this flush to be done.
> +                *
> +                * md_flush_reqeust() is called under md_handle_request() and
> +                * 'active_io' is already grabbed, hence percpu_ref_tryget()
> +                * won't fail, percpu_ref_tryget_live() can't be used because
> +                * percpu_ref_kill() can be called by mddev_suspend()
> +                * concurrently.
> +                */
> +               percpu_ref_tryget(&mddev->active_io);

Probably add a WARN_ON() here to catch any issues in the future.

Thanks,
Song

>                 mddev->flush_bio = bio;
>                 bio = NULL;
>         }
> --
> 2.39.2
>
  
Yu Kuai Nov. 25, 2023, 6:48 a.m. UTC | #2
Hi,

在 2023/11/25 1:36, Song Liu 写道:
> On Wed, Nov 8, 2023 at 2:07 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Currently rcu is used to protect iterating rdev from submit_flushes():
>>
>> submit_flushes                  remove_and_add_spares
>>                                   synchronize_rcu
>>                                   pers->hot_remove_disk()
>>   rcu_read_lock()
>>   rdev_for_each_rcu
>>    if (rdev->raid_disk >= 0)
>>                                   rdev->radi_disk = -1;
>>     atomic_inc(&rdev->nr_pending)
>>     rcu_read_unlock()
>>     bi = bio_alloc_bioset()
>>     bi->bi_end_io = md_end_flush
>>     bi->private = rdev
>>     submit_bio
>>     // issue io for removed rdev
>>
>> Fix this problem by grabbing 'acive_io' before iterating rdev, make sure
>> that remove_and_add_spares() won't concurrent with submit_flushes().
>>
>> Fixes: a2826aa92e2e ("md: support barrier requests on all personalities.")
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> 
> LGTM.
> 
>> ---
>>   drivers/md/md.c | 21 +++++++++++++++------
>>   1 file changed, 15 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 4ee4593c874a..eb3e455bcbae 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -529,6 +529,9 @@ static void md_end_flush(struct bio *bio)
>>          rdev_dec_pending(rdev, mddev);
>>
>>          if (atomic_dec_and_test(&mddev->flush_pending)) {
>> +               /* The pair is percpu_ref_tryget() from md_flush_request() */
>> +               percpu_ref_put(&mddev->active_io);
>> +
>>                  /* The pre-request flush has finished */
>>                  queue_work(md_wq, &mddev->flush_work);
>>          }
>> @@ -548,12 +551,8 @@ static void submit_flushes(struct work_struct *ws)
>>          rdev_for_each_rcu(rdev, mddev)
>>                  if (rdev->raid_disk >= 0 &&
>>                      !test_bit(Faulty, &rdev->flags)) {
>> -                       /* Take two references, one is dropped
>> -                        * when request finishes, one after
>> -                        * we reclaim rcu_read_lock
>> -                        */
>>                          struct bio *bi;
>> -                       atomic_inc(&rdev->nr_pending);
>> +
>>                          atomic_inc(&rdev->nr_pending);
>>                          rcu_read_unlock();
>>                          bi = bio_alloc_bioset(rdev->bdev, 0,
>> @@ -564,7 +563,6 @@ static void submit_flushes(struct work_struct *ws)
>>                          atomic_inc(&mddev->flush_pending);
>>                          submit_bio(bi);
>>                          rcu_read_lock();
>> -                       rdev_dec_pending(rdev, mddev);
>>                  }
>>          rcu_read_unlock();
>>          if (atomic_dec_and_test(&mddev->flush_pending))
>> @@ -617,6 +615,17 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
>>          /* new request after previous flush is completed */
>>          if (ktime_after(req_start, mddev->prev_flush_start)) {
>>                  WARN_ON(mddev->flush_bio);
>> +               /*
>> +                * Grab a reference to make sure mddev_suspend() will wait for
>> +                * this flush to be done.
>> +                *
>> +                * md_flush_reqeust() is called under md_handle_request() and
>> +                * 'active_io' is already grabbed, hence percpu_ref_tryget()
>> +                * won't fail, percpu_ref_tryget_live() can't be used because
>> +                * percpu_ref_kill() can be called by mddev_suspend()
>> +                * concurrently.
>> +                */
>> +               percpu_ref_tryget(&mddev->active_io);
> 
> Probably add an warn_on here to catch any issues in the future.

Will do this in v2.

Thanks,
Kuai

> 
> Thanks,
> Song
> 
>>                  mddev->flush_bio = bio;
>>                  bio = NULL;
>>          }
>> --
>> 2.39.2
>>
> .
>
  

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4ee4593c874a..eb3e455bcbae 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -529,6 +529,9 @@  static void md_end_flush(struct bio *bio)
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		/* The pair is percpu_ref_tryget() from md_flush_request() */
+		percpu_ref_put(&mddev->active_io);
+
 		/* The pre-request flush has finished */
 		queue_work(md_wq, &mddev->flush_work);
 	}
@@ -548,12 +551,8 @@  static void submit_flushes(struct work_struct *ws)
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
-			/* Take two references, one is dropped
-			 * when request finishes, one after
-			 * we reclaim rcu_read_lock
-			 */
 			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
+
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc_bioset(rdev->bdev, 0,
@@ -564,7 +563,6 @@  static void submit_flushes(struct work_struct *ws)
 			atomic_inc(&mddev->flush_pending);
 			submit_bio(bi);
 			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 	if (atomic_dec_and_test(&mddev->flush_pending))
@@ -617,6 +615,17 @@  bool md_flush_request(struct mddev *mddev, struct bio *bio)
 	/* new request after previous flush is completed */
 	if (ktime_after(req_start, mddev->prev_flush_start)) {
 		WARN_ON(mddev->flush_bio);
+		/*
+		 * Grab a reference to make sure mddev_suspend() will wait for
+		 * this flush to be done.
+		 *
+		 * md_flush_request() is called under md_handle_request() and
+		 * 'active_io' is already grabbed, hence percpu_ref_tryget()
+		 * won't fail, percpu_ref_tryget_live() can't be used because
+		 * percpu_ref_kill() can be called by mddev_suspend()
+		 * concurrently.
+		 */
+		percpu_ref_tryget(&mddev->active_io);
 		mddev->flush_bio = bio;
 		bio = NULL;
 	}