diff mbox series

[-next,v2,7/7] md/raid1-10: limit the number of plugged bio

Message ID	20230426082031.1299149-8-yukuai1@huaweicloud.com
State	New
Headers	Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20; From: Yu Kuai <yukuai1@huaweicloud.com> To: song@kernel.org, akpm@osdl.org, neilb@suse.de Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, yukuai3@huawei.com, yukuai1@huaweicloud.com, yi.zhang@huawei.com, yangerkun@huawei.com Subject: [PATCH -next v2 7/7] md/raid1-10: limit the number of plugged bio Date: Wed, 26 Apr 2023 16:20:31 +0800 Message-Id: <20230426082031.1299149-8-yukuai1@huaweicloud.com> In-Reply-To: <20230426082031.1299149-1-yukuai1@huaweicloud.com> References: <20230426082031.1299149-1-yukuai1@huaweicloud.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	limit the number of plugged bio \| [-next,v2,0/7] limit the number of plugged bio [-next,v2,1/7] md/raid10: prevent soft lockup while flush writes [-next,v2,2/7] md/raid1-10: factor out a helper to add bio to plug [-next,v2,3/7] md/raid1-10: factor out a helper to submit normal write [-next,v2,4/7] md/raid1-10: submit write io directly if bitmap is not enabled [-next,v2,5/7] md/md-bitmap: add a new helper to unplug bitmap asynchrously [-next,v2,6/7] md/raid1-10: don't handle pluged bio by daemon thread [-next,v2,7/7] md/raid1-10: limit the number of plugged bio

Commit Message

Yu Kuai April 26, 2023, 8:20 a.m. UTC

  From: Yu Kuai <yukuai3@huawei.com>

bio can be added to plug infinitely, and following writeback test can
trigger huge amount of plugged bio:

Test script:
modprobe brd rd_nr=4 rd_size=10485760
mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
echo 0 > /proc/sys/vm/dirty_background_ratio
echo 60 > /proc/sys/vm/dirty_ratio
fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test

Test result:
Monitor /sys/block/md0/inflight will found that inflight keep increasing
until fio finish writing, after running for about 2 minutes:

[root@fedora ~]# cat /sys/block/md0/inflight
       0  4474191

Fix the problem by limiting the number of plugged bio based on the number
of copies for original bio.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid1-10.c | 9 ++++++++-
 drivers/md/raid1.c    | 2 +-
 drivers/md/raid10.c   | 2 +-
 3 files changed, 10 insertions(+), 3 deletions(-)

Comments

Xiao Ni May 29, 2023, 2:08 a.m. UTC | #1

Hi Kuai

There is a limitation of the memory in your test. But for most
situations, customers should not set this. Can this change introduce a
performance regression against other situations?

Best Regards
Xiao

On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> From: Yu Kuai <yukuai3@huawei.com>
>
> bio can be added to plug infinitely, and following writeback test can
> trigger huge amount of plugged bio:
>
> Test script:
> modprobe brd rd_nr=4 rd_size=10485760
> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> echo 0 > /proc/sys/vm/dirty_background_ratio
> echo 60 > /proc/sys/vm/dirty_ratio
> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
>
> Test result:
> Monitor /sys/block/md0/inflight will found that inflight keep increasing
> until fio finish writing, after running for about 2 minutes:
>
> [root@fedora ~]# cat /sys/block/md0/inflight
>        0  4474191
>
> Fix the problem by limiting the number of plugged bio based on the number
> of copies for original bio.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  drivers/md/raid1-10.c | 9 ++++++++-
>  drivers/md/raid1.c    | 2 +-
>  drivers/md/raid10.c   | 2 +-
>  3 files changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> index 98d678b7df3f..35fb80aa37aa 100644
> --- a/drivers/md/raid1-10.c
> +++ b/drivers/md/raid1-10.c
> @@ -21,6 +21,7 @@
>  #define IO_MADE_GOOD ((struct bio *)2)
>
>  #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
> +#define MAX_PLUG_BIO 32
>
>  /* for managing resync I/O pages */
>  struct resync_pages {
> @@ -31,6 +32,7 @@ struct resync_pages {
>  struct raid1_plug_cb {
>         struct blk_plug_cb      cb;
>         struct bio_list         pending;
> +       unsigned int            count;
>  };
>
>  static void rbio_pool_free(void *rbio, void *data)
> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
>  }
>
>  static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> -                                     blk_plug_cb_fn unplug)
> +                                     blk_plug_cb_fn unplug, int copies)
>  {
>         struct raid1_plug_cb *plug = NULL;
>         struct blk_plug_cb *cb;
> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>
>         plug = container_of(cb, struct raid1_plug_cb, cb);
>         bio_list_add(&plug->pending, bio);
> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
> +               list_del(&cb->list);
> +               cb->callback(cb, false);
> +       }
> +
>
>         return true;
>  }
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 639e09cecf01..c6066408a913 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>                                               r1_bio->sector);
>                 /* flush_pending_writes() needs access to the rdev so...*/
>                 mbio->bi_bdev = (void *)rdev;
> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
>                         spin_lock_irqsave(&conf->device_lock, flags);
>                         bio_list_add(&conf->pending_bio_list, mbio);
>                         spin_unlock_irqrestore(&conf->device_lock, flags);
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index bd9e655ca408..7135cfaf75db 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>
>         atomic_inc(&r10_bio->remaining);
>
> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
>                 spin_lock_irqsave(&conf->device_lock, flags);
>                 bio_list_add(&conf->pending_bio_list, mbio);
>                 spin_unlock_irqrestore(&conf->device_lock, flags);
> --
> 2.39.2
>

Yu Kuai May 29, 2023, 2:19 a.m. UTC | #2

Hi,

在 2023/05/29 10:08, Xiao Ni 写道:
> Hi Kuai
> 
> There is a limitation of the memory in your test. But for most
> situations, customers should not set this. Can this change introduce a
> performance regression against other situations?

Noted that this limitation is just to triggered writeback as soon as
possible in the test, and it's 100% sure real situations can trigger
dirty pages write back asynchronously and continue to produce new dirty
pages.

If a lot of bio is not plugged, then it's the same as before; if a lot
of bio is plugged, noted that before this patchset, these bio will spent
quite a long time in plug, and hence I think performance should be
better.

Thanks,
Kuai
> 
> Best Regards
> Xiao
> 
> On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> bio can be added to plug infinitely, and following writeback test can
>> trigger huge amount of plugged bio:
>>
>> Test script:
>> modprobe brd rd_nr=4 rd_size=10485760
>> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
>> echo 0 > /proc/sys/vm/dirty_background_ratio
>> echo 60 > /proc/sys/vm/dirty_ratio
>> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
>>
>> Test result:
>> Monitor /sys/block/md0/inflight will found that inflight keep increasing
>> until fio finish writing, after running for about 2 minutes:
>>
>> [root@fedora ~]# cat /sys/block/md0/inflight
>>         0  4474191
>>
>> Fix the problem by limiting the number of plugged bio based on the number
>> of copies for original bio.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> ---
>>   drivers/md/raid1-10.c | 9 ++++++++-
>>   drivers/md/raid1.c    | 2 +-
>>   drivers/md/raid10.c   | 2 +-
>>   3 files changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
>> index 98d678b7df3f..35fb80aa37aa 100644
>> --- a/drivers/md/raid1-10.c
>> +++ b/drivers/md/raid1-10.c
>> @@ -21,6 +21,7 @@
>>   #define IO_MADE_GOOD ((struct bio *)2)
>>
>>   #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
>> +#define MAX_PLUG_BIO 32
>>
>>   /* for managing resync I/O pages */
>>   struct resync_pages {
>> @@ -31,6 +32,7 @@ struct resync_pages {
>>   struct raid1_plug_cb {
>>          struct blk_plug_cb      cb;
>>          struct bio_list         pending;
>> +       unsigned int            count;
>>   };
>>
>>   static void rbio_pool_free(void *rbio, void *data)
>> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
>>   }
>>
>>   static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>> -                                     blk_plug_cb_fn unplug)
>> +                                     blk_plug_cb_fn unplug, int copies)
>>   {
>>          struct raid1_plug_cb *plug = NULL;
>>          struct blk_plug_cb *cb;
>> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>>
>>          plug = container_of(cb, struct raid1_plug_cb, cb);
>>          bio_list_add(&plug->pending, bio);
>> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
>> +               list_del(&cb->list);
>> +               cb->callback(cb, false);
>> +       }
>> +
>>
>>          return true;
>>   }
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 639e09cecf01..c6066408a913 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>                                                r1_bio->sector);
>>                  /* flush_pending_writes() needs access to the rdev so...*/
>>                  mbio->bi_bdev = (void *)rdev;
>> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
>> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
>>                          spin_lock_irqsave(&conf->device_lock, flags);
>>                          bio_list_add(&conf->pending_bio_list, mbio);
>>                          spin_unlock_irqrestore(&conf->device_lock, flags);
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index bd9e655ca408..7135cfaf75db 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>>
>>          atomic_inc(&r10_bio->remaining);
>>
>> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
>> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
>>                  spin_lock_irqsave(&conf->device_lock, flags);
>>                  bio_list_add(&conf->pending_bio_list, mbio);
>>                  spin_unlock_irqrestore(&conf->device_lock, flags);
>> --
>> 2.39.2
>>
> 
> .
>

Xiao Ni May 29, 2023, 3:10 a.m. UTC | #3

On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/05/29 10:08, Xiao Ni 写道:
> > Hi Kuai
> >
> > There is a limitation of the memory in your test. But for most
> > situations, customers should not set this. Can this change introduce a
> > performance regression against other situations?
>
> Noted that this limitation is just to triggered writeback as soon as
> possible in the test, and it's 100% sure real situations can trigger
> dirty pages write back asynchronously and continue to produce new dirty
> pages.

Hi

I'm confused here. If we want to trigger write back quickly, it needs
to set these two values with a smaller number, rather than 0 and 60.
Right?
>
> If a lot of bio is not plugged, then it's the same as before; if a lot
> of bio is plugged, noted that before this patchset, these bio will spent
> quite a long time in plug, and hence I think performance should be
> better.

Hmm, it depends on if it's sequential or not? If it's a big io
request, can it miss the merge opportunity?

Regards
Xiao

>
> Thanks,
> Kuai
> >
> > Best Regards
> > Xiao
> >
> > On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>
> >> From: Yu Kuai <yukuai3@huawei.com>
> >>
> >> bio can be added to plug infinitely, and following writeback test can
> >> trigger huge amount of plugged bio:
> >>
> >> Test script:
> >> modprobe brd rd_nr=4 rd_size=10485760
> >> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> >> echo 0 > /proc/sys/vm/dirty_background_ratio
> >> echo 60 > /proc/sys/vm/dirty_ratio
> >> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
> >>
> >> Test result:
> >> Monitor /sys/block/md0/inflight will found that inflight keep increasing
> >> until fio finish writing, after running for about 2 minutes:
> >>
> >> [root@fedora ~]# cat /sys/block/md0/inflight
> >>         0  4474191
> >>
> >> Fix the problem by limiting the number of plugged bio based on the number
> >> of copies for original bio.
> >>
> >> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> >> ---
> >>   drivers/md/raid1-10.c | 9 ++++++++-
> >>   drivers/md/raid1.c    | 2 +-
> >>   drivers/md/raid10.c   | 2 +-
> >>   3 files changed, 10 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> >> index 98d678b7df3f..35fb80aa37aa 100644
> >> --- a/drivers/md/raid1-10.c
> >> +++ b/drivers/md/raid1-10.c
> >> @@ -21,6 +21,7 @@
> >>   #define IO_MADE_GOOD ((struct bio *)2)
> >>
> >>   #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
> >> +#define MAX_PLUG_BIO 32
> >>
> >>   /* for managing resync I/O pages */
> >>   struct resync_pages {
> >> @@ -31,6 +32,7 @@ struct resync_pages {
> >>   struct raid1_plug_cb {
> >>          struct blk_plug_cb      cb;
> >>          struct bio_list         pending;
> >> +       unsigned int            count;
> >>   };
> >>
> >>   static void rbio_pool_free(void *rbio, void *data)
> >> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
> >>   }
> >>
> >>   static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >> -                                     blk_plug_cb_fn unplug)
> >> +                                     blk_plug_cb_fn unplug, int copies)
> >>   {
> >>          struct raid1_plug_cb *plug = NULL;
> >>          struct blk_plug_cb *cb;
> >> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >>
> >>          plug = container_of(cb, struct raid1_plug_cb, cb);
> >>          bio_list_add(&plug->pending, bio);
> >> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
> >> +               list_del(&cb->list);
> >> +               cb->callback(cb, false);
> >> +       }
> >> +
> >>
> >>          return true;
> >>   }
> >> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> >> index 639e09cecf01..c6066408a913 100644
> >> --- a/drivers/md/raid1.c
> >> +++ b/drivers/md/raid1.c
> >> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> >>                                                r1_bio->sector);
> >>                  /* flush_pending_writes() needs access to the rdev so...*/
> >>                  mbio->bi_bdev = (void *)rdev;
> >> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
> >> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
> >>                          spin_lock_irqsave(&conf->device_lock, flags);
> >>                          bio_list_add(&conf->pending_bio_list, mbio);
> >>                          spin_unlock_irqrestore(&conf->device_lock, flags);
> >> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> >> index bd9e655ca408..7135cfaf75db 100644
> >> --- a/drivers/md/raid10.c
> >> +++ b/drivers/md/raid10.c
> >> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
> >>
> >>          atomic_inc(&r10_bio->remaining);
> >>
> >> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
> >> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
> >>                  spin_lock_irqsave(&conf->device_lock, flags);
> >>                  bio_list_add(&conf->pending_bio_list, mbio);
> >>                  spin_unlock_irqrestore(&conf->device_lock, flags);
> >> --
> >> 2.39.2
> >>
> >
> > .
> >
>

Yu Kuai May 29, 2023, 3:18 a.m. UTC | #4

Hi,

在 2023/05/29 11:10, Xiao Ni 写道:
> On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2023/05/29 10:08, Xiao Ni 写道:
>>> Hi Kuai
>>>
>>> There is a limitation of the memory in your test. But for most
>>> situations, customers should not set this. Can this change introduce a
>>> performance regression against other situations?
>>
>> Noted that this limitation is just to triggered writeback as soon as
>> possible in the test, and it's 100% sure real situations can trigger
>> dirty pages write back asynchronously and continue to produce new dirty
>> pages.
> 
> Hi
> 
> I'm confused here. If we want to trigger write back quickly, it needs
> to set these two values with a smaller number, rather than 0 and 60.
> Right?

60 is not required, I'll remove this setting.

0 just means write back if there are any dirty pages.
>>
>> If a lot of bio is not plugged, then it's the same as before; if a lot
>> of bio is plugged, noted that before this patchset, these bio will spent
>> quite a long time in plug, and hence I think performance should be
>> better.
> 
> Hmm, it depends on if it's sequential or not? If it's a big io
> request, can it miss the merge opportunity?

The bio will still be merged to underlying disks' rq(if it's rq based),
underlying disk won't flush plug untill the number of request exceed
threshold.

Thanks,
Kuai
> 
> Regards
> Xiao
> 
>>
>> Thanks,
>> Kuai
>>>
>>> Best Regards
>>> Xiao
>>>
>>> On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>
>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> bio can be added to plug infinitely, and following writeback test can
>>>> trigger huge amount of plugged bio:
>>>>
>>>> Test script:
>>>> modprobe brd rd_nr=4 rd_size=10485760
>>>> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
>>>> echo 0 > /proc/sys/vm/dirty_background_ratio
>>>> echo 60 > /proc/sys/vm/dirty_ratio
>>>> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
>>>>
>>>> Test result:
>>>> Monitor /sys/block/md0/inflight will found that inflight keep increasing
>>>> until fio finish writing, after running for about 2 minutes:
>>>>
>>>> [root@fedora ~]# cat /sys/block/md0/inflight
>>>>          0  4474191
>>>>
>>>> Fix the problem by limiting the number of plugged bio based on the number
>>>> of copies for original bio.
>>>>
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>> ---
>>>>    drivers/md/raid1-10.c | 9 ++++++++-
>>>>    drivers/md/raid1.c    | 2 +-
>>>>    drivers/md/raid10.c   | 2 +-
>>>>    3 files changed, 10 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
>>>> index 98d678b7df3f..35fb80aa37aa 100644
>>>> --- a/drivers/md/raid1-10.c
>>>> +++ b/drivers/md/raid1-10.c
>>>> @@ -21,6 +21,7 @@
>>>>    #define IO_MADE_GOOD ((struct bio *)2)
>>>>
>>>>    #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
>>>> +#define MAX_PLUG_BIO 32
>>>>
>>>>    /* for managing resync I/O pages */
>>>>    struct resync_pages {
>>>> @@ -31,6 +32,7 @@ struct resync_pages {
>>>>    struct raid1_plug_cb {
>>>>           struct blk_plug_cb      cb;
>>>>           struct bio_list         pending;
>>>> +       unsigned int            count;
>>>>    };
>>>>
>>>>    static void rbio_pool_free(void *rbio, void *data)
>>>> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
>>>>    }
>>>>
>>>>    static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>>>> -                                     blk_plug_cb_fn unplug)
>>>> +                                     blk_plug_cb_fn unplug, int copies)
>>>>    {
>>>>           struct raid1_plug_cb *plug = NULL;
>>>>           struct blk_plug_cb *cb;
>>>> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>>>>
>>>>           plug = container_of(cb, struct raid1_plug_cb, cb);
>>>>           bio_list_add(&plug->pending, bio);
>>>> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
>>>> +               list_del(&cb->list);
>>>> +               cb->callback(cb, false);
>>>> +       }
>>>> +
>>>>
>>>>           return true;
>>>>    }
>>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>>> index 639e09cecf01..c6066408a913 100644
>>>> --- a/drivers/md/raid1.c
>>>> +++ b/drivers/md/raid1.c
>>>> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>>>                                                 r1_bio->sector);
>>>>                   /* flush_pending_writes() needs access to the rdev so...*/
>>>>                   mbio->bi_bdev = (void *)rdev;
>>>> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
>>>> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
>>>>                           spin_lock_irqsave(&conf->device_lock, flags);
>>>>                           bio_list_add(&conf->pending_bio_list, mbio);
>>>>                           spin_unlock_irqrestore(&conf->device_lock, flags);
>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>> index bd9e655ca408..7135cfaf75db 100644
>>>> --- a/drivers/md/raid10.c
>>>> +++ b/drivers/md/raid10.c
>>>> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>>>>
>>>>           atomic_inc(&r10_bio->remaining);
>>>>
>>>> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
>>>> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
>>>>                   spin_lock_irqsave(&conf->device_lock, flags);
>>>>                   bio_list_add(&conf->pending_bio_list, mbio);
>>>>                   spin_unlock_irqrestore(&conf->device_lock, flags);
>>>> --
>>>> 2.39.2
>>>>
>>>
>>> .
>>>
>>
> 
> .
>

Xiao Ni May 29, 2023, 7:57 a.m. UTC | #5

On Mon, May 29, 2023 at 11:18 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/05/29 11:10, Xiao Ni 写道:
> > On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>
> >> Hi,
> >>
> >> 在 2023/05/29 10:08, Xiao Ni 写道:
> >>> Hi Kuai
> >>>
> >>> There is a limitation of the memory in your test. But for most
> >>> situations, customers should not set this. Can this change introduce a
> >>> performance regression against other situations?
> >>
> >> Noted that this limitation is just to triggered writeback as soon as
> >> possible in the test, and it's 100% sure real situations can trigger
> >> dirty pages write back asynchronously and continue to produce new dirty
> >> pages.
> >
> > Hi
> >
> > I'm confused here. If we want to trigger write back quickly, it needs
> > to set these two values with a smaller number, rather than 0 and 60.
> > Right?
>
> 60 is not required, I'll remove this setting.
>
> 0 just means write back if there are any dirty pages.

Hi Kuai

Does 0 mean disabling write back? I tried to find the doc that
describes the meaning when setting dirty_background_ration to 0, but I
didn't find it.
In https://www.kernel.org/doc/html/next/admin-guide/sysctl/vm.html it
doesn't describe this. But it says something like this

Note:
  dirty_background_bytes is the counterpart of dirty_background_ratio. Only
  one of them may be specified at a time. When one sysctl is written it is
  immediately taken into account to evaluate the dirty memory limits and the
  other appears as 0 when read.

Maybe you can specify dirty_background_ratio to 1 if you want to
trigger write back ASAP.

> >>
> >> If a lot of bio is not plugged, then it's the same as before; if a lot
> >> of bio is plugged, noted that before this patchset, these bio will spent
> >> quite a long time in plug, and hence I think performance should be
> >> better.
> >
> > Hmm, it depends on if it's sequential or not? If it's a big io
> > request, can it miss the merge opportunity?
>
> The bio will still be merged to underlying disks' rq(if it's rq based),
> underlying disk won't flush plug untill the number of request exceed
> threshold.

Thanks for this.

Regards
Xiao
>
> Thanks,
> Kuai
> >
> > Regards
> > Xiao
> >
> >>
> >> Thanks,
> >> Kuai
> >>>
> >>> Best Regards
> >>> Xiao
> >>>
> >>> On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>>>
> >>>> From: Yu Kuai <yukuai3@huawei.com>
> >>>>
> >>>> bio can be added to plug infinitely, and following writeback test can
> >>>> trigger huge amount of plugged bio:
> >>>>
> >>>> Test script:
> >>>> modprobe brd rd_nr=4 rd_size=10485760
> >>>> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> >>>> echo 0 > /proc/sys/vm/dirty_background_ratio
> >>>> echo 60 > /proc/sys/vm/dirty_ratio
> >>>> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
> >>>>
> >>>> Test result:
> >>>> Monitor /sys/block/md0/inflight will found that inflight keep increasing
> >>>> until fio finish writing, after running for about 2 minutes:
> >>>>
> >>>> [root@fedora ~]# cat /sys/block/md0/inflight
> >>>>          0  4474191
> >>>>
> >>>> Fix the problem by limiting the number of plugged bio based on the number
> >>>> of copies for original bio.
> >>>>
> >>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> >>>> ---
> >>>>    drivers/md/raid1-10.c | 9 ++++++++-
> >>>>    drivers/md/raid1.c    | 2 +-
> >>>>    drivers/md/raid10.c   | 2 +-
> >>>>    3 files changed, 10 insertions(+), 3 deletions(-)
> >>>>
> >>>> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> >>>> index 98d678b7df3f..35fb80aa37aa 100644
> >>>> --- a/drivers/md/raid1-10.c
> >>>> +++ b/drivers/md/raid1-10.c
> >>>> @@ -21,6 +21,7 @@
> >>>>    #define IO_MADE_GOOD ((struct bio *)2)
> >>>>
> >>>>    #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
> >>>> +#define MAX_PLUG_BIO 32
> >>>>
> >>>>    /* for managing resync I/O pages */
> >>>>    struct resync_pages {
> >>>> @@ -31,6 +32,7 @@ struct resync_pages {
> >>>>    struct raid1_plug_cb {
> >>>>           struct blk_plug_cb      cb;
> >>>>           struct bio_list         pending;
> >>>> +       unsigned int            count;
> >>>>    };
> >>>>
> >>>>    static void rbio_pool_free(void *rbio, void *data)
> >>>> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
> >>>>    }
> >>>>
> >>>>    static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >>>> -                                     blk_plug_cb_fn unplug)
> >>>> +                                     blk_plug_cb_fn unplug, int copies)
> >>>>    {
> >>>>           struct raid1_plug_cb *plug = NULL;
> >>>>           struct blk_plug_cb *cb;
> >>>> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >>>>
> >>>>           plug = container_of(cb, struct raid1_plug_cb, cb);
> >>>>           bio_list_add(&plug->pending, bio);
> >>>> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
> >>>> +               list_del(&cb->list);
> >>>> +               cb->callback(cb, false);
> >>>> +       }
> >>>> +
> >>>>
> >>>>           return true;
> >>>>    }
> >>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> >>>> index 639e09cecf01..c6066408a913 100644
> >>>> --- a/drivers/md/raid1.c
> >>>> +++ b/drivers/md/raid1.c
> >>>> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> >>>>                                                 r1_bio->sector);
> >>>>                   /* flush_pending_writes() needs access to the rdev so...*/
> >>>>                   mbio->bi_bdev = (void *)rdev;
> >>>> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
> >>>> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
> >>>>                           spin_lock_irqsave(&conf->device_lock, flags);
> >>>>                           bio_list_add(&conf->pending_bio_list, mbio);
> >>>>                           spin_unlock_irqrestore(&conf->device_lock, flags);
> >>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> >>>> index bd9e655ca408..7135cfaf75db 100644
> >>>> --- a/drivers/md/raid10.c
> >>>> +++ b/drivers/md/raid10.c
> >>>> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
> >>>>
> >>>>           atomic_inc(&r10_bio->remaining);
> >>>>
> >>>> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
> >>>> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
> >>>>                   spin_lock_irqsave(&conf->device_lock, flags);
> >>>>                   bio_list_add(&conf->pending_bio_list, mbio);
> >>>>                   spin_unlock_irqrestore(&conf->device_lock, flags);
> >>>> --
> >>>> 2.39.2
> >>>>
> >>>
> >>> .
> >>>
> >>
> >
> > .
> >
>

Yu Kuai May 29, 2023, 8:50 a.m. UTC | #6

Hi,

在 2023/05/29 15:57, Xiao Ni 写道:
> On Mon, May 29, 2023 at 11:18 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2023/05/29 11:10, Xiao Ni 写道:
>>> On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>
>>>> Hi,
>>>>
>>>> 在 2023/05/29 10:08, Xiao Ni 写道:
>>>>> Hi Kuai
>>>>>
>>>>> There is a limitation of the memory in your test. But for most
>>>>> situations, customers should not set this. Can this change introduce a
>>>>> performance regression against other situations?
>>>>
>>>> Noted that this limitation is just to triggered writeback as soon as
>>>> possible in the test, and it's 100% sure real situations can trigger
>>>> dirty pages write back asynchronously and continue to produce new dirty
>>>> pages.
>>>
>>> Hi
>>>
>>> I'm confused here. If we want to trigger write back quickly, it needs
>>> to set these two values with a smaller number, rather than 0 and 60.
>>> Right?
>>
>> 60 is not required, I'll remove this setting.
>>
>> 0 just means write back if there are any dirty pages.
> 
> Hi Kuai
> 
> Does 0 mean disabling write back? I tried to find the doc that
> describes the meaning when setting dirty_background_ratio to 0, but I
> didn't find it.
> In https://www.kernel.org/doc/html/next/admin-guide/sysctl/vm.html it
> doesn't describe this. But it says something like this
> 
> Note:
>    dirty_background_bytes is the counterpart of dirty_background_ratio. Only
>    one of them may be specified at a time. When one sysctl is written it is
>    immediately taken into account to evaluate the dirty memory limits and the
>    other appears as 0 when read.
> 
> Maybe you can specify dirty_background_ratio to 1 if you want to
> trigger write back ASAP.

The purpose here is to trigger write back ASAP, I'm not an expert here,
but based on test result, 0 obviously doesn't mean disable write back.

Set dirty_background_bytes to a value, dirty_background_ratio will be
set to 0 together, which means dirty_background_ratio is disabled.
However, change dirty_background_ratio from default value to 0, will end
up both dirty_background_ratio and dirty_background_bytes to be 0, and
based on following related code, I think 0 just means write back if
there are any dirty pages.

domain_dirty_limits:
  bg_bytes = dirty_background_bytes -> 0
  bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100 -> 0

  if (bg_bytes)
	 bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
  else
	 bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; -> 0

  dtc->bg_thresh = bg_thresh; -> 0

balance_dirty_pages
  nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
  if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
       !writeback_in_progress(wb))
    wb_start_background_writeback(wb); -> writeback ASAP

Thanks,
Kuai
> 
>>>>
>>>> If a lot of bio is not plugged, then it's the same as before; if a lot
>>>> of bio is plugged, noted that before this patchset, these bio will spent
>>>> quite a long time in plug, and hence I think performance should be
>>>> better.
>>>
>>> Hmm, it depends on if it's sequential or not? If it's a big io
>>> request, can it miss the merge opportunity?
>>
>> The bio will still be merged to underlying disks' rq(if it's rq based),
>> underlying disk won't flush plug untill the number of request exceed
>> threshold.
> 
> Thanks for this.
> 
> Regards
> Xiao
>>
>> Thanks,
>> Kuai
>>>
>>> Regards
>>> Xiao
>>>
>>>>
>>>> Thanks,
>>>> Kuai
>>>>>
>>>>> Best Regards
>>>>> Xiao
>>>>>
>>>>> On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>>>
>>>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>>>
>>>>>> bio can be added to plug infinitely, and following writeback test can
>>>>>> trigger huge amount of plugged bio:
>>>>>>
>>>>>> Test script:
>>>>>> modprobe brd rd_nr=4 rd_size=10485760
>>>>>> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
>>>>>> echo 0 > /proc/sys/vm/dirty_background_ratio
>>>>>> echo 60 > /proc/sys/vm/dirty_ratio
>>>>>> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
>>>>>>
>>>>>> Test result:
>>>>>> Monitor /sys/block/md0/inflight will found that inflight keep increasing
>>>>>> until fio finish writing, after running for about 2 minutes:
>>>>>>
>>>>>> [root@fedora ~]# cat /sys/block/md0/inflight
>>>>>>           0  4474191
>>>>>>
>>>>>> Fix the problem by limiting the number of plugged bio based on the number
>>>>>> of copies for original bio.
>>>>>>
>>>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>>>>> ---
>>>>>>     drivers/md/raid1-10.c | 9 ++++++++-
>>>>>>     drivers/md/raid1.c    | 2 +-
>>>>>>     drivers/md/raid10.c   | 2 +-
>>>>>>     3 files changed, 10 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
>>>>>> index 98d678b7df3f..35fb80aa37aa 100644
>>>>>> --- a/drivers/md/raid1-10.c
>>>>>> +++ b/drivers/md/raid1-10.c
>>>>>> @@ -21,6 +21,7 @@
>>>>>>     #define IO_MADE_GOOD ((struct bio *)2)
>>>>>>
>>>>>>     #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
>>>>>> +#define MAX_PLUG_BIO 32
>>>>>>
>>>>>>     /* for managing resync I/O pages */
>>>>>>     struct resync_pages {
>>>>>> @@ -31,6 +32,7 @@ struct resync_pages {
>>>>>>     struct raid1_plug_cb {
>>>>>>            struct blk_plug_cb      cb;
>>>>>>            struct bio_list         pending;
>>>>>> +       unsigned int            count;
>>>>>>     };
>>>>>>
>>>>>>     static void rbio_pool_free(void *rbio, void *data)
>>>>>> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
>>>>>>     }
>>>>>>
>>>>>>     static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>>>>>> -                                     blk_plug_cb_fn unplug)
>>>>>> +                                     blk_plug_cb_fn unplug, int copies)
>>>>>>     {
>>>>>>            struct raid1_plug_cb *plug = NULL;
>>>>>>            struct blk_plug_cb *cb;
>>>>>> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
>>>>>>
>>>>>>            plug = container_of(cb, struct raid1_plug_cb, cb);
>>>>>>            bio_list_add(&plug->pending, bio);
>>>>>> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
>>>>>> +               list_del(&cb->list);
>>>>>> +               cb->callback(cb, false);
>>>>>> +       }
>>>>>> +
>>>>>>
>>>>>>            return true;
>>>>>>     }
>>>>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>>>>>> index 639e09cecf01..c6066408a913 100644
>>>>>> --- a/drivers/md/raid1.c
>>>>>> +++ b/drivers/md/raid1.c
>>>>>> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>>>>>>                                                  r1_bio->sector);
>>>>>>                    /* flush_pending_writes() needs access to the rdev so...*/
>>>>>>                    mbio->bi_bdev = (void *)rdev;
>>>>>> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
>>>>>> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
>>>>>>                            spin_lock_irqsave(&conf->device_lock, flags);
>>>>>>                            bio_list_add(&conf->pending_bio_list, mbio);
>>>>>>                            spin_unlock_irqrestore(&conf->device_lock, flags);
>>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>>>>>> index bd9e655ca408..7135cfaf75db 100644
>>>>>> --- a/drivers/md/raid10.c
>>>>>> +++ b/drivers/md/raid10.c
>>>>>> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
>>>>>>
>>>>>>            atomic_inc(&r10_bio->remaining);
>>>>>>
>>>>>> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
>>>>>> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
>>>>>>                    spin_lock_irqsave(&conf->device_lock, flags);
>>>>>>                    bio_list_add(&conf->pending_bio_list, mbio);
>>>>>>                    spin_unlock_irqrestore(&conf->device_lock, flags);
>>>>>> --
>>>>>> 2.39.2
>>>>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>
>>> .
>>>
>>
> 
> .
>

Xiao Ni May 30, 2023, 12:58 a.m. UTC | #7

On Mon, May 29, 2023 at 4:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/05/29 15:57, Xiao Ni 写道:
> > On Mon, May 29, 2023 at 11:18 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>
> >> Hi,
> >>
> >> 在 2023/05/29 11:10, Xiao Ni 写道:
> >>> On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> 在 2023/05/29 10:08, Xiao Ni 写道:
> >>>>> Hi Kuai
> >>>>>
> >>>>> There is a limitation of the memory in your test. But for most
> >>>>> situations, customers should not set this. Can this change introduce a
> >>>>> performance regression against other situations?
> >>>>
> >>>> Noted that this limitation is just to triggered writeback as soon as
> >>>> possible in the test, and it's 100% sure real situations can trigger
> >>>> dirty pages write back asynchronously and continue to produce new dirty
> >>>> pages.
> >>>
> >>> Hi
> >>>
> >>> I'm confused here. If we want to trigger write back quickly, it needs
> >>> to set these two values with a smaller number, rather than 0 and 60.
> >>> Right?
> >>
> >> 60 is not required, I'll remove this setting.
> >>
> >> 0 just means write back if there are any dirty pages.
> >
> > Hi Kuai
> >
> > Does 0 mean disabling write back? I tried to find the doc that
> > describes the meaning when setting dirty_background_ratio to 0, but I
> > didn't find it.
> > In https://www.kernel.org/doc/html/next/admin-guide/sysctl/vm.html it
> > doesn't describe this. But it says something like this
> >
> > Note:
> >    dirty_background_bytes is the counterpart of dirty_background_ratio. Only
> >    one of them may be specified at a time. When one sysctl is written it is
> >    immediately taken into account to evaluate the dirty memory limits and the
> >    other appears as 0 when read.
> >
> > Maybe you can specify dirty_background_ratio to 1 if you want to
> > trigger write back ASAP.
>
> The purpose here is to trigger write back ASAP, I'm not an expert here,
> but based on test result, 0 obviously doesn't mean disable write back.
>
> Set dirty_background_bytes to a value, dirty_background_ratio will be
> set to 0 together, which means dirty_background_ratio is disabled.
> However, change dirty_background_ratio from default value to 0, will end
> up both dirty_background_ratio and dirty_background_bytes to be 0, and
> based on following related code, I think 0 just means write back if
> there are any dirty pages.
>
> domain_dirty_limits:
>   bg_bytes = dirty_background_bytes -> 0
>   bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100 -> 0
>
>   if (bg_bytes)
>          bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
>   else
>          bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; -> 0
>
>   dtc->bg_thresh = bg_thresh; -> 0
>
> balance_dirty_pages
>   nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
>   if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
>        !writeback_in_progress(wb))
>     wb_start_background_writeback(wb); -> writeback ASAP
>
> Thanks,
> Kuai

Hi Kuai

I'm not an expert about this either. Thanks for all your patches, I
can study more things too. But I still have some questions.

I did a test in my environment something like this:
modprobe brd rd_nr=4 rd_size=10485760
mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
echo 0 > /proc/sys/vm/dirty_background_ratio
fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
-numjobs=1 -iodepth=128 --runtime=10 -name=xxx
It will cause OOM and the system hangs

modprobe brd rd_nr=4 rd_size=10485760
mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
echo 1 > /proc/sys/vm/dirty_background_ratio (THIS is the only different place)
fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
-numjobs=1 -iodepth=128 --runtime=10 -name=xxx
It can finish successfully.  The value of dirty_background_ration is 1
here means it flushes ASAP

So your method should be the opposite way as you designed. All the
memory can't be flushed in time, so it uses all memory very soon and
the memory runs out and the system hangs. The reason I'm looking at
the test is that do we really need this change. Because in the real
world, most customers don't disable write back. Anyway, it depends on
Song's decision and thanks for your patches again. I'll review V3 and
try to do some performance tests.

Best Regards
Xiao
> >
> >>>>
> >>>> If a lot of bio is not plugged, then it's the same as before; if a lot
> >>>> of bio is plugged, noted that before this patchset, these bio will spent
> >>>> quite a long time in plug, and hence I think performance should be
> >>>> better.
> >>>
> >>> Hmm, it depends on if it's sequential or not? If it's a big io
> >>> request, can it miss the merge opportunity?
> >>
> >> The bio will still be merged to underlying disks' rq(if it's rq based),
> >> underlying disk won't flush plug untill the number of request exceed
> >> threshold.
> >
> > Thanks for this.
> >
> > Regards
> > Xiao
> >>
> >> Thanks,
> >> Kuai
> >>>
> >>> Regards
> >>> Xiao
> >>>
> >>>>
> >>>> Thanks,
> >>>> Kuai
> >>>>>
> >>>>> Best Regards
> >>>>> Xiao
> >>>>>
> >>>>> On Wed, Apr 26, 2023 at 4:24 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>>>>>
> >>>>>> From: Yu Kuai <yukuai3@huawei.com>
> >>>>>>
> >>>>>> bio can be added to plug infinitely, and following writeback test can
> >>>>>> trigger huge amount of plugged bio:
> >>>>>>
> >>>>>> Test script:
> >>>>>> modprobe brd rd_nr=4 rd_size=10485760
> >>>>>> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> >>>>>> echo 0 > /proc/sys/vm/dirty_background_ratio
> >>>>>> echo 60 > /proc/sys/vm/dirty_ratio
> >>>>>> fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test
> >>>>>>
> >>>>>> Test result:
> >>>>>> Monitor /sys/block/md0/inflight will found that inflight keep increasing
> >>>>>> until fio finish writing, after running for about 2 minutes:
> >>>>>>
> >>>>>> [root@fedora ~]# cat /sys/block/md0/inflight
> >>>>>>           0  4474191
> >>>>>>
> >>>>>> Fix the problem by limiting the number of plugged bio based on the number
> >>>>>> of copies for original bio.
> >>>>>>
> >>>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> >>>>>> ---
> >>>>>>     drivers/md/raid1-10.c | 9 ++++++++-
> >>>>>>     drivers/md/raid1.c    | 2 +-
> >>>>>>     drivers/md/raid10.c   | 2 +-
> >>>>>>     3 files changed, 10 insertions(+), 3 deletions(-)
> >>>>>>
> >>>>>> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> >>>>>> index 98d678b7df3f..35fb80aa37aa 100644
> >>>>>> --- a/drivers/md/raid1-10.c
> >>>>>> +++ b/drivers/md/raid1-10.c
> >>>>>> @@ -21,6 +21,7 @@
> >>>>>>     #define IO_MADE_GOOD ((struct bio *)2)
> >>>>>>
> >>>>>>     #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
> >>>>>> +#define MAX_PLUG_BIO 32
> >>>>>>
> >>>>>>     /* for managing resync I/O pages */
> >>>>>>     struct resync_pages {
> >>>>>> @@ -31,6 +32,7 @@ struct resync_pages {
> >>>>>>     struct raid1_plug_cb {
> >>>>>>            struct blk_plug_cb      cb;
> >>>>>>            struct bio_list         pending;
> >>>>>> +       unsigned int            count;
> >>>>>>     };
> >>>>>>
> >>>>>>     static void rbio_pool_free(void *rbio, void *data)
> >>>>>> @@ -127,7 +129,7 @@ static inline void md_submit_write(struct bio *bio)
> >>>>>>     }
> >>>>>>
> >>>>>>     static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >>>>>> -                                     blk_plug_cb_fn unplug)
> >>>>>> +                                     blk_plug_cb_fn unplug, int copies)
> >>>>>>     {
> >>>>>>            struct raid1_plug_cb *plug = NULL;
> >>>>>>            struct blk_plug_cb *cb;
> >>>>>> @@ -147,6 +149,11 @@ static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
> >>>>>>
> >>>>>>            plug = container_of(cb, struct raid1_plug_cb, cb);
> >>>>>>            bio_list_add(&plug->pending, bio);
> >>>>>> +       if (++plug->count / MAX_PLUG_BIO >= copies) {
> >>>>>> +               list_del(&cb->list);
> >>>>>> +               cb->callback(cb, false);
> >>>>>> +       }
> >>>>>> +
> >>>>>>
> >>>>>>            return true;
> >>>>>>     }
> >>>>>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> >>>>>> index 639e09cecf01..c6066408a913 100644
> >>>>>> --- a/drivers/md/raid1.c
> >>>>>> +++ b/drivers/md/raid1.c
> >>>>>> @@ -1562,7 +1562,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> >>>>>>                                                  r1_bio->sector);
> >>>>>>                    /* flush_pending_writes() needs access to the rdev so...*/
> >>>>>>                    mbio->bi_bdev = (void *)rdev;
> >>>>>> -               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
> >>>>>> +               if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
> >>>>>>                            spin_lock_irqsave(&conf->device_lock, flags);
> >>>>>>                            bio_list_add(&conf->pending_bio_list, mbio);
> >>>>>>                            spin_unlock_irqrestore(&conf->device_lock, flags);
> >>>>>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> >>>>>> index bd9e655ca408..7135cfaf75db 100644
> >>>>>> --- a/drivers/md/raid10.c
> >>>>>> +++ b/drivers/md/raid10.c
> >>>>>> @@ -1306,7 +1306,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
> >>>>>>
> >>>>>>            atomic_inc(&r10_bio->remaining);
> >>>>>>
> >>>>>> -       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
> >>>>>> +       if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
> >>>>>>                    spin_lock_irqsave(&conf->device_lock, flags);
> >>>>>>                    bio_list_add(&conf->pending_bio_list, mbio);
> >>>>>>                    spin_unlock_irqrestore(&conf->device_lock, flags);
> >>>>>> --
> >>>>>> 2.39.2
> >>>>>>
> >>>>>
> >>>>> .
> >>>>>
> >>>>
> >>>
> >>> .
> >>>
> >>
> >
> > .
> >
>

Yu Kuai May 30, 2023, 1:19 a.m. UTC | #8

Hi,

在 2023/05/30 8:58, Xiao Ni 写道:
> On Mon, May 29, 2023 at 4:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>
>> Hi,
>>
>> 在 2023/05/29 15:57, Xiao Ni 写道:
>>> On Mon, May 29, 2023 at 11:18 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>
>>>> Hi,
>>>>
>>>> 在 2023/05/29 11:10, Xiao Ni 写道:
>>>>> On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> 在 2023/05/29 10:08, Xiao Ni 写道:
>>>>>>> Hi Kuai
>>>>>>>
>>>>>>> There is a limitation of the memory in your test. But for most
>>>>>>> situations, customers should not set this. Can this change introduce a
>>>>>>> performance regression against other situations?
>>>>>>
>>>>>> Noted that this limitation is just to triggered writeback as soon as
>>>>>> possible in the test, and it's 100% sure real situations can trigger
>>>>>> dirty pages write back asynchronously and continue to produce new dirty
>>>>>> pages.
>>>>>
>>>>> Hi
>>>>>
>>>>> I'm confused here. If we want to trigger write back quickly, it needs
>>>>> to set these two values with a smaller number, rather than 0 and 60.
>>>>> Right?
>>>>
>>>> 60 is not required, I'll remove this setting.
>>>>
>>>> 0 just means write back if there are any dirty pages.
>>>
>>> Hi Kuai
>>>
>>> Does 0 mean disabling write back? I tried to find the doc that
>>> describes the meaning when setting dirty_background_ratio to 0, but I
>>> didn't find it.
>>> In https://www.kernel.org/doc/html/next/admin-guide/sysctl/vm.html it
>>> doesn't describe this. But it says something like this
>>>
>>> Note:
>>>     dirty_background_bytes is the counterpart of dirty_background_ratio. Only
>>>     one of them may be specified at a time. When one sysctl is written it is
>>>     immediately taken into account to evaluate the dirty memory limits and the
>>>     other appears as 0 when read.
>>>
>>> Maybe you can specify dirty_background_ratio to 1 if you want to
>>> trigger write back ASAP.
>>
>> The purpose here is to trigger write back ASAP, I'm not an expert here,
>> but based on test result, 0 obviously doesn't mean disable write back.
>>
>> Set dirty_background_bytes to a value, dirty_background_ratio will be
>> set to 0 together, which means dirty_background_ratio is disabled.
>> However, change dirty_background_ratio from default value to 0, will end
>> up both dirty_background_ratio and dirty_background_bytes to be 0, and
>> based on following related code, I think 0 just means write back if
>> there are any dirty pages.
>>
>> domain_dirty_limits:
>>    bg_bytes = dirty_background_bytes -> 0
>>    bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100 -> 0
>>
>>    if (bg_bytes)
>>           bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
>>    else
>>           bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; -> 0
>>
>>    dtc->bg_thresh = bg_thresh; -> 0
>>
>> balance_dirty_pages
>>    nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
>>    if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
>>         !writeback_in_progress(wb))
>>      wb_start_background_writeback(wb); -> writeback ASAP
>>
>> Thanks,
>> Kuai
> 
> Hi Kuai
> 
> I'm not an expert about this either. Thanks for all your patches, I
> can study more things too. But I still have some questions.
> 
> I did a test in my environment something like this:
> modprobe brd rd_nr=4 rd_size=10485760
> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> echo 0 > /proc/sys/vm/dirty_background_ratio
> fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
> -numjobs=1 -iodepth=128 --runtime=10 -name=xxx
> It will cause OOM and the system hangs

OOM means you trigger this problem... Plug hold lots of bios and cost
lots of memory, it's not that write back is disabled, you can verify
this by monitor md inflight, noted that don't use too much memory for
ramdisk(rd_nr * rd_size) in the test so that OOM won't be triggered.

Have you tried to test with this patchset?

> 
> modprobe brd rd_nr=4 rd_size=10485760
> mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> echo 1 > /proc/sys/vm/dirty_background_ratio (THIS is the only different place)
> fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
> -numjobs=1 -iodepth=128 --runtime=10 -name=xxx
> It can finish successfully.  The value of dirty_background_ration is 1
> here means it flushes ASAP

This really doesn't mean flushes ASAP, our test report this problem in
the real test that doesn't modify dirty_background_ratio. I guess
somewhere triggers io_scheduler(), probably background thread think
dirty pages doesn't match threshold, but I'm not sure for now.

Thanks,
Kuai
> 
> So your method should be the opposite way as you designed. All the
> memory can't be flushed in time, so it uses all memory very soon and
> the memory runs out and the system hangs. The reason I'm looking at
> the test is that do we really need this change. Because in the real
> world, most customers don't disable write back. Anyway, it depends on
> Song's decision and thanks for your patches again. I'll review V3 and
> try to do some performance tests.
> 
> Best Regards
> Xiao

Xiao Ni May 30, 2023, 2:25 a.m. UTC | #9

On Tue, May 30, 2023 at 9:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2023/05/30 8:58, Xiao Ni 写道:
> > On Mon, May 29, 2023 at 4:50 PM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>
> >> Hi,
> >>
> >> 在 2023/05/29 15:57, Xiao Ni 写道:
> >>> On Mon, May 29, 2023 at 11:18 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> 在 2023/05/29 11:10, Xiao Ni 写道:
> >>>>> On Mon, May 29, 2023 at 10:20 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
> >>>>>>
> >>>>>> Hi,
> >>>>>>
> >>>>>> 在 2023/05/29 10:08, Xiao Ni 写道:
> >>>>>>> Hi Kuai
> >>>>>>>
> >>>>>>> There is a limitation of the memory in your test. But for most
> >>>>>>> situations, customers should not set this. Can this change introduce a
> >>>>>>> performance regression against other situations?
> >>>>>>
> >>>>>> Noted that this limitation is just to triggered writeback as soon as
> >>>>>> possible in the test, and it's 100% sure real situations can trigger
> >>>>>> dirty pages write back asynchronously and continue to produce new dirty
> >>>>>> pages.
> >>>>>
> >>>>> Hi
> >>>>>
> >>>>> I'm confused here. If we want to trigger write back quickly, it needs
> >>>>> to set these two values with a smaller number, rather than 0 and 60.
> >>>>> Right?
> >>>>
> >>>> 60 is not required, I'll remove this setting.
> >>>>
> >>>> 0 just means write back if there are any dirty pages.
> >>>
> >>> Hi Kuai
> >>>
> >>> Does 0 mean disabling write back? I tried to find the doc that
> >>> describes the meaning when setting dirty_background_ratio to 0, but I
> >>> didn't find it.
> >>> In https://www.kernel.org/doc/html/next/admin-guide/sysctl/vm.html it
> >>> doesn't describe this. But it says something like this
> >>>
> >>> Note:
> >>>     dirty_background_bytes is the counterpart of dirty_background_ratio. Only
> >>>     one of them may be specified at a time. When one sysctl is written it is
> >>>     immediately taken into account to evaluate the dirty memory limits and the
> >>>     other appears as 0 when read.
> >>>
> >>> Maybe you can specify dirty_background_ratio to 1 if you want to
> >>> trigger write back ASAP.
> >>
> >> The purpose here is to trigger write back ASAP, I'm not an expert here,
> >> but based on test result, 0 obviously doesn't mean disable write back.
> >>
> >> Set dirty_background_bytes to a value, dirty_background_ratio will be
> >> set to 0 together, which means dirty_background_ratio is disabled.
> >> However, change dirty_background_ratio from default value to 0, will end
> >> up both dirty_background_ratio and dirty_background_bytes to be 0, and
> >> based on following related code, I think 0 just means write back if
> >> there are any dirty pages.
> >>
> >> domain_dirty_limits:
> >>    bg_bytes = dirty_background_bytes -> 0
> >>    bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100 -> 0
> >>
> >>    if (bg_bytes)
> >>           bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
> >>    else
> >>           bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; -> 0
> >>
> >>    dtc->bg_thresh = bg_thresh; -> 0
> >>
> >> balance_dirty_pages
> >>    nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
> >>    if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
> >>         !writeback_in_progress(wb))
> >>      wb_start_background_writeback(wb); -> writeback ASAP
> >>
> >> Thanks,
> >> Kuai
> >
> > Hi Kuai
> >
> > I'm not an expert about this either. Thanks for all your patches, I
> > can study more things too. But I still have some questions.
> >
> > I did a test in my environment something like this:
> > modprobe brd rd_nr=4 rd_size=10485760
> > mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> > echo 0 > /proc/sys/vm/dirty_background_ratio
> > fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
> > -numjobs=1 -iodepth=128 --runtime=10 -name=xxx
> > It will cause OOM and the system hangs
>
> OOM means you trigger this problem... Plug hold lots of bios and cost
> lots of memory, it's not that write back is disabled, you can verify
> this by monitor md inflight, noted that don't use too much memory for
> ramdisk(rd_nr * rd_size) in the test so that OOM won't be triggered.
>
> Have you tried to test with this patchset?

Yes, I know I have reproduced this problem. I'll have the v3 patchest.
>
> >
> > modprobe brd rd_nr=4 rd_size=10485760
> > mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean
> > echo 1 > /proc/sys/vm/dirty_background_ratio (THIS is the only different place)
> > fio -filename=/dev/md0 -ioengine=libaio -rw=write -thread -bs=1k-8k
> > -numjobs=1 -iodepth=128 --runtime=10 -name=xxx
> > It can finish successfully.  The value of dirty_background_ration is 1
> > here means it flushes ASAP
>
> This really doesn't mean flushes ASAP, our test report this problem in
> the real test that doesn't modify dirty_background_ratio. I guess
> somewhere triggers io_scheduler(), probably background thread think
> dirty pages doesn't match threshold, but I'm not sure for now.

Thanks for notifying me of this.

Regards
Xiao
>
> Thanks,
> Kuai
> >
> > So your method should be the opposite way as you designed. All the
> > memory can't be flushed in time, so it uses all memory very soon and
> > the memory runs out and the system hangs. The reason I'm looking at
> > the test is that do we really need this change. Because in the real
> > world, most customers don't disable write back. Anyway, it depends on
> > Song's decision and thanks for your patches again. I'll review V3 and
> > try to do some performance tests.
> >
> > Best Regards
> > Xiao
>

diff mbox series

Patch

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 98d678b7df3f..35fb80aa37aa 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -21,6 +21,7 @@ 
 #define IO_MADE_GOOD ((struct bio *)2)
 
 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+#define MAX_PLUG_BIO 32
 
 /* for managing resync I/O pages */
 struct resync_pages {
@@ -31,6 +32,7 @@  struct resync_pages {
 struct raid1_plug_cb {
 	struct blk_plug_cb	cb;
 	struct bio_list		pending;
+	unsigned int		count;
 };
 
 static void rbio_pool_free(void *rbio, void *data)
@@ -127,7 +129,7 @@  static inline void md_submit_write(struct bio *bio)
 }
 
 static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
-				      blk_plug_cb_fn unplug)
+				      blk_plug_cb_fn unplug, int copies)
 {
 	struct raid1_plug_cb *plug = NULL;
 	struct blk_plug_cb *cb;
@@ -147,6 +149,11 @@  static inline bool md_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
 
 	plug = container_of(cb, struct raid1_plug_cb, cb);
 	bio_list_add(&plug->pending, bio);
+	if (++plug->count / MAX_PLUG_BIO >= copies) {
+		list_del(&cb->list);
+		cb->callback(cb, false);
+	}
+
 
 	return true;
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 639e09cecf01..c6066408a913 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1562,7 +1562,7 @@  static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 					      r1_bio->sector);
 		/* flush_pending_writes() needs access to the rdev so...*/
 		mbio->bi_bdev = (void *)rdev;
-		if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug)) {
+		if (!md_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
 			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			spin_unlock_irqrestore(&conf->device_lock, flags);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index bd9e655ca408..7135cfaf75db 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1306,7 +1306,7 @@  static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 
 	atomic_inc(&r10_bio->remaining);
 
-	if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug)) {
+	if (!md_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) {
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);