[2/3] memblock: Make finding index faster when modify regions.
Commit Message
We can use binary search to find the index at which to modify regions in
memblock_add_range() and memblock_isolate_range(), because the
arrangement of regions is ordered. It may be faster when there are
many regions, so implement a binary search and a new macro to walk
regions.
Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
---
mm/memblock.c | 33 +++++++++++++++++++++++++++++----
1 file changed, 29 insertions(+), 4 deletions(-)
Comments
Hi,
On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> We can use binary search to find the index to modify regions in
> memblock_add_range() and memblock_isolate_range(). Because the
> arrangement of regions is ordered. It may be faster when there are
> many regions. So implemented a binary search and a new macro to walk
> regions.
Did you see a measurable speedup with this optimization?
I'm not in favor of micro-optimizations that complicate code.
> Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
> ---
> mm/memblock.c | 33 +++++++++++++++++++++++++++++----
> 1 file changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 6eedcfc5dcc1..cb92770ac22e 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -149,6 +149,11 @@ static __refdata struct memblock_type *memblock_memory = &memblock.memory;
> i < memblock_type->cnt; \
> i++, rgn = &memblock_type->regions[i])
>
> +#define for_each_memblock_type_start(i, start, memblock_type, rgn) \
> + for (i = start, rgn = &memblock_type->regions[i]; \
> + i < memblock_type->cnt; \
> + i++, rgn = &memblock_type->regions[i])
> +
> #define memblock_dbg(fmt, ...) \
> do { \
> if (memblock_debug) \
> @@ -181,6 +186,24 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
> return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
> }
>
> +/*
> + * Binary search for the first region not to the left of @base.
> + */
> +static unsigned long __init_memblock memblock_lower_bound_region(struct memblock_type *type,
> + phys_addr_t base)
> +{
> + unsigned long idx, low_idx = 0, high_idx = type->cnt;
> +
> + while (low_idx < high_idx) {
> + idx = (low_idx + high_idx) >> 1;
> + if (type->regions[idx].base + type->regions[idx].size <= base)
> + low_idx = idx + 1;
> + else
> + high_idx = idx;
> + }
> + return low_idx;
> +}
> +
> bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
> phys_addr_t base, phys_addr_t size)
> {
> @@ -581,7 +604,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
> bool insert = false;
> phys_addr_t obase = base;
> phys_addr_t end = base + memblock_cap_size(base, &size);
> - int idx, nr_new;
> + int idx, start_idx, nr_new;
> struct memblock_region *rgn;
>
> if (!size)
> @@ -607,6 +630,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
> */
> if (type->cnt * 2 + 1 <= type->max)
> insert = true;
> + start_idx = memblock_lower_bound_region(type, base);
>
> repeat:
> /*
> @@ -617,7 +641,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
> base = obase;
> nr_new = 0;
>
> - for_each_memblock_type(idx, type, rgn) {
> + for_each_memblock_type_start(idx, start_idx, type, rgn) {
> phys_addr_t rbase = rgn->base;
> phys_addr_t rend = rbase + rgn->size;
>
> @@ -737,7 +761,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> int *start_rgn, int *end_rgn)
> {
> phys_addr_t end = base + memblock_cap_size(base, &size);
> - int idx;
> + int idx, start_idx;
> struct memblock_region *rgn;
>
> *start_rgn = *end_rgn = 0;
> @@ -750,7 +774,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
> if (memblock_double_array(type, base, size) < 0)
> return -ENOMEM;
>
> - for_each_memblock_type(idx, type, rgn) {
> + start_idx = memblock_lower_bound_region(type, base);
> + for_each_memblock_type_start(idx, start_idx, type, rgn) {
> phys_addr_t rbase = rgn->base;
> phys_addr_t rend = rbase + rgn->size;
>
> --
> 2.20.1
>
在 2023/1/15 22:02, Mike Rapoport 写道:
> Hi,
>
> On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
>> We can use binary search to find the index to modify regions in
>> memblock_add_range() and memblock_isolate_range(). Because the
>> arrangement of regions is ordered. It may be faster when there are
>> many regions. So implemented a binary search and a new macro to walk
>> regions.
>
> Did you see a measurable speedup with this optimization?
> I'm not in favor of micro-optimizations that complicate code.
>
Thank you for your reply. I haven't measured this patch yet;
theoretically, this small optimization might be difficult to observe.
If you think this patch complicates the code, you can ignore it.
These three patches are independent, and they can be applied
independently. The logic of the third patch is very simple. It will not
complicate the code. It was tested with the default configuration of qemu.
The total number of iterations of memblock_merge_regions() in the third
patch is reduced from more than one thousand to more than one hundred;
this is only in the case of a small number of regions. Could you consider
the third patch?
Sincerely yours,
Peng.
On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
>
>
> 在 2023/1/15 22:02, Mike Rapoport 写道:
> > Hi,
> >
> > On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> > > We can use binary search to find the index to modify regions in
> > > memblock_add_range() and memblock_isolate_range(). Because the
> > > arrangement of regions is ordered. It may be faster when there are
> > > many regions. So implemented a binary search and a new macro to walk
> > > regions.
> >
> > Did you see a measurable speedup with this optimization?
> > I'm not in favor of micro-optimizations that complicate code.
> >
> Thank you for your reply. I haven't measured this patch yet, theoretically
> this small optimization might be difficult to observe.
> If you think this patch complicates the code, you can ignore this patch.
>
> These three patches are independent and they can be applied independently.
> The logic of the third patch is very simple. It will not complicate the
> code. It is tested by the default configuration of qemu. The total number of
> iterations of memblock_merge_regions() in the third patch is reduced from
> more than one thousand to more than one hundred, this is only in the case of
> a small number of regions. Can you consider the third patch?
Can you please send the numbers and show how did you obtained them?
> Sincerely yours,
> Peng.
在 2023/1/16 15:37, Mike Rapoport 写道:
> On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
>>
>>
>> 在 2023/1/15 22:02, Mike Rapoport 写道:
>>> Hi,
>>>
>>> On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
>>>> We can use binary search to find the index to modify regions in
>>>> memblock_add_range() and memblock_isolate_range(). Because the
>>>> arrangement of regions is ordered. It may be faster when there are
>>>> many regions. So implemented a binary search and a new macro to walk
>>>> regions.
>>>
>>> Did you see a measurable speedup with this optimization?
>>> I'm not in favor of micro-optimizations that complicate code.
>>>
>> Thank you for your reply. I haven't measured this patch yet, theoretically
>> this small optimization might be difficult to observe.
>> If you think this patch complicates the code, you can ignore this patch.
>>
>> These three patches are independent and they can be applied independently.
>> The logic of the third patch is very simple. It will not complicate the
>> code. It is tested by the default configuration of qemu. The total number of
>> iterations of memblock_merge_regions() in the third patch is reduced from
>> more than one thousand to more than one hundred, this is only in the case of
>> a small number of regions. Can you consider the third patch?
>
> Can you please send the numbers and show how did you obtained them?
>
>> Sincerely yours,
>> Peng.
>
I obtained the numbers like this:
void memblock_merge_regions(struct memblock_type *type) {
static int iteration_count = 0;
static int max_nr_regions = 0;
max_nr_regions = max(max_nr_regions, (int)type->cnt);
...
while () {
iteration_count++;
...
}
pr_info("iteration_count: %d max_nr_regions %d", iteration_count,
max_nr_regions);
}
Boot the kernel with qemu.
The following numbers are from the last output.
master branch:
iteration_count: 1762 max_nr_regions 41
patched:
iteration_count: 182 max_nr_regions 41
If max_nr_regions is larger, the difference will be greater.
Thanks.
Sincerely yours,
Peng
On Mon, Jan 16, 2023 at 04:40:47PM +0800, Peng Zhang wrote:
>
>
> 在 2023/1/16 15:37, Mike Rapoport 写道:
> > On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
> > >
> > >
> > > 在 2023/1/15 22:02, Mike Rapoport 写道:
> > > > Hi,
> > > >
> > > > On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> > > > > We can use binary search to find the index to modify regions in
> > > > > memblock_add_range() and memblock_isolate_range(). Because the
> > > > > arrangement of regions is ordered. It may be faster when there are
> > > > > many regions. So implemented a binary search and a new macro to walk
> > > > > regions.
> > > >
> > > > Did you see a measurable speedup with this optimization?
> > > > I'm not in favor of micro-optimizations that complicate code.
> > > >
> > > Thank you for your reply. I haven't measured this patch yet, theoretically
> > > this small optimization might be difficult to observe.
> > > If you think this patch complicates the code, you can ignore this patch.
> > >
> > > These three patches are independent and they can be applied independently.
> > > The logic of the third patch is very simple. It will not complicate the
> > > code. It is tested by the default configuration of qemu. The total number of
> > > iterations of memblock_merge_regions() in the third patch is reduced from
> > > more than one thousand to more than one hundred, this is only in the case of
> > > a small number of regions. Can you consider the third patch?
> >
> > Can you please send the numbers and show how did you obtained them?
> >
> > > Sincerely yours,
> > > Peng.
> >
> I obtained the numbers like this:
>
> void memblock_merge_regions(struct memblock_type *type) {
> static int iteration_count = 0;
> static int max_nr_regions = 0;
>
> max_nr_regions = max(max_nr_regions, (int)type->cnt);
> ...
> while () {
> iteration_count++;
> ...
> }
> pr_info("iteration_count: %d max_nr_regions %d", iteration_count,
> max_nr_regions);
> }
>
> Boot the kernel by qemu.
> The folowing numbers is the last output.
>
> master branch:
> iteration_count: 1762 max_nr_regions 41
>
> patched:
> iteration_count: 182 max_nr_regions 41
The numbers that indicate what speed up your patch achieves should be a
part of the changelog. It would be great if you could test it on a real
machine and measure the actual time saved by your changes.
> If max_nr_regions is larger, the difference will be more.
>
> Thanks.
>
> Sincerely yours,
> Peng
@@ -149,6 +149,11 @@ static __refdata struct memblock_type *memblock_memory = &memblock.memory;
i < memblock_type->cnt; \
i++, rgn = &memblock_type->regions[i])
+#define for_each_memblock_type_start(i, start, memblock_type, rgn) \
+ for (i = start, rgn = &memblock_type->regions[i]; \
+ i < memblock_type->cnt; \
+ i++, rgn = &memblock_type->regions[i])
+
#define memblock_dbg(fmt, ...) \
do { \
if (memblock_debug) \
@@ -181,6 +186,24 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
+/*
+ * Binary search for the first region not to the left of @base.
+ */
+static unsigned long __init_memblock memblock_lower_bound_region(struct memblock_type *type,
+ phys_addr_t base)
+{
+ unsigned long idx, low_idx = 0, high_idx = type->cnt;
+
+ while (low_idx < high_idx) {
+ idx = (low_idx + high_idx) >> 1;
+ if (type->regions[idx].base + type->regions[idx].size <= base)
+ low_idx = idx + 1;
+ else
+ high_idx = idx;
+ }
+ return low_idx;
+}
+
bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
@@ -581,7 +604,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
- int idx, nr_new;
+ int idx, start_idx, nr_new;
struct memblock_region *rgn;
if (!size)
@@ -607,6 +630,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
*/
if (type->cnt * 2 + 1 <= type->max)
insert = true;
+ start_idx = memblock_lower_bound_region(type, base);
repeat:
/*
@@ -617,7 +641,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
base = obase;
nr_new = 0;
- for_each_memblock_type(idx, type, rgn) {
+ for_each_memblock_type_start(idx, start_idx, type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -737,7 +761,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
int *start_rgn, int *end_rgn)
{
phys_addr_t end = base + memblock_cap_size(base, &size);
- int idx;
+ int idx, start_idx;
struct memblock_region *rgn;
*start_rgn = *end_rgn = 0;
@@ -750,7 +774,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
- for_each_memblock_type(idx, type, rgn) {
+ start_idx = memblock_lower_bound_region(type, base);
+ for_each_memblock_type_start(idx, start_idx, type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;