[2/3] memblock: Make finding index faster when modify regions.

Message ID 20230113082659.65276-3-zhangpeng.00@bytedance.com
State New
Headers
Series Some small improvements for memblock. |

Commit Message

Peng Zhang Jan. 13, 2023, 8:26 a.m. UTC
  We can use binary search to find the index at which to modify regions
in memblock_add_range() and memblock_isolate_range(), because the
arrangement of regions is ordered. This may be faster when there are
many regions, so implement a binary search and a new macro to walk
regions starting from a given index.

Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
---
 mm/memblock.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)
  

Comments

Mike Rapoport Jan. 15, 2023, 2:02 p.m. UTC | #1
Hi,

On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> We can use binary search to find the index to modify regions in
> memblock_add_range() and memblock_isolate_range(). Because the
> arrangement of regions is ordered. It may be faster when there are
> many regions. So implemented a binary search and a new macro to walk
> regions.

Did you see a measurable speedup with this optimization?
I'm not in favor of micro-optimizations that complicate code.

> Signed-off-by: Peng Zhang <zhangpeng.00@bytedance.com>
> ---
>  mm/memblock.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 6eedcfc5dcc1..cb92770ac22e 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -149,6 +149,11 @@ static __refdata struct memblock_type *memblock_memory = &memblock.memory;
>  	     i < memblock_type->cnt;					\
>  	     i++, rgn = &memblock_type->regions[i])
>  
> +#define for_each_memblock_type_start(i, start, memblock_type, rgn)	\
> +	for (i = start, rgn = &memblock_type->regions[i];		\
> +	     i < memblock_type->cnt;					\
> +	     i++, rgn = &memblock_type->regions[i])
> +
>  #define memblock_dbg(fmt, ...)						\
>  	do {								\
>  		if (memblock_debug)					\
> @@ -181,6 +186,24 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
>  	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
>  }
>  
> +/*
> + * Binary search for the first region not to the left of @base.
> + */
> +static unsigned long __init_memblock memblock_lower_bound_region(struct memblock_type *type,
> +								 phys_addr_t base)
> +{
> +	unsigned long idx, low_idx = 0, high_idx = type->cnt;
> +
> +	while (low_idx < high_idx) {
> +		idx = (low_idx + high_idx) >> 1;
> +		if (type->regions[idx].base + type->regions[idx].size <= base)
> +			low_idx = idx + 1;
> +		else
> +			high_idx = idx;
> +	}
> +	return low_idx;
> +}
> +
>  bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
>  					phys_addr_t base, phys_addr_t size)
>  {
> @@ -581,7 +604,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
>  	bool insert = false;
>  	phys_addr_t obase = base;
>  	phys_addr_t end = base + memblock_cap_size(base, &size);
> -	int idx, nr_new;
> +	int idx, start_idx, nr_new;
>  	struct memblock_region *rgn;
>  
>  	if (!size)
> @@ -607,6 +630,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
>  	 */
>  	if (type->cnt * 2 + 1 <= type->max)
>  		insert = true;
> +	start_idx = memblock_lower_bound_region(type, base);
>  
>  repeat:
>  	/*
> @@ -617,7 +641,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
>  	base = obase;
>  	nr_new = 0;
>  
> -	for_each_memblock_type(idx, type, rgn) {
> +	for_each_memblock_type_start(idx, start_idx, type, rgn) {
>  		phys_addr_t rbase = rgn->base;
>  		phys_addr_t rend = rbase + rgn->size;
>  
> @@ -737,7 +761,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>  					int *start_rgn, int *end_rgn)
>  {
>  	phys_addr_t end = base + memblock_cap_size(base, &size);
> -	int idx;
> +	int idx, start_idx;
>  	struct memblock_region *rgn;
>  
>  	*start_rgn = *end_rgn = 0;
> @@ -750,7 +774,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
>  		if (memblock_double_array(type, base, size) < 0)
>  			return -ENOMEM;
>  
> -	for_each_memblock_type(idx, type, rgn) {
> +	start_idx = memblock_lower_bound_region(type, base);
> +	for_each_memblock_type_start(idx, start_idx, type, rgn) {
>  		phys_addr_t rbase = rgn->base;
>  		phys_addr_t rend = rbase + rgn->size;
>  
> -- 
> 2.20.1
>
  
Peng Zhang Jan. 16, 2023, 3:17 a.m. UTC | #2
在 2023/1/15 22:02, Mike Rapoport 写道:
> Hi,
> 
> On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
>> We can use binary search to find the index to modify regions in
>> memblock_add_range() and memblock_isolate_range(). Because the
>> arrangement of regions is ordered. It may be faster when there are
>> many regions. So implemented a binary search and a new macro to walk
>> regions.
> 
> Did you see a measurable speedup with this optimization?
> I'm not in favor of micro-optimizations that complicate code.
>
Thank you for your reply. I haven't measured this patch yet, 
theoretically this small optimization might be difficult to observe.
If you think this patch complicates the code, you can ignore this patch.

These three patches are independent and they can be applied 
independently. The logic of the third patch is very simple. It will not 
complicate the code. It is tested by the default configuration of qemu. 
The total number of iterations of memblock_merge_regions() in the third 
patch is reduced from more than one thousand to more than one hundred;
this is only in the case of a small number of regions. Can you consider 
the third patch?

Sincerely yours,
Peng.
  
Mike Rapoport Jan. 16, 2023, 7:37 a.m. UTC | #3
On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
> 
> 
> 在 2023/1/15 22:02, Mike Rapoport 写道:
> > Hi,
> > 
> > On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> > > We can use binary search to find the index to modify regions in
> > > memblock_add_range() and memblock_isolate_range(). Because the
> > > arrangement of regions is ordered. It may be faster when there are
> > > many regions. So implemented a binary search and a new macro to walk
> > > regions.
> > 
> > Did you see a measurable speedup with this optimization?
> > I'm not in favor of micro-optimizations that complicate code.
> > 
> Thank you for your reply. I haven't measured this patch yet, theoretically
> this small optimization might be difficult to observe.
> If you think this patch complicates the code, you can ignore this patch.
> 
> These three patches are independent and they can be applied independently.
> The logic of the third patch is very simple. It will not complicate the
> code. It is tested by the default configuration of qemu. The total number of
> iterations of memblock_merge_regions() in the third patch is reduced from
> more than one thousand to more than one hundred, this is only in the case of
> a small number of regions. Can you consider the third patch?

Can you please send the numbers and show how did you obtained them?

> Sincerely yours,
> Peng.
  
Peng Zhang Jan. 16, 2023, 8:40 a.m. UTC | #4
在 2023/1/16 15:37, Mike Rapoport 写道:
> On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
>>
>>
>> 在 2023/1/15 22:02, Mike Rapoport 写道:
>>> Hi,
>>>
>>> On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
>>>> We can use binary search to find the index to modify regions in
>>>> memblock_add_range() and memblock_isolate_range(). Because the
>>>> arrangement of regions is ordered. It may be faster when there are
>>>> many regions. So implemented a binary search and a new macro to walk
>>>> regions.
>>>
>>> Did you see a measurable speedup with this optimization?
>>> I'm not in favor of micro-optimizations that complicate code.
>>>
>> Thank you for your reply. I haven't measured this patch yet, theoretically
>> this small optimization might be difficult to observe.
>> If you think this patch complicates the code, you can ignore this patch.
>>
>> These three patches are independent and they can be applied independently.
>> The logic of the third patch is very simple. It will not complicate the
>> code. It is tested by the default configuration of qemu. The total number of
>> iterations of memblock_merge_regions() in the third patch is reduced from
>> more than one thousand to more than one hundred, this is only in the case of
>> a small number of regions. Can you consider the third patch?
> 
> Can you please send the numbers and show how did you obtained them?
> 
>> Sincerely yours,
>> Peng.
> 
I obtained the numbers like this:

void memblock_merge_regions(struct memblock_type *type) {
	static int iteration_count = 0;
	static int max_nr_regions = 0;

	max_nr_regions = max(max_nr_regions, (int)type->cnt);
	...
	while () {
		iteration_count++;
		...
	}
	pr_info("iteration_count: %d max_nr_regions %d", iteration_count, 
max_nr_regions);
}

Boot the kernel by qemu.
The following numbers are from the last output.

master branch:
iteration_count: 1762 max_nr_regions 41

patched:
iteration_count: 182 max_nr_regions 41

If max_nr_regions is larger, the difference will be greater.

Thanks.

Sincerely yours,
Peng
  
Mike Rapoport Jan. 19, 2023, 1 p.m. UTC | #5
On Mon, Jan 16, 2023 at 04:40:47PM +0800, Peng Zhang wrote:
> 
> 
> 在 2023/1/16 15:37, Mike Rapoport 写道:
> > On Mon, Jan 16, 2023 at 11:17:40AM +0800, Peng Zhang wrote:
> > > 
> > > 
> > > 在 2023/1/15 22:02, Mike Rapoport 写道:
> > > > Hi,
> > > > 
> > > > On Fri, Jan 13, 2023 at 04:26:58PM +0800, Peng Zhang wrote:
> > > > > We can use binary search to find the index to modify regions in
> > > > > memblock_add_range() and memblock_isolate_range(). Because the
> > > > > arrangement of regions is ordered. It may be faster when there are
> > > > > many regions. So implemented a binary search and a new macro to walk
> > > > > regions.
> > > > 
> > > > Did you see a measurable speedup with this optimization?
> > > > I'm not in favor of micro-optimizations that complicate code.
> > > > 
> > > Thank you for your reply. I haven't measured this patch yet, theoretically
> > > this small optimization might be difficult to observe.
> > > If you think this patch complicates the code, you can ignore this patch.
> > > 
> > > These three patches are independent and they can be applied independently.
> > > The logic of the third patch is very simple. It will not complicate the
> > > code. It is tested by the default configuration of qemu. The total number of
> > > iterations of memblock_merge_regions() in the third patch is reduced from
> > > more than one thousand to more than one hundred, this is only in the case of
> > > a small number of regions. Can you consider the third patch?
> > 
> > Can you please send the numbers and show how did you obtained them?
> > 
> > > Sincerely yours,
> > > Peng.
> > 
> I obtained the numbers like this:
> 
> void memblock_merge_regions(struct memblock_type *type) {
> 	static int iteration_count = 0;
> 	static int max_nr_regions = 0;
> 
> 	max_nr_regions = max(max_nr_regions, (int)type->cnt);
> 	...
> 	while () {
> 		iteration_count++;
> 		...
> 	}
> 	pr_info("iteration_count: %d max_nr_regions %d", iteration_count,
> max_nr_regions);
> }
> 
> Boot the kernel by qemu.
> The folowing numbers is the last output.
> 
> master branch:
> iteration_count: 1762 max_nr_regions 41
> 
> patched:
> iteration_count: 182 max_nr_regions 41

The numbers that indicate what speed up your patch achieves should be a
part of the changelog. It would be great if you could test it on a real
machine and measure the actual time saved by your changes.

> If max_nr_regions is larger, the difference will be more.
> 
> Thanks.
> 
> Sincerely yours,
> Peng
  

Patch

diff --git a/mm/memblock.c b/mm/memblock.c
index 6eedcfc5dcc1..cb92770ac22e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -149,6 +149,11 @@  static __refdata struct memblock_type *memblock_memory = &memblock.memory;
 	     i < memblock_type->cnt;					\
 	     i++, rgn = &memblock_type->regions[i])
 
+#define for_each_memblock_type_start(i, start, memblock_type, rgn)	\
+	for (i = start, rgn = &memblock_type->regions[i];		\
+	     i < memblock_type->cnt;					\
+	     i++, rgn = &memblock_type->regions[i])
+
 #define memblock_dbg(fmt, ...)						\
 	do {								\
 		if (memblock_debug)					\
@@ -181,6 +186,24 @@  static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
+/*
+ * Binary search for the first region not to the left of @base.
+ */
+static unsigned long __init_memblock memblock_lower_bound_region(struct memblock_type *type,
+								 phys_addr_t base)
+{
+	unsigned long idx, low_idx = 0, high_idx = type->cnt;
+
+	while (low_idx < high_idx) {
+		idx = (low_idx + high_idx) >> 1;
+		if (type->regions[idx].base + type->regions[idx].size <= base)
+			low_idx = idx + 1;
+		else
+			high_idx = idx;
+	}
+	return low_idx;
+}
+
 bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
 					phys_addr_t base, phys_addr_t size)
 {
@@ -581,7 +604,7 @@  static int __init_memblock memblock_add_range(struct memblock_type *type,
 	bool insert = false;
 	phys_addr_t obase = base;
 	phys_addr_t end = base + memblock_cap_size(base, &size);
-	int idx, nr_new;
+	int idx, start_idx, nr_new;
 	struct memblock_region *rgn;
 
 	if (!size)
@@ -607,6 +630,7 @@  static int __init_memblock memblock_add_range(struct memblock_type *type,
 	 */
 	if (type->cnt * 2 + 1 <= type->max)
 		insert = true;
+	start_idx = memblock_lower_bound_region(type, base);
 
 repeat:
 	/*
@@ -617,7 +641,7 @@  static int __init_memblock memblock_add_range(struct memblock_type *type,
 	base = obase;
 	nr_new = 0;
 
-	for_each_memblock_type(idx, type, rgn) {
+	for_each_memblock_type_start(idx, start_idx, type, rgn) {
 		phys_addr_t rbase = rgn->base;
 		phys_addr_t rend = rbase + rgn->size;
 
@@ -737,7 +761,7 @@  static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 					int *start_rgn, int *end_rgn)
 {
 	phys_addr_t end = base + memblock_cap_size(base, &size);
-	int idx;
+	int idx, start_idx;
 	struct memblock_region *rgn;
 
 	*start_rgn = *end_rgn = 0;
@@ -750,7 +774,8 @@  static int __init_memblock memblock_isolate_range(struct memblock_type *type,
 		if (memblock_double_array(type, base, size) < 0)
 			return -ENOMEM;
 
-	for_each_memblock_type(idx, type, rgn) {
+	start_idx = memblock_lower_bound_region(type, base);
+	for_each_memblock_type_start(idx, start_idx, type, rgn) {
 		phys_addr_t rbase = rgn->base;
 		phys_addr_t rend = rbase + rgn->size;