mm: compaction: skip memory hole rapidly when isolating migratable pages

Message ID 8cc668b77c8eb2fa78058b3d81386ebed9c5a9cd.1686294549.git.baolin.wang@linux.alibaba.com
State New
Headers
Series mm: compaction: skip memory hole rapidly when isolating migratable pages |

Commit Message

Baolin Wang June 9, 2023, 9:45 a.m. UTC
  On some machines, the normal zone can have a large memory hole like
below memory layout, and we can see the range from 0x100000000 to
0x1800000000 is a hole. So when isolating some migratable pages, the
scanner can meet the hole and it will take more time to skip the large
hole. From my measurement, I can see the isolation scanner will take
80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].

So adding a new helper to fast search next online memory section
to skip the large hole can help to find next suitable pageblock
efficiently. With this patch, I can see the large hole scanning only
takes < 1us.

[    0.000000] Zone ranges:
[    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
[    0.000000]   DMA32    empty
[    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
[    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
[    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
[    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
[    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
[    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
[    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
[    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
[    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
[    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/mmzone.h | 10 ++++++++++
 mm/compaction.c        | 23 ++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)
  

Comments

kernel test robot June 9, 2023, 2:48 p.m. UTC | #1
Hi Baolin,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]

url:    https://github.com/intel-lab-lkp/linux/commits/Baolin-Wang/mm-compaction-skip-memory-hole-rapidly-when-isolating-migratable-pages/20230609-174659
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/8cc668b77c8eb2fa78058b3d81386ebed9c5a9cd.1686294549.git.baolin.wang%40linux.alibaba.com
patch subject: [PATCH] mm: compaction: skip memory hole rapidly when isolating migratable pages
config: arm-randconfig-r025-20230609 (https://download.01.org/0day-ci/archive/20230609/202306092250.cp65gzkn-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install arm cross compiling tool for clang build
        # apt-get install binutils-arm-linux-gnueabi
        git remote add akpm-mm https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git
        git fetch akpm-mm mm-everything
        git checkout akpm-mm/mm-everything
        b4 shazam https://lore.kernel.org/r/8cc668b77c8eb2fa78058b3d81386ebed9c5a9cd.1686294549.git.baolin.wang@linux.alibaba.com
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=arm olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306092250.cp65gzkn-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/compaction.c:235:27: error: call to undeclared function 'pfn_to_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
           unsigned long start_nr = pfn_to_section_nr(start_pfn);
                                    ^
>> mm/compaction.c:237:6: error: call to undeclared function 'online_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
           if (online_section_nr(start_nr))
               ^
>> mm/compaction.c:240:19: error: call to undeclared function 'next_online_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
           next_online_nr = next_online_section_nr(start_nr);
                            ^
>> mm/compaction.c:242:10: error: call to undeclared function 'section_nr_to_pfn'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
                   return section_nr_to_pfn(next_online_nr);
                          ^
   4 errors generated.


vim +/pfn_to_section_nr +235 mm/compaction.c

   231	
   232	static unsigned long skip_hole_pageblock(unsigned long start_pfn)
   233	{
   234		unsigned long next_online_nr;
 > 235		unsigned long start_nr = pfn_to_section_nr(start_pfn);
   236	
 > 237		if (online_section_nr(start_nr))
   238			return -1UL;
   239	
 > 240		next_online_nr = next_online_section_nr(start_nr);
   241		if (next_online_nr != -1UL)
 > 242			return section_nr_to_pfn(next_online_nr);
   243	
   244		return -1UL;
   245	}
   246
  
Baolin Wang June 11, 2023, 1:38 a.m. UTC | #2
On 6/9/2023 10:48 PM, kernel test robot wrote:
> Hi Baolin,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on akpm-mm/mm-everything]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Baolin-Wang/mm-compaction-skip-memory-hole-rapidly-when-isolating-migratable-pages/20230609-174659
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> patch link:    https://lore.kernel.org/r/8cc668b77c8eb2fa78058b3d81386ebed9c5a9cd.1686294549.git.baolin.wang%40linux.alibaba.com
> patch subject: [PATCH] mm: compaction: skip memory hole rapidly when isolating migratable pages
> config: arm-randconfig-r025-20230609 (https://download.01.org/0day-ci/archive/20230609/202306092250.cp65gzkn-lkp@intel.com/config)
> compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
> reproduce (this is a W=1 build):
>          mkdir -p ~/bin
>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>          chmod +x ~/bin/make.cross
>          # install arm cross compiling tool for clang build
>          # apt-get install binutils-arm-linux-gnueabi
>          git remote add akpm-mm https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git
>          git fetch akpm-mm mm-everything
>          git checkout akpm-mm/mm-everything
>          b4 shazam https://lore.kernel.org/r/8cc668b77c8eb2fa78058b3d81386ebed9c5a9cd.1686294549.git.baolin.wang@linux.alibaba.com
>          # save the config file
>          mkdir build_dir && cp config build_dir/.config
>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=arm olddefconfig
>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=arm SHELL=/bin/bash
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202306092250.cp65gzkn-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>>> mm/compaction.c:235:27: error: call to undeclared function 'pfn_to_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>             unsigned long start_nr = pfn_to_section_nr(start_pfn);
>                                      ^
>>> mm/compaction.c:237:6: error: call to undeclared function 'online_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>             if (online_section_nr(start_nr))
>                 ^
>>> mm/compaction.c:240:19: error: call to undeclared function 'next_online_section_nr'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>             next_online_nr = next_online_section_nr(start_nr);
>                              ^
>>> mm/compaction.c:242:10: error: call to undeclared function 'section_nr_to_pfn'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
>                     return section_nr_to_pfn(next_online_nr);
>                            ^
>     4 errors generated.

Thanks for the report. I should provide a dummy function if CONFIG_SPARSEMEM 
is not selected.
  
Huang, Ying June 12, 2023, 6:39 a.m. UTC | #3
Baolin Wang <baolin.wang@linux.alibaba.com> writes:

> On some machines, the normal zone can have a large memory hole like
> below memory layout, and we can see the range from 0x100000000 to
> 0x1800000000 is a hole. So when isolating some migratable pages, the
> scanner can meet the hole and it will take more time to skip the large
> hole. From my measurement, I can see the isolation scanner will take
> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>
> So adding a new helper to fast search next online memory section
> to skip the large hole can help to find next suitable pageblock
> efficiently. With this patch, I can see the large hole scanning only
> takes < 1us.
>
> [    0.000000] Zone ranges:
> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
> [    0.000000]   DMA32    empty
> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  include/linux/mmzone.h | 10 ++++++++++
>  mm/compaction.c        | 23 ++++++++++++++++++++++-
>  2 files changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 5a7ada0413da..87e6c535d895 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -2000,6 +2000,16 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
>  	return -1;
>  }
>  
> +static inline unsigned long next_online_section_nr(unsigned long section_nr)
> +{
> +	while (++section_nr <= __highest_present_section_nr) {
> +		if (online_section_nr(section_nr))
> +			return section_nr;
> +	}
> +
> +	return -1UL;
> +}
> +
>  /*
>   * These are _only_ used during initialisation, therefore they
>   * can use __initdata ...  They could have names to indicate
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 3398ef3a55fe..3a55fdd20c49 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone *zone)
>  				pageblock_start_pfn(zone_end_pfn(zone) - 1);
>  }
>  
> +static unsigned long skip_hole_pageblock(unsigned long start_pfn)
> +{
> +	unsigned long next_online_nr;
> +	unsigned long start_nr = pfn_to_section_nr(start_pfn);
> +
> +	if (online_section_nr(start_nr))
> +		return -1UL;

Define a macro for the magic "-1UL"?  Which is used multiple times
in the patch.

> +
> +	next_online_nr = next_online_section_nr(start_nr);
> +	if (next_online_nr != -1UL)
> +		return section_nr_to_pfn(next_online_nr);
> +
> +	return -1UL;
> +}
> +
>  /*
>   * Compound pages of >= pageblock_order should consistently be skipped until
>   * released. It is always pointless to compact pages of such order (if they are
> @@ -1991,8 +2006,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
>  
>  		page = pageblock_pfn_to_page(block_start_pfn,
>  						block_end_pfn, cc->zone);
> -		if (!page)
> +		if (!page) {
> +			unsigned long next_pfn;
> +
> +			next_pfn = skip_hole_pageblock(block_start_pfn);
> +			if (next_pfn != -1UL)
> +				block_end_pfn = next_pfn;
>  			continue;
> +		}
>  
>  		/*
>  		 * If isolation recently failed, do not retry. Only check the

Do we need to do similar change in isolate_freepages()?

Best Regards,
Huang, Ying
  
Baolin Wang June 12, 2023, 9:36 a.m. UTC | #4
On 6/12/2023 2:39 PM, Huang, Ying wrote:
> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
> 
>> On some machines, the normal zone can have a large memory hole like
>> below memory layout, and we can see the range from 0x100000000 to
>> 0x1800000000 is a hole. So when isolating some migratable pages, the
>> scanner can meet the hole and it will take more time to skip the large
>> hole. From my measurement, I can see the isolation scanner will take
>> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>>
>> So adding a new helper to fast search next online memory section
>> to skip the large hole can help to find next suitable pageblock
>> efficiently. With this patch, I can see the large hole scanning only
>> takes < 1us.
>>
>> [    0.000000] Zone ranges:
>> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
>> [    0.000000]   DMA32    empty
>> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
>> [    0.000000] Movable zone start for each node
>> [    0.000000] Early memory node ranges
>> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
>> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
>> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
>> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
>> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
>> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
>> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
>> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
>> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
>> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   include/linux/mmzone.h | 10 ++++++++++
>>   mm/compaction.c        | 23 ++++++++++++++++++++++-
>>   2 files changed, 32 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 5a7ada0413da..87e6c535d895 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -2000,6 +2000,16 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
>>   	return -1;
>>   }
>>   
>> +static inline unsigned long next_online_section_nr(unsigned long section_nr)
>> +{
>> +	while (++section_nr <= __highest_present_section_nr) {
>> +		if (online_section_nr(section_nr))
>> +			return section_nr;
>> +	}
>> +
>> +	return -1UL;
>> +}
>> +
>>   /*
>>    * These are _only_ used during initialisation, therefore they
>>    * can use __initdata ...  They could have names to indicate
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index 3398ef3a55fe..3a55fdd20c49 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone *zone)
>>   				pageblock_start_pfn(zone_end_pfn(zone) - 1);
>>   }
>>   
>> +static unsigned long skip_hole_pageblock(unsigned long start_pfn)
>> +{
>> +	unsigned long next_online_nr;
>> +	unsigned long start_nr = pfn_to_section_nr(start_pfn);
>> +
>> +	if (online_section_nr(start_nr))
>> +		return -1UL;
> 
> Define a macro for the maigic "-1UL"?  Which is used for multiple times
> in the patch.

I am struggling to find a readable macro for these '-1UL', since the 
'-1UL' in next_online_section_nr() indicates that it cannot find an 
online section. However, the '-1UL' in skip_hole_pageblock() indicates that 
it cannot find an online pfn.

So after more thinking, I will change next_online_section_nr() to return 
'NR_MEM_SECTIONS' if it cannot find the next online section. And in 
skip_hole_pageblock(), I will change to return 0 if it cannot find the next 
online pfn. What do you think?

static unsigned long skip_hole_pageblock(unsigned long start_pfn)
{
         unsigned long next_online_nr;
         unsigned long start_nr = pfn_to_section_nr(start_pfn);

         if (online_section_nr(start_nr))
                 return 0;

         next_online_nr = next_online_section_nr(start_nr);
         if (next_online_nr < NR_MEM_SECTIONS)
                 return section_nr_to_pfn(next_online_nr);

         return 0;
}

>> +
>> +	next_online_nr = next_online_section_nr(start_nr);
>> +	if (next_online_nr != -1UL)
>> +		return section_nr_to_pfn(next_online_nr);
>> +
>> +	return -1UL;
>> +}
>> +
>>   /*
>>    * Compound pages of >= pageblock_order should consistently be skipped until
>>    * released. It is always pointless to compact pages of such order (if they are
>> @@ -1991,8 +2006,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
>>   
>>   		page = pageblock_pfn_to_page(block_start_pfn,
>>   						block_end_pfn, cc->zone);
>> -		if (!page)
>> +		if (!page) {
>> +			unsigned long next_pfn;
>> +
>> +			next_pfn = skip_hole_pageblock(block_start_pfn);
>> +			if (next_pfn != -1UL)
>> +				block_end_pfn = next_pfn;
>>   			continue;
>> +		}
>>   
>>   		/*
>>   		 * If isolation recently failed, do not retry. Only check the
> 
> Do we need to do similar change in isolate_freepages()?

Yes, it's in my todo list with some measurement data.

Thanks for your comments.
  
David Hildenbrand June 12, 2023, 9:54 a.m. UTC | #5
On 12.06.23 11:36, Baolin Wang wrote:
> 
> 
> On 6/12/2023 2:39 PM, Huang, Ying wrote:
>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>
>>> On some machines, the normal zone can have a large memory hole like
>>> below memory layout, and we can see the range from 0x100000000 to
>>> 0x1800000000 is a hole. So when isolating some migratable pages, the
>>> scanner can meet the hole and it will take more time to skip the large
>>> hole. From my measurement, I can see the isolation scanner will take
>>> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>>>
>>> So adding a new helper to fast search next online memory section
>>> to skip the large hole can help to find next suitable pageblock
>>> efficiently. With this patch, I can see the large hole scanning only
>>> takes < 1us.
>>>
>>> [    0.000000] Zone ranges:
>>> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
>>> [    0.000000]   DMA32    empty
>>> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
>>> [    0.000000] Movable zone start for each node
>>> [    0.000000] Early memory node ranges
>>> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
>>> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
>>> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>    include/linux/mmzone.h | 10 ++++++++++
>>>    mm/compaction.c        | 23 ++++++++++++++++++++++-
>>>    2 files changed, 32 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index 5a7ada0413da..87e6c535d895 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -2000,6 +2000,16 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
>>>    	return -1;
>>>    }
>>>    
>>> +static inline unsigned long next_online_section_nr(unsigned long section_nr)
>>> +{
>>> +	while (++section_nr <= __highest_present_section_nr) {
>>> +		if (online_section_nr(section_nr))
>>> +			return section_nr;
>>> +	}
>>> +
>>> +	return -1UL;
>>> +}
>>> +
>>>    /*
>>>     * These are _only_ used during initialisation, therefore they
>>>     * can use __initdata ...  They could have names to indicate
>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>> index 3398ef3a55fe..3a55fdd20c49 100644
>>> --- a/mm/compaction.c
>>> +++ b/mm/compaction.c
>>> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone *zone)
>>>    				pageblock_start_pfn(zone_end_pfn(zone) - 1);
>>>    }
>>>    
>>> +static unsigned long skip_hole_pageblock(unsigned long start_pfn)
>>> +{
>>> +	unsigned long next_online_nr;
>>> +	unsigned long start_nr = pfn_to_section_nr(start_pfn);
>>> +
>>> +	if (online_section_nr(start_nr))
>>> +		return -1UL;
>>
>> Define a macro for the maigic "-1UL"?  Which is used for multiple times
>> in the patch.
> 
> I am struggling to find a readable macro for these '-1UL', since the
> '-1UL' in next_online_section_nr() indicates that it can not find an
> online section. However the '-1' in skip_hole_pageblock() indicates that
> it can not find an online pfn.

Maybe something like

#define SECTION_NR_INVALID -1UL

> 
> So after more thinking, I will change to return 'NR_MEM_SECTIONS' if can
> not find next online section in next_online_section_nr(). And in
> skip_hole_pageblock(), I will change to return 0 if can not find next
> online pfn. What do you think?

Well, 0 "might be" (and most likely is) a valid section number, so you'd 
simulate some kind of wraparound. I guess I'd prefer 
SECTION_NR_INVALID instead.
  
Baolin Wang June 12, 2023, 10:10 a.m. UTC | #6
On 6/12/2023 5:54 PM, David Hildenbrand wrote:
> On 12.06.23 11:36, Baolin Wang wrote:
>>
>>
>> On 6/12/2023 2:39 PM, Huang, Ying wrote:
>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>
>>>> On some machines, the normal zone can have a large memory hole like
>>>> below memory layout, and we can see the range from 0x100000000 to
>>>> 0x1800000000 is a hole. So when isolating some migratable pages, the
>>>> scanner can meet the hole and it will take more time to skip the large
>>>> hole. From my measurement, I can see the isolation scanner will take
>>>> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>>>>
>>>> So adding a new helper to fast search next online memory section
>>>> to skip the large hole can help to find next suitable pageblock
>>>> efficiently. With this patch, I can see the large hole scanning only
>>>> takes < 1us.
>>>>
>>>> [    0.000000] Zone ranges:
>>>> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
>>>> [    0.000000]   DMA32    empty
>>>> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
>>>> [    0.000000] Movable zone start for each node
>>>> [    0.000000] Early memory node ranges
>>>> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
>>>> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
>>>> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>>>>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---
>>>>    include/linux/mmzone.h | 10 ++++++++++
>>>>    mm/compaction.c        | 23 ++++++++++++++++++++++-
>>>>    2 files changed, 32 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>> index 5a7ada0413da..87e6c535d895 100644
>>>> --- a/include/linux/mmzone.h
>>>> +++ b/include/linux/mmzone.h
>>>> @@ -2000,6 +2000,16 @@ static inline unsigned long 
>>>> next_present_section_nr(unsigned long section_nr)
>>>>        return -1;
>>>>    }
>>>> +static inline unsigned long next_online_section_nr(unsigned long 
>>>> section_nr)
>>>> +{
>>>> +    while (++section_nr <= __highest_present_section_nr) {
>>>> +        if (online_section_nr(section_nr))
>>>> +            return section_nr;
>>>> +    }
>>>> +
>>>> +    return -1UL;
>>>> +}
>>>> +
>>>>    /*
>>>>     * These are _only_ used during initialisation, therefore they
>>>>     * can use __initdata ...  They could have names to indicate
>>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>>> index 3398ef3a55fe..3a55fdd20c49 100644
>>>> --- a/mm/compaction.c
>>>> +++ b/mm/compaction.c
>>>> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone 
>>>> *zone)
>>>>                    pageblock_start_pfn(zone_end_pfn(zone) - 1);
>>>>    }
>>>> +static unsigned long skip_hole_pageblock(unsigned long start_pfn)
>>>> +{
>>>> +    unsigned long next_online_nr;
>>>> +    unsigned long start_nr = pfn_to_section_nr(start_pfn);
>>>> +
>>>> +    if (online_section_nr(start_nr))
>>>> +        return -1UL;
>>>
>>> Define a macro for the maigic "-1UL"?  Which is used for multiple times
>>> in the patch.
>>
>> I am struggling to find a readable macro for these '-1UL', since the
>> '-1UL' in next_online_section_nr() indicates that it can not find an
>> online section. However the '-1' in skip_hole_pageblock() indicates that
>> it can not find an online pfn.
> 
> Maybe something like
> 
> #define SECTION_NR_INVALID -1UL

Actually, we already have an NR_MEM_SECTIONS macro, which means a 
section is invalid if its section nr >= NR_MEM_SECTIONS. So using 
NR_MEM_SECTIONS seems more suitable?

>> So after more thinking, I will change to return 'NR_MEM_SECTIONS' if can
>> not find next online section in next_online_section_nr(). And in
>> skip_hole_pageblock(), I will change to return 0 if can not find next
>> online pfn. What do you think?
> 
> Well, 0 "might be" (and most likely is) a valid section number, so you'd 
> simulate some kind-of a wraparound. I guess I'd prefer 
> SECTION_NR_INVALID instead.

0 means it cannot find the next online pfn — it is not a section number — in 
skip_hole_pageblock().
  
David Hildenbrand June 12, 2023, 10:11 a.m. UTC | #7
On 12.06.23 12:10, Baolin Wang wrote:
> 
> 
> On 6/12/2023 5:54 PM, David Hildenbrand wrote:
>> On 12.06.23 11:36, Baolin Wang wrote:
>>>
>>>
>>> On 6/12/2023 2:39 PM, Huang, Ying wrote:
>>>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>>>>
>>>>> On some machines, the normal zone can have a large memory hole like
>>>>> below memory layout, and we can see the range from 0x100000000 to
>>>>> 0x1800000000 is a hole. So when isolating some migratable pages, the
>>>>> scanner can meet the hole and it will take more time to skip the large
>>>>> hole. From my measurement, I can see the isolation scanner will take
>>>>> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>>>>>
>>>>> So adding a new helper to fast search next online memory section
>>>>> to skip the large hole can help to find next suitable pageblock
>>>>> efficiently. With this patch, I can see the large hole scanning only
>>>>> takes < 1us.
>>>>>
>>>>> [    0.000000] Zone ranges:
>>>>> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
>>>>> [    0.000000]   DMA32    empty
>>>>> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
>>>>> [    0.000000] Movable zone start for each node
>>>>> [    0.000000] Early memory node ranges
>>>>> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
>>>>> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>>>>>
>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> ---
>>>>>     include/linux/mmzone.h | 10 ++++++++++
>>>>>     mm/compaction.c        | 23 ++++++++++++++++++++++-
>>>>>     2 files changed, 32 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>> index 5a7ada0413da..87e6c535d895 100644
>>>>> --- a/include/linux/mmzone.h
>>>>> +++ b/include/linux/mmzone.h
>>>>> @@ -2000,6 +2000,16 @@ static inline unsigned long
>>>>> next_present_section_nr(unsigned long section_nr)
>>>>>         return -1;
>>>>>     }
>>>>> +static inline unsigned long next_online_section_nr(unsigned long
>>>>> section_nr)
>>>>> +{
>>>>> +    while (++section_nr <= __highest_present_section_nr) {
>>>>> +        if (online_section_nr(section_nr))
>>>>> +            return section_nr;
>>>>> +    }
>>>>> +
>>>>> +    return -1UL;
>>>>> +}
>>>>> +
>>>>>     /*
>>>>>      * These are _only_ used during initialisation, therefore they
>>>>>      * can use __initdata ...  They could have names to indicate
>>>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>>>> index 3398ef3a55fe..3a55fdd20c49 100644
>>>>> --- a/mm/compaction.c
>>>>> +++ b/mm/compaction.c
>>>>> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone
>>>>> *zone)
>>>>>                     pageblock_start_pfn(zone_end_pfn(zone) - 1);
>>>>>     }
>>>>> +static unsigned long skip_hole_pageblock(unsigned long start_pfn)
>>>>> +{
>>>>> +    unsigned long next_online_nr;
>>>>> +    unsigned long start_nr = pfn_to_section_nr(start_pfn);
>>>>> +
>>>>> +    if (online_section_nr(start_nr))
>>>>> +        return -1UL;
>>>>
>>>> Define a macro for the maigic "-1UL"?  Which is used for multiple times
>>>> in the patch.
>>>
>>> I am struggling to find a readable macro for these '-1UL', since the
>>> '-1UL' in next_online_section_nr() indicates that it can not find an
>>> online section. However the '-1' in skip_hole_pageblock() indicates that
>>> it can not find an online pfn.
>>
>> Maybe something like
>>
>> #define SECTION_NR_INVALID -1UL
> 
> Actually we already have a NR_MEM_SECTIONS macro, which means it is an
> invalid section if the section nr >= NR_MEM_SECTIONS. So using
> NR_MEM_SECTIONS seems more suitable?

Indeed, that would also work!
  
Huang, Ying June 13, 2023, 1:08 a.m. UTC | #8
Baolin Wang <baolin.wang@linux.alibaba.com> writes:

> On 6/12/2023 2:39 PM, Huang, Ying wrote:
>> Baolin Wang <baolin.wang@linux.alibaba.com> writes:
>> 
>>> On some machines, the normal zone can have a large memory hole like
>>> below memory layout, and we can see the range from 0x100000000 to
>>> 0x1800000000 is a hole. So when isolating some migratable pages, the
>>> scanner can meet the hole and it will take more time to skip the large
>>> hole. From my measurement, I can see the isolation scanner will take
>>> 80us ~ 100us to skip the large hole [0x100000000 - 0x1800000000].
>>>
>>> So adding a new helper to fast search next online memory section
>>> to skip the large hole can help to find next suitable pageblock
>>> efficiently. With this patch, I can see the large hole scanning only
>>> takes < 1us.
>>>
>>> [    0.000000] Zone ranges:
>>> [    0.000000]   DMA      [mem 0x0000000040000000-0x00000000ffffffff]
>>> [    0.000000]   DMA32    empty
>>> [    0.000000]   Normal   [mem 0x0000000100000000-0x0000001fa7ffffff]
>>> [    0.000000] Movable zone start for each node
>>> [    0.000000] Early memory node ranges
>>> [    0.000000]   node   0: [mem 0x0000000040000000-0x0000000fffffffff]
>>> [    0.000000]   node   0: [mem 0x0000001800000000-0x0000001fa3c7ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa4030000-0x0000001fa40effff]
>>> [    0.000000]   node   0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
>>> [    0.000000]   node   0: [mem 0x0000001fa7590000-0x0000001fa7ffffff]
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>   include/linux/mmzone.h | 10 ++++++++++
>>>   mm/compaction.c        | 23 ++++++++++++++++++++++-
>>>   2 files changed, 32 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index 5a7ada0413da..87e6c535d895 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -2000,6 +2000,16 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
>>>   	return -1;
>>>   }
>>>   +static inline unsigned long next_online_section_nr(unsigned long
>>> section_nr)
>>> +{
>>> +	while (++section_nr <= __highest_present_section_nr) {
>>> +		if (online_section_nr(section_nr))
>>> +			return section_nr;
>>> +	}
>>> +
>>> +	return -1UL;
>>> +}
>>> +
>>>   /*
>>>    * These are _only_ used during initialisation, therefore they
>>>    * can use __initdata ...  They could have names to indicate
>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>> index 3398ef3a55fe..3a55fdd20c49 100644
>>> --- a/mm/compaction.c
>>> +++ b/mm/compaction.c
>>> @@ -229,6 +229,21 @@ static void reset_cached_positions(struct zone *zone)
>>>   				pageblock_start_pfn(zone_end_pfn(zone) - 1);
>>>   }
>>>   +static unsigned long skip_hole_pageblock(unsigned long
>>> start_pfn)
>>> +{
>>> +	unsigned long next_online_nr;
>>> +	unsigned long start_nr = pfn_to_section_nr(start_pfn);
>>> +
>>> +	if (online_section_nr(start_nr))
>>> +		return -1UL;
>> Define a macro for the magic "-1UL"?  It is used multiple
>> times
>> in the patch.
>
> I am struggling to find a readable macro for these '-1UL', since the
> '-1UL' in next_online_section_nr() indicates that it cannot find an 
> online section. However, the '-1' in skip_hole_pageblock() indicates
> that it cannot find an online pfn.
>
> So after more thinking, I will change to return 'NR_MEM_SECTIONS' if it
> cannot find the next online section in next_online_section_nr(). And in 
> skip_hole_pageblock(), I will change to return 0 if it cannot find the
> next online pfn. What do you think?
>
> static unsigned long skip_hole_pageblock(unsigned long start_pfn)
> {
>         unsigned long next_online_nr;
>         unsigned long start_nr = pfn_to_section_nr(start_pfn);
>
>         if (online_section_nr(start_nr))
>                 return 0;
>
>         next_online_nr = next_online_section_nr(start_nr);
>         if (next_online_nr < NR_MEM_SECTIONS)
>                 return section_nr_to_pfn(next_online_nr);
>
>         return 0;
> }

Sounds good to me.

Best Regards,
Huang, Ying

>>> +
>>> +	next_online_nr = next_online_section_nr(start_nr);
>>> +	if (next_online_nr != -1UL)
>>> +		return section_nr_to_pfn(next_online_nr);
>>> +
>>> +	return -1UL;
>>> +}
>>> +
>>>   /*
>>>    * Compound pages of >= pageblock_order should consistently be skipped until
>>>    * released. It is always pointless to compact pages of such order (if they are
>>> @@ -1991,8 +2006,14 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
>>>     		page = pageblock_pfn_to_page(block_start_pfn,
>>>   						block_end_pfn, cc->zone);
>>> -		if (!page)
>>> +		if (!page) {
>>> +			unsigned long next_pfn;
>>> +
>>> +			next_pfn = skip_hole_pageblock(block_start_pfn);
>>> +			if (next_pfn != -1UL)
>>> +				block_end_pfn = next_pfn;
>>>   			continue;
>>> +		}
>>>     		/*
>>>   		 * If isolation recently failed, do not retry. Only check the
>> Do we need to do similar change in isolate_freepages()?
>
> Yes, it's in my todo list with some measurement data.
>
> Thanks for your comments.
  

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5a7ada0413da..87e6c535d895 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -2000,6 +2000,16 @@  static inline unsigned long next_present_section_nr(unsigned long section_nr)
 	return -1;
 }
 
+static inline unsigned long next_online_section_nr(unsigned long section_nr)
+{
+	while (++section_nr <= __highest_present_section_nr) {
+		if (online_section_nr(section_nr))
+			return section_nr;
+	}
+
+	return -1UL;
+}
+
 /*
  * These are _only_ used during initialisation, therefore they
  * can use __initdata ...  They could have names to indicate
diff --git a/mm/compaction.c b/mm/compaction.c
index 3398ef3a55fe..3a55fdd20c49 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -229,6 +229,21 @@  static void reset_cached_positions(struct zone *zone)
 				pageblock_start_pfn(zone_end_pfn(zone) - 1);
 }
 
+static unsigned long skip_hole_pageblock(unsigned long start_pfn)
+{
+	unsigned long next_online_nr;
+	unsigned long start_nr = pfn_to_section_nr(start_pfn);
+
+	if (online_section_nr(start_nr))
+		return -1UL;
+
+	next_online_nr = next_online_section_nr(start_nr);
+	if (next_online_nr != -1UL)
+		return section_nr_to_pfn(next_online_nr);
+
+	return -1UL;
+}
+
 /*
  * Compound pages of >= pageblock_order should consistently be skipped until
  * released. It is always pointless to compact pages of such order (if they are
@@ -1991,8 +2006,14 @@  static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 
 		page = pageblock_pfn_to_page(block_start_pfn,
 						block_end_pfn, cc->zone);
-		if (!page)
+		if (!page) {
+			unsigned long next_pfn;
+
+			next_pfn = skip_hole_pageblock(block_start_pfn);
+			if (next_pfn != -1UL)
+				block_end_pfn = next_pfn;
 			continue;
+		}
 
 		/*
 		 * If isolation recently failed, do not retry. Only check the