mm: ALLOC_HIGHATOMIC flag allocation issue

Message ID 20231108065408.1861-1-justinjiang@vivo.com
State New
Series mm: ALLOC_HIGHATOMIC flag allocation issue |

Commit Message

Zhiguo Jiang Nov. 8, 2023, 6:54 a.m. UTC
When alloc_flags contains ALLOC_HIGHATOMIC and the allocation order
is 1, 2, 3 or 10 in rmqueue(), a successful allocation from the
pcplist also moves a free pageblock from the allocated migratetype's
freelist to the MIGRATE_HIGHATOMIC freelist, instead of the request
being served from the MIGRATE_HIGHATOMIC freelist first. As a result
the MIGRATE_HIGHATOMIC freelist keeps growing, the other migratetype
freelists shrink, and allocations from them become more likely to
fail.

Currently the ALLOC_HIGHATOMIC allocation sequence is:
pcplist --> rmqueue_bulk() --> rmqueue_buddy() MIGRATE_HIGHATOMIC
--> rmqueue_buddy() allocation migratetype.

Since requesting pages from the pcplist is faster than from the
buddy allocator, this patch changes the ALLOC_HIGHATOMIC allocation
sequence to:
pcplist --> rmqueue_buddy() MIGRATE_HIGHATOMIC --> rmqueue_buddy()
allocation migratetype.

This solves the problem of allocations of other migratetypes failing
because of excessive MIGRATE_HIGHATOMIC freelist reservations.
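
Condensed, the patched fast path looks like this (a simplified view
of the diff below; locking and statistics are omitted):

  /* __rmqueue_pcplist(): when the pcplist is empty, do not refill it
   * for a highatomic request; returning NULL makes rmqueue() fall
   * back to rmqueue_buddy(), which tries the MIGRATE_HIGHATOMIC
   * reserve before the requested migratetype.
   */
  if (alloc_flags & ALLOC_HIGHATOMIC)
          goto out;               /* skip the rmqueue_bulk() refill */

  /* rmqueue(): record that the page was served from the pcplist */
  if (likely(page)) {
          *alloc_flags |= ALLOC_PCPLIST;
          goto out;
  }

  /* get_page_from_freelist(): only reserve a highatomic pageblock
   * when the page did not come from the pcplist.
   */
  if (unlikely(alloc_flags & ALLOC_HIGHATOMIC) &&
      unlikely(!(alloc_flags & ALLOC_PCPLIST)))
          reserve_highatomic_pageblock(page, zone);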

In comparative testing, cat /proc/pagetypeinfo shows the HighAtomic
freelist sizes (free block counts at orders 0..10):
Test without this patch:
Node 0, zone Normal, type HighAtomic 2369 771 138 15 0 0 0 0 0 0 0
Test with this patch:
Node 0, zone Normal, type HighAtomic 206 82 4 2 1 0 0 0 0 0 0 

Signed-off-by: Zhiguo Jiang <justinjiang@vivo.com>
---
 mm/internal.h   |  1 +
 mm/page_alloc.c | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)
 mode change 100644 => 100755 mm/internal.h
 mode change 100644 => 100755 mm/page_alloc.c
  

Comments

Andrew Morton Nov. 8, 2023, 9:57 p.m. UTC | #1
On Wed,  8 Nov 2023 14:54:07 +0800 Zhiguo Jiang <justinjiang@vivo.com> wrote:

> When alloc_flags contains ALLOC_HIGHATOMIC and the allocation order
> is 1, 2, 3 or 10 in rmqueue(), a successful allocation from the
> pcplist also moves a free pageblock from the allocated migratetype's
> freelist to the MIGRATE_HIGHATOMIC freelist, instead of the request
> being served from the MIGRATE_HIGHATOMIC freelist first. As a result
> the MIGRATE_HIGHATOMIC freelist keeps growing, the other migratetype
> freelists shrink, and allocations from them become more likely to
> fail.
> 
> Currently the ALLOC_HIGHATOMIC allocation sequence is:
> pcplist --> rmqueue_bulk() --> rmqueue_buddy() MIGRATE_HIGHATOMIC
> --> rmqueue_buddy() allocation migratetype.
> 
> Since requesting pages from the pcplist is faster than from the
> buddy allocator, this patch changes the ALLOC_HIGHATOMIC allocation
> sequence to:
> pcplist --> rmqueue_buddy() MIGRATE_HIGHATOMIC --> rmqueue_buddy()
> allocation migratetype.
> 
> This solves the problem of allocations of other migratetypes failing
> because of excessive MIGRATE_HIGHATOMIC freelist reservations.
> 
> In comparative testing, cat /proc/pagetypeinfo shows the HighAtomic
> freelist sizes (free block counts at orders 0..10):
> Test without this patch:
> Node 0, zone Normal, type HighAtomic 2369 771 138 15 0 0 0 0 0 0 0
> Test with this patch:
> Node 0, zone Normal, type HighAtomic 206 82 4 2 1 0 0 0 0 0 0 

Hopefully hannes can check this for us, but I have a stylistic concern...

> +#define ALLOC_PCPLIST		0x1000 /* Allocations from pcplist */
>  
>  /* Flags that allow allocations below the min watermark. */
>  #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b8544f08cc36..67cec88164b1
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2854,11 +2854,15 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
>  			int batch = nr_pcp_alloc(pcp, zone, order);
>  			int alloced;
>  
> +			if (alloc_flags & ALLOC_HIGHATOMIC)
> +				goto out;
> +

A comment here explaining why we're doing this would help readers.

>  			alloced = rmqueue_bulk(zone, order,
>  					batch, list,
>  					migratetype, alloc_flags);
>  
>  			pcp->count += alloced << order;
> +out:
>  			if (unlikely(list_empty(list)))
>  				return NULL;
>  		}
> @@ -2921,7 +2925,7 @@ __no_sanitize_memory
>  static inline
>  struct page *rmqueue(struct zone *preferred_zone,
>  			struct zone *zone, unsigned int order,
> -			gfp_t gfp_flags, unsigned int alloc_flags,
> +			gfp_t gfp_flags, unsigned int *alloc_flags,
>  			int migratetype)
>  {
>  	struct page *page;
> @@ -2934,17 +2938,19 @@ struct page *rmqueue(struct zone *preferred_zone,
>  
>  	if (likely(pcp_allowed_order(order))) {
>  		page = rmqueue_pcplist(preferred_zone, zone, order,
> -				       migratetype, alloc_flags);
> -		if (likely(page))
> +				       migratetype, *alloc_flags);
> +		if (likely(page)) {
> +			*alloc_flags |= ALLOC_PCPLIST;
>  			goto out;
> +		}
>  	}

So we're effectively returning a boolean to the caller via *alloc_flags.

This isn't a great way of doing it.  Wouldn't it be cleaner to pass a
new bool* argument to rmqueue() for this?  Make it explicit?

rmqueue() will be inlined into its sole caller, so this approach
shouldn't add overhead.
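
Something along these lines, perhaps (just a sketch; the
from_pcplist name is illustrative, not from your patch):

  static inline
  struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype, bool *from_pcplist)
  {
	...
	if (likely(pcp_allowed_order(order))) {
		page = rmqueue_pcplist(preferred_zone, zone, order,
				       migratetype, alloc_flags);
		if (likely(page)) {
			/* explicit result, no flag bit needed */
			*from_pcplist = true;
			goto out;
		}
	}
	...

and then get_page_from_freelist() tests the bool instead of
ALLOC_PCPLIST when deciding whether to call
reserve_highatomic_pageblock().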

> -	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
> +	page = rmqueue_buddy(preferred_zone, zone, order, *alloc_flags,
>  							migratetype);
> 
>  out:
>  	/* Separate test+clear to avoid unnecessary atomics */
> -	if ((alloc_flags & ALLOC_KSWAPD) &&
> +	if ((*alloc_flags & ALLOC_KSWAPD) &&
>  	    unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
>  		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
>  		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
> @@ -3343,7 +3349,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  
>  try_this_zone:
>  		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
> -				gfp_mask, alloc_flags, ac->migratetype);
> +				gfp_mask, &alloc_flags, ac->migratetype);
>  		if (page) {
>  			prep_new_page(page, order, gfp_mask, alloc_flags);
>  
> @@ -3351,7 +3357,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>  			 * If this is a high-order atomic allocation then check
>  			 * if the pageblock should be reserved for the future
>  			 */
> -			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
> +			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC) &&
> +				unlikely(!(alloc_flags & ALLOC_PCPLIST)))

Again, a comment explaining the reason for the test would be good.

>  				reserve_highatomic_pageblock(page, zone);
>  
>  			return page;
> -- 
> 2.39.0
>
  
Zhiguo Jiang Nov. 9, 2023, 8:30 a.m. UTC | #2
On 2023/11/9 5:57, Andrew Morton wrote:
> On Wed,  8 Nov 2023 14:54:07 +0800 Zhiguo Jiang <justinjiang@vivo.com> wrote:
Hi Andrew Morton:

In response to your three suggestions, I have made the corresponding
modifications in [PATCH v2 1/1]. Thanks for your review.
https://lore.kernel.org/all/20231109073133.792-2-justinjiang@vivo.com/
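
Roughly, the two added comments explain the points below (a
paraphrased sketch, not the literal v2 text; see the link for the
exact wording):

  /* before the early exit in __rmqueue_pcplist(): */
  /*
   * A highatomic request must not refill the pcplist from the normal
   * freelists; fall back to the buddy path so the request can be
   * served from the MIGRATE_HIGHATOMIC reserve first.
   */

  /* before the reservation check in get_page_from_freelist(): */
  /*
   * A page served from the pcplist did not consume the highatomic
   * reserve, so do not move yet another pageblock to
   * MIGRATE_HIGHATOMIC here; that is what inflated the reserve.
   */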

>> When alloc_flags contains ALLOC_HIGHATOMIC and the allocation order
>> is 1, 2, 3 or 10 in rmqueue(), a successful allocation from the
>> pcplist also moves a free pageblock from the allocated migratetype's
>> freelist to the MIGRATE_HIGHATOMIC freelist, instead of the request
>> being served from the MIGRATE_HIGHATOMIC freelist first. As a result
>> the MIGRATE_HIGHATOMIC freelist keeps growing, the other migratetype
>> freelists shrink, and allocations from them become more likely to
>> fail.
>>
>> Currently the ALLOC_HIGHATOMIC allocation sequence is:
>> pcplist --> rmqueue_bulk() --> rmqueue_buddy() MIGRATE_HIGHATOMIC
>> --> rmqueue_buddy() allocation migratetype.
>>
>> Since requesting pages from the pcplist is faster than from the
>> buddy allocator, this patch changes the ALLOC_HIGHATOMIC allocation
>> sequence to:
>> pcplist --> rmqueue_buddy() MIGRATE_HIGHATOMIC --> rmqueue_buddy()
>> allocation migratetype.
>>
>> This solves the problem of allocations of other migratetypes failing
>> because of excessive MIGRATE_HIGHATOMIC freelist reservations.
>>
>> In comparative testing, cat /proc/pagetypeinfo shows the HighAtomic
>> freelist sizes (free block counts at orders 0..10):
>> Test without this patch:
>> Node 0, zone Normal, type HighAtomic 2369 771 138 15 0 0 0 0 0 0 0
>> Test with this patch:
>> Node 0, zone Normal, type HighAtomic 206 82 4 2 1 0 0 0 0 0 0
> Hopefully hannes can check this for us, but I have a stylistic concern...
>
>> +#define ALLOC_PCPLIST		0x1000 /* Allocations from pcplist */
>>   
>>   /* Flags that allow allocations below the min watermark. */
>>   #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index b8544f08cc36..67cec88164b1
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -2854,11 +2854,15 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
>>   			int batch = nr_pcp_alloc(pcp, zone, order);
>>   			int alloced;
>>   
>> +			if (alloc_flags & ALLOC_HIGHATOMIC)
>> +				goto out;
>> +
> A comment here explaining why we're doing this would help readers.
Added in [PATCH v2 1/1].
>
>>   			alloced = rmqueue_bulk(zone, order,
>>   					batch, list,
>>   					migratetype, alloc_flags);
>>   
>>   			pcp->count += alloced << order;
>> +out:
>>   			if (unlikely(list_empty(list)))
>>   				return NULL;
>>   		}
>> @@ -2921,7 +2925,7 @@ __no_sanitize_memory
>>   static inline
>>   struct page *rmqueue(struct zone *preferred_zone,
>>   			struct zone *zone, unsigned int order,
>> -			gfp_t gfp_flags, unsigned int alloc_flags,
>> +			gfp_t gfp_flags, unsigned int *alloc_flags,
>>   			int migratetype)
>>   {
>>   	struct page *page;
>> @@ -2934,17 +2938,19 @@ struct page *rmqueue(struct zone *preferred_zone,
>>   
>>   	if (likely(pcp_allowed_order(order))) {
>>   		page = rmqueue_pcplist(preferred_zone, zone, order,
>> -				       migratetype, alloc_flags);
>> -		if (likely(page))
>> +				       migratetype, *alloc_flags);
>> +		if (likely(page)) {
>> +			*alloc_flags |= ALLOC_PCPLIST;
>>   			goto out;
>> +		}
>>   	}
> So we're effectively returning a boolean to the caller via *alloc_flags.
>
> This isn't a great way of doing it.  Wouldn't it be cleaner to pass a
> new bool* argument to rmqueue() for this?  Make it explicit?
>
> rmqueue() will be inlined into its sole caller, so this approach
> shouldn't add overhead.
Yes, I agree with your suggestion to pass a new bool* argument to
rmqueue() for this; it is clearer.
I have made this change in [PATCH v2 1/1].
>
>> -	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
>> +	page = rmqueue_buddy(preferred_zone, zone, order, *alloc_flags,
>>   							migratetype);
>>
>>   out:
>>   	/* Separate test+clear to avoid unnecessary atomics */
>> -	if ((alloc_flags & ALLOC_KSWAPD) &&
>> +	if ((*alloc_flags & ALLOC_KSWAPD) &&
>>   	    unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
>>   		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
>>   		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
>> @@ -3343,7 +3349,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>   
>>   try_this_zone:
>>   		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
>> -				gfp_mask, alloc_flags, ac->migratetype);
>> +				gfp_mask, &alloc_flags, ac->migratetype);
>>   		if (page) {
>>   			prep_new_page(page, order, gfp_mask, alloc_flags);
>>   
>> @@ -3351,7 +3357,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>   			 * If this is a high-order atomic allocation then check
>>   			 * if the pageblock should be reserved for the future
>>   			 */
>> -			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
>> +			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC) &&
>> +				unlikely(!(alloc_flags & ALLOC_PCPLIST)))
> Again, a comment explaining the reason for the test would be good.
Added in [PATCH v2 1/1].
>
>>   				reserve_highatomic_pageblock(page, zone);
>>   
>>   			return page;
>> -- 
>> 2.39.0
>>
  

Patch

diff --git a/mm/internal.h b/mm/internal.h
index b61034bd50f5..f523242baf0c
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -905,6 +905,7 @@  unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 #endif
 #define ALLOC_HIGHATOMIC	0x200 /* Allows access to MIGRATE_HIGHATOMIC */
 #define ALLOC_KSWAPD		0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+#define ALLOC_PCPLIST		0x1000 /* Allocations from pcplist */
 
 /* Flags that allow allocations below the min watermark. */
 #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b8544f08cc36..67cec88164b1
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2854,11 +2854,15 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			int batch = nr_pcp_alloc(pcp, zone, order);
 			int alloced;
 
+			if (alloc_flags & ALLOC_HIGHATOMIC)
+				goto out;
+
 			alloced = rmqueue_bulk(zone, order,
 					batch, list,
 					migratetype, alloc_flags);
 
 			pcp->count += alloced << order;
+out:
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
@@ -2921,7 +2925,7 @@  __no_sanitize_memory
 static inline
 struct page *rmqueue(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
-			gfp_t gfp_flags, unsigned int alloc_flags,
+			gfp_t gfp_flags, unsigned int *alloc_flags,
 			int migratetype)
 {
 	struct page *page;
@@ -2934,17 +2938,19 @@  struct page *rmqueue(struct zone *preferred_zone,
 
 	if (likely(pcp_allowed_order(order))) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
-				       migratetype, alloc_flags);
-		if (likely(page))
+				       migratetype, *alloc_flags);
+		if (likely(page)) {
+			*alloc_flags |= ALLOC_PCPLIST;
 			goto out;
+		}
 	}
 
-	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+	page = rmqueue_buddy(preferred_zone, zone, order, *alloc_flags,
 							migratetype);

 out:
 	/* Separate test+clear to avoid unnecessary atomics */
-	if ((alloc_flags & ALLOC_KSWAPD) &&
+	if ((*alloc_flags & ALLOC_KSWAPD) &&
 	    unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
@@ -3343,7 +3349,7 @@  get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 
 try_this_zone:
 		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
-				gfp_mask, alloc_flags, ac->migratetype);
+				gfp_mask, &alloc_flags, ac->migratetype);
 		if (page) {
 			prep_new_page(page, order, gfp_mask, alloc_flags);
 
@@ -3351,7 +3357,8 @@  get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			 * If this is a high-order atomic allocation then check
 			 * if the pageblock should be reserved for the future
 			 */
-			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
+			if (unlikely(alloc_flags & ALLOC_HIGHATOMIC) &&
+				unlikely(!(alloc_flags & ALLOC_PCPLIST)))
 				reserve_highatomic_pageblock(page, zone);
 
 			return page;