[v2,1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled

Message ID 20230730151606.2871391-2-usama.arif@bytedance.com
State New
Headers
Series mm/memblock: Skip prep and initialization of struct pages freed later by HVO |

Commit Message

Usama Arif July 30, 2023, 3:16 p.m. UTC
  When vmemmap is optimizable, it will free all the duplicated tail
pages in hugetlb_vmemmap_optimize while preparing the new hugepage.
Hence, there is no need to prepare them.

For 1G x86 hugepages, it avoids preparing
262144 - 64 = 262080 struct pages per hugepage.

The indirection of using __prep_compound_gigantic_folio is also removed,
as it just creates extra functions to indicate demote which can be done
with the argument.

Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
 mm/hugetlb.c         | 32 ++++++++++++++------------------
 mm/hugetlb_vmemmap.c |  2 +-
 mm/hugetlb_vmemmap.h | 15 +++++++++++----
 3 files changed, 26 insertions(+), 23 deletions(-)
  

Comments

Mike Kravetz July 31, 2023, 11:18 p.m. UTC | #1
On 07/30/23 16:16, Usama Arif wrote:
> When vmemmap is optimizable, it will free all the duplicated tail
> pages in hugetlb_vmemmap_optimize while preparing the new hugepage.
> Hence, there is no need to prepare them.
> 
> For 1G x86 hugepages, it avoids preparing
> 262144 - 64 = 262080 struct pages per hugepage.
> 
> The indirection of using __prep_compound_gigantic_folio is also removed,
> as it just creates extra functions to indicate demote which can be done
> with the argument.
> 
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
>  mm/hugetlb.c         | 32 ++++++++++++++------------------
>  mm/hugetlb_vmemmap.c |  2 +-
>  mm/hugetlb_vmemmap.h | 15 +++++++++++----
>  3 files changed, 26 insertions(+), 23 deletions(-)

Thanks,

I just started looking at this series.  Adding Muchun on Cc:

> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 64a3239b6407..541c07b6d60f 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1942,14 +1942,23 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
>  	spin_unlock_irq(&hugetlb_lock);
>  }
>  
> -static bool __prep_compound_gigantic_folio(struct folio *folio,
> -					unsigned int order, bool demote)
> +static bool prep_compound_gigantic_folio(struct folio *folio, struct hstate *h, bool demote)
>  {
>  	int i, j;
> +	int order = huge_page_order(h);
>  	int nr_pages = 1 << order;
>  	struct page *p;
>  
>  	__folio_clear_reserved(folio);
> +
> +	/*
> +	 * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize.
> +	 * Hence, reduce nr_pages to the pages that will be kept.
> +	 */
> +	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
> +			vmemmap_should_optimize(h, &folio->page))

IIUC, vmemmap_optimize_enabled (checked in vmemmap_should_optimize) can be
modified at runtime via sysctl.  If so, what prevents it from being changed
after this check and the later call to hugetlb_vmemmap_optimize()?
  
Muchun Song Aug. 1, 2023, 2:04 a.m. UTC | #2
On 2023/7/30 23:16, Usama Arif wrote:
> When vmemmap is optimizable, it will free all the duplicated tail
> pages in hugetlb_vmemmap_optimize while preparing the new hugepage.
> Hence, there is no need to prepare them.
>
> For 1G x86 hugepages, it avoids preparing
> 262144 - 64 = 262080 struct pages per hugepage.
>
> The indirection of using __prep_compound_gigantic_folio is also removed,
> as it just creates extra functions to indicate demote which can be done
> with the argument.
>
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
>   mm/hugetlb.c         | 32 ++++++++++++++------------------
>   mm/hugetlb_vmemmap.c |  2 +-
>   mm/hugetlb_vmemmap.h | 15 +++++++++++----
>   3 files changed, 26 insertions(+), 23 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 64a3239b6407..541c07b6d60f 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1942,14 +1942,23 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
>   	spin_unlock_irq(&hugetlb_lock);
>   }
>   
> -static bool __prep_compound_gigantic_folio(struct folio *folio,
> -					unsigned int order, bool demote)
> +static bool prep_compound_gigantic_folio(struct folio *folio, struct hstate *h, bool demote)
>   {
>   	int i, j;
> +	int order = huge_page_order(h);
>   	int nr_pages = 1 << order;
>   	struct page *p;
>   
>   	__folio_clear_reserved(folio);
> +
> +	/*
> +	 * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize.
> +	 * Hence, reduce nr_pages to the pages that will be kept.
> +	 */
> +	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
> +			vmemmap_should_optimize(h, &folio->page))
> +		nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);

We need to initialize the refcount to zero of tail pages (see the big
comment below in this function), given a situation that someone (maybe
GUP) could get a ref on the tail pages when the vmemmap is optimizing,
what prevent this from happening?

Thanks.

> +
>   	for (i = 0; i < nr_pages; i++) {
>   		p = folio_page(folio, i);
>   
> @@ -2019,18 +2028,6 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
>   	return false;
>   }
>   
> -static bool prep_compound_gigantic_folio(struct folio *folio,
> -							unsigned int order)
> -{
> -	return __prep_compound_gigantic_folio(folio, order, false);
> -}
> -
> -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
> -							unsigned int order)
> -{
> -	return __prep_compound_gigantic_folio(folio, order, true);
> -}
> -
>   /*
>    * PageHuge() only returns true for hugetlbfs pages, but not for normal or
>    * transparent huge pages.  See the PageTransHuge() documentation for more
> @@ -2185,7 +2182,7 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
>   	if (!folio)
>   		return NULL;
>   	if (hstate_is_gigantic(h)) {
> -		if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
> +		if (!prep_compound_gigantic_folio(folio, h, false)) {
>   			/*
>   			 * Rare failure to convert pages to compound page.
>   			 * Free pages and try again - ONCE!
> @@ -3201,7 +3198,7 @@ static void __init gather_bootmem_prealloc(void)
>   
>   		VM_BUG_ON(!hstate_is_gigantic(h));
>   		WARN_ON(folio_ref_count(folio) != 1);
> -		if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
> +		if (prep_compound_gigantic_folio(folio, h, false)) {
>   			WARN_ON(folio_test_reserved(folio));
>   			prep_new_hugetlb_folio(h, folio, folio_nid(folio));
>   			free_huge_page(page); /* add to the hugepage allocator */
> @@ -3624,8 +3621,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
>   		subpage = folio_page(folio, i);
>   		inner_folio = page_folio(subpage);
>   		if (hstate_is_gigantic(target_hstate))
> -			prep_compound_gigantic_folio_for_demote(inner_folio,
> -							target_hstate->order);
> +			prep_compound_gigantic_folio(inner_folio, target_hstate, true);
>   		else
>   			prep_compound_page(subpage, target_hstate->order);
>   		folio_change_private(inner_folio, NULL);
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index c2007ef5e9b0..b721e87de2b3 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -486,7 +486,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
>   }
>   
>   /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
> -static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> +bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
>   {
>   	if (!READ_ONCE(vmemmap_optimize_enabled))
>   		return false;
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 25bd0e002431..3e7978a9af73 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -10,16 +10,17 @@
>   #define _LINUX_HUGETLB_VMEMMAP_H
>   #include <linux/hugetlb.h>
>   
> -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
> -
>   /*
>    * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
>    * Documentation/vm/vmemmap_dedup.rst.
>    */
>   #define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
>   
> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
> +bool vmemmap_should_optimize(const struct hstate *h, const struct page *head);
> +
>   static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
>   {
>   	return pages_per_huge_page(h) * sizeof(struct page);
> @@ -51,6 +52,12 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
>   {
>   	return 0;
>   }
> +
> +static inline bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> +{
> +	return false;
> +}
> +
>   #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
>   
>   static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
  
Usama Arif Aug. 2, 2023, 10:05 a.m. UTC | #3
On 01/08/2023 00:18, Mike Kravetz wrote:
> On 07/30/23 16:16, Usama Arif wrote:
>> When vmemmap is optimizable, it will free all the duplicated tail
>> pages in hugetlb_vmemmap_optimize while preparing the new hugepage.
>> Hence, there is no need to prepare them.
>>
>> For 1G x86 hugepages, it avoids preparing
>> 262144 - 64 = 262080 struct pages per hugepage.
>>
>> The indirection of using __prep_compound_gigantic_folio is also removed,
>> as it just creates extra functions to indicate demote which can be done
>> with the argument.
>>
>> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
>> ---
>>   mm/hugetlb.c         | 32 ++++++++++++++------------------
>>   mm/hugetlb_vmemmap.c |  2 +-
>>   mm/hugetlb_vmemmap.h | 15 +++++++++++----
>>   3 files changed, 26 insertions(+), 23 deletions(-)
> 
> Thanks,
> 
> I just started looking at this series.  Adding Muchun on Cc:
> 
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 64a3239b6407..541c07b6d60f 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1942,14 +1942,23 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
>>   	spin_unlock_irq(&hugetlb_lock);
>>   }
>>   
>> -static bool __prep_compound_gigantic_folio(struct folio *folio,
>> -					unsigned int order, bool demote)
>> +static bool prep_compound_gigantic_folio(struct folio *folio, struct hstate *h, bool demote)
>>   {
>>   	int i, j;
>> +	int order = huge_page_order(h);
>>   	int nr_pages = 1 << order;
>>   	struct page *p;
>>   
>>   	__folio_clear_reserved(folio);
>> +
>> +	/*
>> +	 * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize.
>> +	 * Hence, reduce nr_pages to the pages that will be kept.
>> +	 */
>> +	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
>> +			vmemmap_should_optimize(h, &folio->page))
> 
> IIUC, vmemmap_optimize_enabled (checked in vmemmap_should_optimize) can be
> modified at runtime via sysctl.  If so, what prevents it from being changed
> after this check and the later call to hugetlb_vmemmap_optimize()?

Hi,

Thanks for the review.

Yes thats a good catch. The solution for this issue would be to to turn 
hugetlb_free_vmemmap into a callback core_param and have a lock around 
the write and when its used in gather_bootmem_prealloc, etc.

But the bigger issue during runtime is what Muchun pointed out that the 
struct page refcount is not frozen to 0.

My main usecase (and maybe for others as well?) is reserving these 
gigantic pages at boot time. I thought the runtime improvement might 
come from free with it but doesnt look like it. Both issues could be 
solved by just limiting it to boot time, as vmemmap_optimize_enabled 
cannot be changed during boot time, and reference to those pages cannot 
gotten by anything else as well (they aren't even initialized by 
memblock after patch 6), so will include the below diff to solve both 
the issues in next revision.


diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8434100f60ae..790842a6f978 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1956,7 +1956,8 @@ static bool prep_compound_gigantic_folio(struct 
folio *folio, struct hstate *h,
          * Hence, reduce nr_pages to the pages that will be kept.
          */
         if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
-                       vmemmap_should_optimize(h, &folio->page))
+                       vmemmap_should_optimize(h, &folio->page) &&
+                       system_state == SYSTEM_BOOTING)
                 nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct 
page);
         for (i = 0; i < nr_pages; i++) {

Thanks,
Usama
  
Usama Arif Aug. 2, 2023, 10:06 a.m. UTC | #4
On 01/08/2023 03:04, Muchun Song wrote:
> 
> 
> On 2023/7/30 23:16, Usama Arif wrote:
>> When vmemmap is optimizable, it will free all the duplicated tail
>> pages in hugetlb_vmemmap_optimize while preparing the new hugepage.
>> Hence, there is no need to prepare them.
>>
>> For 1G x86 hugepages, it avoids preparing
>> 262144 - 64 = 262080 struct pages per hugepage.
>>
>> The indirection of using __prep_compound_gigantic_folio is also removed,
>> as it just creates extra functions to indicate demote which can be done
>> with the argument.
>>
>> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
>> ---
>>   mm/hugetlb.c         | 32 ++++++++++++++------------------
>>   mm/hugetlb_vmemmap.c |  2 +-
>>   mm/hugetlb_vmemmap.h | 15 +++++++++++----
>>   3 files changed, 26 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 64a3239b6407..541c07b6d60f 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1942,14 +1942,23 @@ static void prep_new_hugetlb_folio(struct 
>> hstate *h, struct folio *folio, int ni
>>       spin_unlock_irq(&hugetlb_lock);
>>   }
>> -static bool __prep_compound_gigantic_folio(struct folio *folio,
>> -                    unsigned int order, bool demote)
>> +static bool prep_compound_gigantic_folio(struct folio *folio, struct 
>> hstate *h, bool demote)
>>   {
>>       int i, j;
>> +    int order = huge_page_order(h);
>>       int nr_pages = 1 << order;
>>       struct page *p;
>>       __folio_clear_reserved(folio);
>> +
>> +    /*
>> +     * No need to prep pages that will be freed later by 
>> hugetlb_vmemmap_optimize.
>> +     * Hence, reduce nr_pages to the pages that will be kept.
>> +     */
>> +    if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
>> +            vmemmap_should_optimize(h, &folio->page))
>> +        nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);
> 
> We need to initialize the refcount to zero of tail pages (see the big
> comment below in this function), given a situation that someone (maybe
> GUP) could get a ref on the tail pages when the vmemmap is optimizing,
> what prevent this from happening?
> 
> Thanks.
> 

Thanks for pointing this out, will limit to boot time for solving this 
in next version.
  

Patch

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..541c07b6d60f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1942,14 +1942,23 @@  static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
 	spin_unlock_irq(&hugetlb_lock);
 }
 
-static bool __prep_compound_gigantic_folio(struct folio *folio,
-					unsigned int order, bool demote)
+static bool prep_compound_gigantic_folio(struct folio *folio, struct hstate *h, bool demote)
 {
 	int i, j;
+	int order = huge_page_order(h);
 	int nr_pages = 1 << order;
 	struct page *p;
 
 	__folio_clear_reserved(folio);
+
+	/*
+	 * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize.
+	 * Hence, reduce nr_pages to the pages that will be kept.
+	 */
+	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) &&
+			vmemmap_should_optimize(h, &folio->page))
+		nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);
+
 	for (i = 0; i < nr_pages; i++) {
 		p = folio_page(folio, i);
 
@@ -2019,18 +2028,6 @@  static bool __prep_compound_gigantic_folio(struct folio *folio,
 	return false;
 }
 
-static bool prep_compound_gigantic_folio(struct folio *folio,
-							unsigned int order)
-{
-	return __prep_compound_gigantic_folio(folio, order, false);
-}
-
-static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
-							unsigned int order)
-{
-	return __prep_compound_gigantic_folio(folio, order, true);
-}
-
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
  * transparent huge pages.  See the PageTransHuge() documentation for more
@@ -2185,7 +2182,7 @@  static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
 	if (!folio)
 		return NULL;
 	if (hstate_is_gigantic(h)) {
-		if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+		if (!prep_compound_gigantic_folio(folio, h, false)) {
 			/*
 			 * Rare failure to convert pages to compound page.
 			 * Free pages and try again - ONCE!
@@ -3201,7 +3198,7 @@  static void __init gather_bootmem_prealloc(void)
 
 		VM_BUG_ON(!hstate_is_gigantic(h));
 		WARN_ON(folio_ref_count(folio) != 1);
-		if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+		if (prep_compound_gigantic_folio(folio, h, false)) {
 			WARN_ON(folio_test_reserved(folio));
 			prep_new_hugetlb_folio(h, folio, folio_nid(folio));
 			free_huge_page(page); /* add to the hugepage allocator */
@@ -3624,8 +3621,7 @@  static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
 		subpage = folio_page(folio, i);
 		inner_folio = page_folio(subpage);
 		if (hstate_is_gigantic(target_hstate))
-			prep_compound_gigantic_folio_for_demote(inner_folio,
-							target_hstate->order);
+			prep_compound_gigantic_folio(inner_folio, target_hstate, true);
 		else
 			prep_compound_page(subpage, target_hstate->order);
 		folio_change_private(inner_folio, NULL);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c2007ef5e9b0..b721e87de2b3 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -486,7 +486,7 @@  int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
 }
 
 /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
-static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
+bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
 {
 	if (!READ_ONCE(vmemmap_optimize_enabled))
 		return false;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 25bd0e002431..3e7978a9af73 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -10,16 +10,17 @@ 
 #define _LINUX_HUGETLB_VMEMMAP_H
 #include <linux/hugetlb.h>
 
-#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
-int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
-void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
-
 /*
  * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
  * Documentation/vm/vmemmap_dedup.rst.
  */
 #define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
 
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
+void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
+bool vmemmap_should_optimize(const struct hstate *h, const struct page *head);
+
 static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
 {
 	return pages_per_huge_page(h) * sizeof(struct page);
@@ -51,6 +52,12 @@  static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
 {
 	return 0;
 }
+
+static inline bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
+{
+	return false;
+}
+
 #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
 
 static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)