[mm-unstable,v5,01/10] mm: add folio dtor and order setter functions

Message ID 20221129225039.82257-2-sidhartha.kumar@oracle.com
State New
Headers
Series convert core hugetlb functions to folios |

Commit Message

Sidhartha Kumar Nov. 29, 2022, 10:50 p.m. UTC
  Add folio equivalents for set_compound_order() and set_compound_page_dtor().

Also remove extra new-lines introduced by mm/hugetlb: convert
move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
hugetlb_cgroup_uncharge_page() to folios.

Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
Suggested-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
---
 include/linux/mm.h | 16 ++++++++++++++++
 mm/hugetlb.c       |  4 +---
 2 files changed, 17 insertions(+), 3 deletions(-)
  

Comments

Mike Kravetz Dec. 7, 2022, 12:18 a.m. UTC | #1
On 11/29/22 14:50, Sidhartha Kumar wrote:
> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
> 
> Also remove extra new-lines introduced by mm/hugetlb: convert
> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
> hugetlb_cgroup_uncharge_page() to folios.
> 
> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
> Suggested-by: Muchun Song <songmuchun@bytedance.com>
> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
> ---
>  include/linux/mm.h | 16 ++++++++++++++++
>  mm/hugetlb.c       |  4 +---
>  2 files changed, 17 insertions(+), 3 deletions(-)

Sorry for the delay,

Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
  
Muchun Song Dec. 7, 2022, 3:34 a.m. UTC | #2
> On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
> 
> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
> 
> Also remove extra new-lines introduced by mm/hugetlb: convert
> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
> hugetlb_cgroup_uncharge_page() to folios.
> 
> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
> Suggested-by: Muchun Song <songmuchun@bytedance.com>
> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
> ---
> include/linux/mm.h | 16 ++++++++++++++++
> mm/hugetlb.c       |  4 +---
> 2 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a48c5ad16a5e..2bdef8a5298a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
> page[1].compound_dtor = compound_dtor;
> }
> 
> +static inline void folio_set_compound_dtor(struct folio *folio,
> + enum compound_dtor_id compound_dtor)
> +{
> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
> + folio->_folio_dtor = compound_dtor;
> +}
> +
> void destroy_large_folio(struct folio *folio);
> 
> static inline int head_compound_pincount(struct page *head)
> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
> #endif
> }
> 
> +static inline void folio_set_compound_order(struct folio *folio,
> + unsigned int order)
> +{
> + folio->_folio_order = order;
> +#ifdef CONFIG_64BIT
> + folio->_folio_nr_pages = order ? 1U << order : 0;

It seems that you think the user could pass 0 to order. However,
->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
You should not touch it. So this should be:

static inline void folio_set_compound_order(struct folio *folio,
					    unsigned int order)
{
	if (!folio_test_large(folio))
		return;

	folio->_folio_order = order;
#ifdef CONFIG_64BIT
	folio->_folio_nr_pages = 1U << order;
#endif
}

If we can make sure all the users of folio_set_compound_order() should pass
a non-order-0 page (it is true for now), then I suggest adding a VM_BUG_ON()
here to catch unexpected users.

static inline void folio_set_compound_order(struct folio *folio,
					    unsigned int order)
{
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	folio->_folio_order = order;
#ifdef CONFIG_64BIT
	folio->_folio_nr_pages = 1U << order;
#endif
}

Thanks.

> +#endif
> +}
> +
  
Mike Kravetz Dec. 7, 2022, 3:42 a.m. UTC | #3
On 12/07/22 11:34, Muchun Song wrote:
> 
> 
> > On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
> > 
> > Add folio equivalents for set_compound_order() and set_compound_page_dtor().
> > 
> > Also remove extra new-lines introduced by mm/hugetlb: convert
> > move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
> > hugetlb_cgroup_uncharge_page() to folios.
> > 
> > Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
> > Suggested-by: Muchun Song <songmuchun@bytedance.com>
> > Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
> > ---
> > include/linux/mm.h | 16 ++++++++++++++++
> > mm/hugetlb.c       |  4 +---
> > 2 files changed, 17 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index a48c5ad16a5e..2bdef8a5298a 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
> > page[1].compound_dtor = compound_dtor;
> > }
> > 
> > +static inline void folio_set_compound_dtor(struct folio *folio,
> > + enum compound_dtor_id compound_dtor)
> > +{
> > + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
> > + folio->_folio_dtor = compound_dtor;
> > +}
> > +
> > void destroy_large_folio(struct folio *folio);
> > 
> > static inline int head_compound_pincount(struct page *head)
> > @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
> > #endif
> > }
> > 
> > +static inline void folio_set_compound_order(struct folio *folio,
> > + unsigned int order)
> > +{
> > + folio->_folio_order = order;
> > +#ifdef CONFIG_64BIT
> > + folio->_folio_nr_pages = order ? 1U << order : 0;
> 
> It seems that you think the user could pass 0 to order. However,
> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
> You should not touch it. So this should be:
> 
> static inline void folio_set_compound_order(struct folio *folio,
> 					    unsigned int order)
> {
> 	if (!folio_test_large(folio))
> 		return;
> 
> 	folio->_folio_order = order;
> #ifdef CONFIG_64BIT
> 	folio->_folio_nr_pages = 1U << order;
> #endif
> }

I believe this was changed to accommodate the code in
__destroy_compound_gigantic_page().  It is used in a subsequent patch.
Here is the v6.0 version of the routine.

static void __destroy_compound_gigantic_page(struct page *page,
					unsigned int order, bool demote)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	atomic_set(compound_mapcount_ptr(page), 0);
	atomic_set(compound_pincount_ptr(page), 0);

	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		p->mapping = NULL;
		clear_compound_head(p);
		if (!demote)
			set_page_refcounted(p);
	}

	set_compound_order(page, 0);
#ifdef CONFIG_64BIT
	page[1].compound_nr = 0;
#endif
	__ClearPageHead(page);
}


Might have been better to change this set_compound_order call to
folio_set_compound_order in this patch.
  
Muchun Song Dec. 7, 2022, 4:11 a.m. UTC | #4
> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> On 12/07/22 11:34, Muchun Song wrote:
>> 
>> 
>>> On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
>>> 
>>> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
>>> 
>>> Also remove extra new-lines introduced by mm/hugetlb: convert
>>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
>>> hugetlb_cgroup_uncharge_page() to folios.
>>> 
>>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
>>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
>>> ---
>>> include/linux/mm.h | 16 ++++++++++++++++
>>> mm/hugetlb.c       |  4 +---
>>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>> 
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index a48c5ad16a5e..2bdef8a5298a 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
>>> page[1].compound_dtor = compound_dtor;
>>> }
>>> 
>>> +static inline void folio_set_compound_dtor(struct folio *folio,
>>> + enum compound_dtor_id compound_dtor)
>>> +{
>>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
>>> + folio->_folio_dtor = compound_dtor;
>>> +}
>>> +
>>> void destroy_large_folio(struct folio *folio);
>>> 
>>> static inline int head_compound_pincount(struct page *head)
>>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
>>> #endif
>>> }
>>> 
>>> +static inline void folio_set_compound_order(struct folio *folio,
>>> + unsigned int order)
>>> +{
>>> + folio->_folio_order = order;
>>> +#ifdef CONFIG_64BIT
>>> + folio->_folio_nr_pages = order ? 1U << order : 0;
>> 
>> It seems that you think the user could pass 0 to order. However,
>> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
>> You should not touch it. So this should be:
>> 
>> static inline void folio_set_compound_order(struct folio *folio,
>>     unsigned int order)
>> {
>> 	if (!folio_test_large(folio))
>> 		return;
>> 
>> 	folio->_folio_order = order;
>> #ifdef CONFIG_64BIT
>> 	folio->_folio_nr_pages = 1U << order;
>> #endif
>> }
> 
> I believe this was changed to accommodate the code in
> __destroy_compound_gigantic_page().  It is used in a subsequent patch.
> Here is the v6.0 version of the routine.

Thanks for your clarification.

> 
> static void __destroy_compound_gigantic_page(struct page *page,
> unsigned int order, bool demote)
> {
> 	int i;
> 	int nr_pages = 1 << order;
> 	struct page *p = page + 1;
> 
> 	atomic_set(compound_mapcount_ptr(page), 0);
> 	atomic_set(compound_pincount_ptr(page), 0);
> 
> 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
> 		p->mapping = NULL;
> 		clear_compound_head(p);
> 		if (!demote)
> 			set_page_refcounted(p);
> 	}
> 
> 	set_compound_order(page, 0);
> #ifdef CONFIG_64BIT
> 	page[1].compound_nr = 0;
> #endif
> 	__ClearPageHead(page);
> }
> 
> 
> Might have been better to change this set_compound_order call to
> folio_set_compound_order in this patch.
> 

Agree. It has confused me a lot. I suggest changing the code to the
followings. The folio_test_large() check is still to avoid unexpected
users for OOB.

static inline void folio_set_compound_order(struct folio *folio,
					    unsigned int order)
{
	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	// or
	// if (!folio_test_large(folio))
	// 	return;

	folio->_folio_order = order;
#ifdef CONFIG_64BIT
	folio->_folio_nr_pages = order ? 1U << order : 0;
#endif
}

Thanks.
  
Mike Kravetz Dec. 7, 2022, 6:12 p.m. UTC | #5
On 12/07/22 12:11, Muchun Song wrote:
> 
> 
> > On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > 
> > On 12/07/22 11:34, Muchun Song wrote:
> >> 
> >> 
> >>> On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
> >>> 
> >>> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
> >>> 
> >>> Also remove extra new-lines introduced by mm/hugetlb: convert
> >>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
> >>> hugetlb_cgroup_uncharge_page() to folios.
> >>> 
> >>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
> >>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
> >>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
> >>> ---
> >>> include/linux/mm.h | 16 ++++++++++++++++
> >>> mm/hugetlb.c       |  4 +---
> >>> 2 files changed, 17 insertions(+), 3 deletions(-)
> >>> 
> >>> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >>> index a48c5ad16a5e..2bdef8a5298a 100644
> >>> --- a/include/linux/mm.h
> >>> +++ b/include/linux/mm.h
> >>> @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
> >>> page[1].compound_dtor = compound_dtor;
> >>> }
> >>> 
> >>> +static inline void folio_set_compound_dtor(struct folio *folio,
> >>> + enum compound_dtor_id compound_dtor)
> >>> +{
> >>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
> >>> + folio->_folio_dtor = compound_dtor;
> >>> +}
> >>> +
> >>> void destroy_large_folio(struct folio *folio);
> >>> 
> >>> static inline int head_compound_pincount(struct page *head)
> >>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
> >>> #endif
> >>> }
> >>> 
> >>> +static inline void folio_set_compound_order(struct folio *folio,
> >>> + unsigned int order)
> >>> +{
> >>> + folio->_folio_order = order;
> >>> +#ifdef CONFIG_64BIT
> >>> + folio->_folio_nr_pages = order ? 1U << order : 0;
> >> 
> >> It seems that you think the user could pass 0 to order. However,
> >> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
> >> You should not touch it. So this should be:
> >> 
> >> static inline void folio_set_compound_order(struct folio *folio,
> >>     unsigned int order)
> >> {
> >> 	if (!folio_test_large(folio))
> >> 		return;
> >> 
> >> 	folio->_folio_order = order;
> >> #ifdef CONFIG_64BIT
> >> 	folio->_folio_nr_pages = 1U << order;
> >> #endif
> >> }
> > 
> > I believe this was changed to accommodate the code in
> > __destroy_compound_gigantic_page().  It is used in a subsequent patch.
> > Here is the v6.0 version of the routine.
> 
> Thanks for your clarification.
> 
> > 
> > static void __destroy_compound_gigantic_page(struct page *page,
> > unsigned int order, bool demote)
> > {
> > 	int i;
> > 	int nr_pages = 1 << order;
> > 	struct page *p = page + 1;
> > 
> > 	atomic_set(compound_mapcount_ptr(page), 0);
> > 	atomic_set(compound_pincount_ptr(page), 0);
> > 
> > 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
> > 		p->mapping = NULL;
> > 		clear_compound_head(p);
> > 		if (!demote)
> > 			set_page_refcounted(p);
> > 	}
> > 
> > 	set_compound_order(page, 0);
> > #ifdef CONFIG_64BIT
> > 	page[1].compound_nr = 0;
> > #endif
> > 	__ClearPageHead(page);
> > }
> > 
> > 
> > Might have been better to change this set_compound_order call to
> > folio_set_compound_order in this patch.
> > 
> 
> Agree. It has confused me a lot. I suggest changing the code to the
> followings. The folio_test_large() check is still to avoid unexpected
> users for OOB.
> 
> static inline void folio_set_compound_order(struct folio *folio,
> 					    unsigned int order)
> {
> 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> 	// or
> 	// if (!folio_test_large(folio))
> 	// 	return;
> 
> 	folio->_folio_order = order;
> #ifdef CONFIG_64BIT
> 	folio->_folio_nr_pages = order ? 1U << order : 0;
> #endif
> }

I think the VM_BUG_ON_FOLIO is appropriate as it would at least flag
data corruption.

Thinking about this some more, it seems that hugetlb is the only caller
that abuses folio_set_compound_order (and previously set_compound_order)
by passing in a zero order.  Since it is unlikely that anyone knows of
this abuse, it might be good to add a comment to the routine to note
why it handles the zero case.  This might help prevent changes which
would potentially break hugetlb.
  
Sidhartha Kumar Dec. 7, 2022, 6:49 p.m. UTC | #6
On 12/7/22 10:12 AM, Mike Kravetz wrote:
> On 12/07/22 12:11, Muchun Song wrote:
>>
>>
>>> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>
>>> On 12/07/22 11:34, Muchun Song wrote:
>>>>
>>>>
>>>>> On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
>>>>>
>>>>> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
>>>>>
>>>>> Also remove extra new-lines introduced by mm/hugetlb: convert
>>>>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
>>>>> hugetlb_cgroup_uncharge_page() to folios.
>>>>>
>>>>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
>>>>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
>>>>> ---
>>>>> include/linux/mm.h | 16 ++++++++++++++++
>>>>> mm/hugetlb.c       |  4 +---
>>>>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>> index a48c5ad16a5e..2bdef8a5298a 100644
>>>>> --- a/include/linux/mm.h
>>>>> +++ b/include/linux/mm.h
>>>>> @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
>>>>> page[1].compound_dtor = compound_dtor;
>>>>> }
>>>>>
>>>>> +static inline void folio_set_compound_dtor(struct folio *folio,
>>>>> + enum compound_dtor_id compound_dtor)
>>>>> +{
>>>>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
>>>>> + folio->_folio_dtor = compound_dtor;
>>>>> +}
>>>>> +
>>>>> void destroy_large_folio(struct folio *folio);
>>>>>
>>>>> static inline int head_compound_pincount(struct page *head)
>>>>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
>>>>> #endif
>>>>> }
>>>>>
>>>>> +static inline void folio_set_compound_order(struct folio *folio,
>>>>> + unsigned int order)
>>>>> +{
>>>>> + folio->_folio_order = order;
>>>>> +#ifdef CONFIG_64BIT
>>>>> + folio->_folio_nr_pages = order ? 1U << order : 0;
>>>>
>>>> It seems that you think the user could pass 0 to order. However,
>>>> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
>>>> You should not touch it. So this should be:
>>>>
>>>> static inline void folio_set_compound_order(struct folio *folio,
>>>>      unsigned int order)
>>>> {
>>>> 	if (!folio_test_large(folio))
>>>> 		return;
>>>>
>>>> 	folio->_folio_order = order;
>>>> #ifdef CONFIG_64BIT
>>>> 	folio->_folio_nr_pages = 1U << order;
>>>> #endif
>>>> }
>>>
>>> I believe this was changed to accommodate the code in
>>> __destroy_compound_gigantic_page().  It is used in a subsequent patch.
>>> Here is the v6.0 version of the routine.
>>
>> Thanks for your clarification.
>>
>>>
>>> static void __destroy_compound_gigantic_page(struct page *page,
>>> unsigned int order, bool demote)
>>> {
>>> 	int i;
>>> 	int nr_pages = 1 << order;
>>> 	struct page *p = page + 1;
>>>
>>> 	atomic_set(compound_mapcount_ptr(page), 0);
>>> 	atomic_set(compound_pincount_ptr(page), 0);
>>>
>>> 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
>>> 		p->mapping = NULL;
>>> 		clear_compound_head(p);
>>> 		if (!demote)
>>> 			set_page_refcounted(p);
>>> 	}
>>>
>>> 	set_compound_order(page, 0);
>>> #ifdef CONFIG_64BIT
>>> 	page[1].compound_nr = 0;
>>> #endif
>>> 	__ClearPageHead(page);
>>> }
>>>
>>>
>>> Might have been better to change this set_compound_order call to
>>> folio_set_compound_order in this patch.
>>>
>>
>> Agree. It has confused me a lot. I suggest changing the code to the
>> followings. The folio_test_large() check is still to avoid unexpected
>> users for OOB.
>>
>> static inline void folio_set_compound_order(struct folio *folio,
>> 					    unsigned int order)
>> {
>> 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>> 	// or
>> 	// if (!folio_test_large(folio))
>> 	// 	return;
>>
>> 	folio->_folio_order = order;
>> #ifdef CONFIG_64BIT
>> 	folio->_folio_nr_pages = order ? 1U << order : 0;
>> #endif
>> }
> 
> I think the VM_BUG_ON_FOLIO is appropriate as it would at least flag
> data corruption.
> 
As Mike pointed out, my intention with supporting the 0 case was to 
cleanup the __destroy_compound_gigantic_page code by moving the ifdef 
CONFIG_64BIT lines to folio_set_compound_order(). I'll add the 
VM_BUG_ON_FOLIO line as well as a comment to make it clear it is not 
normally supported.

> Thinking about this some more, it seems that hugetlb is the only caller
> that abuses folio_set_compound_order (and previously set_compound_order)
> by passing in a zero order.  Since it is unlikely that anyone knows of
> this abuse, it might be good to add a comment to the routine to note
> why it handles the zero case.  This might help prevent changes which
> would potentially break hugetlb.

+/*
+ * _folio_nr_pages and _folio_order are invalid for
+ * order-zero pages. An exception is hugetlb, which passes
+ * in a zero order in __destroy_compound_gigantic_page().
+ */
  static inline void folio_set_compound_order(struct folio *folio,
                 unsigned int order)
  {
+       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+
         folio->_folio_order = order;
  #ifdef CONFIG_64BIT
         folio->_folio_nr_pages = order ? 1U << order : 0;

Does this comment work?
  
Sidhartha Kumar Dec. 7, 2022, 7:05 p.m. UTC | #7
On 12/7/22 10:49 AM, Sidhartha Kumar wrote:
> On 12/7/22 10:12 AM, Mike Kravetz wrote:
>> On 12/07/22 12:11, Muchun Song wrote:
>>>
>>>
>>>> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>
>>>> On 12/07/22 11:34, Muchun Song wrote:
>>>>>
>>>>>
>>>>>> On Nov 30, 2022, at 06:50, Sidhartha Kumar 
>>>>>> <sidhartha.kumar@oracle.com> wrote:
>>>>>>
>>>>>> Add folio equivalents for set_compound_order() and 
>>>>>> set_compound_page_dtor().
>>>>>>
>>>>>> Also remove extra new-lines introduced by mm/hugetlb: convert
>>>>>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
>>>>>> hugetlb_cgroup_uncharge_page() to folios.
>>>>>>
>>>>>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
>>>>>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
>>>>>> ---
>>>>>> include/linux/mm.h | 16 ++++++++++++++++
>>>>>> mm/hugetlb.c       |  4 +---
>>>>>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>>> index a48c5ad16a5e..2bdef8a5298a 100644
>>>>>> --- a/include/linux/mm.h
>>>>>> +++ b/include/linux/mm.h
>>>>>> @@ -972,6 +972,13 @@ static inline void 
>>>>>> set_compound_page_dtor(struct page *page,
>>>>>> page[1].compound_dtor = compound_dtor;
>>>>>> }
>>>>>>
>>>>>> +static inline void folio_set_compound_dtor(struct folio *folio,
>>>>>> + enum compound_dtor_id compound_dtor)
>>>>>> +{
>>>>>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
>>>>>> + folio->_folio_dtor = compound_dtor;
>>>>>> +}
>>>>>> +
>>>>>> void destroy_large_folio(struct folio *folio);
>>>>>>
>>>>>> static inline int head_compound_pincount(struct page *head)
>>>>>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct 
>>>>>> page *page, unsigned int order)
>>>>>> #endif
>>>>>> }
>>>>>>
>>>>>> +static inline void folio_set_compound_order(struct folio *folio,
>>>>>> + unsigned int order)
>>>>>> +{
>>>>>> + folio->_folio_order = order;
>>>>>> +#ifdef CONFIG_64BIT
>>>>>> + folio->_folio_nr_pages = order ? 1U << order : 0;
>>>>>
>>>>> It seems that you think the user could pass 0 to order. However,
>>>>> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 
>>>>> pages.
>>>>> You should not touch it. So this should be:
>>>>>
>>>>> static inline void folio_set_compound_order(struct folio *folio,
>>>>>      unsigned int order)
>>>>> {
>>>>>     if (!folio_test_large(folio))
>>>>>         return;
>>>>>
>>>>>     folio->_folio_order = order;
>>>>> #ifdef CONFIG_64BIT
>>>>>     folio->_folio_nr_pages = 1U << order;
>>>>> #endif
>>>>> }
>>>>
>>>> I believe this was changed to accommodate the code in
>>>> __destroy_compound_gigantic_page().  It is used in a subsequent patch.
>>>> Here is the v6.0 version of the routine.
>>>
>>> Thanks for your clarification.
>>>
>>>>
>>>> static void __destroy_compound_gigantic_page(struct page *page,
>>>> unsigned int order, bool demote)
>>>> {
>>>>     int i;
>>>>     int nr_pages = 1 << order;
>>>>     struct page *p = page + 1;
>>>>
>>>>     atomic_set(compound_mapcount_ptr(page), 0);
>>>>     atomic_set(compound_pincount_ptr(page), 0);
>>>>
>>>>     for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
>>>>         p->mapping = NULL;
>>>>         clear_compound_head(p);
>>>>         if (!demote)
>>>>             set_page_refcounted(p);
>>>>     }
>>>>
>>>>     set_compound_order(page, 0);
>>>> #ifdef CONFIG_64BIT
>>>>     page[1].compound_nr = 0;
>>>> #endif
>>>>     __ClearPageHead(page);
>>>> }
>>>>
>>>>
>>>> Might have been better to change this set_compound_order call to
>>>> folio_set_compound_order in this patch.
>>>>
>>>
>>> Agree. It has confused me a lot. I suggest changing the code to the
>>> followings. The folio_test_large() check is still to avoid unexpected
>>> users for OOB.
>>>
>>> static inline void folio_set_compound_order(struct folio *folio,
>>>                         unsigned int order)
>>> {
>>>     VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>     // or
>>>     // if (!folio_test_large(folio))
>>>     //     return;
>>>
>>>     folio->_folio_order = order;
>>> #ifdef CONFIG_64BIT
>>>     folio->_folio_nr_pages = order ? 1U << order : 0;
>>> #endif
>>> }
>>
>> I think the VM_BUG_ON_FOLIO is appropriate as it would at least flag
>> data corruption.
>>
> As Mike pointed out, my intention with supporting the 0 case was to 
> cleanup the __destroy_compound_gigantic_page code by moving the ifdef 
> CONFIG_64BIT lines to folio_set_compound_order(). I'll add the 
> VM_BUG_ON_FOLIO line as well as a comment to make it clear it is not 
> normally supported.
> 
>> Thinking about this some more, it seems that hugetlb is the only caller
>> that abuses folio_set_compound_order (and previously set_compound_order)
>> by passing in a zero order.  Since it is unlikely that anyone knows of
>> this abuse, it might be good to add a comment to the routine to note
>> why it handles the zero case.  This might help prevent changes which
>> would potentially break hugetlb.
> 
> +/*
> + * _folio_nr_pages and _folio_order are invalid for
> + * order-zero pages. An exception is hugetlb, which passes
> + * in a zero order in __destroy_compound_gigantic_page().
> + */
>   static inline void folio_set_compound_order(struct folio *folio,
>                  unsigned int order)
>   {
> +       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> +
>          folio->_folio_order = order;
>   #ifdef CONFIG_64BIT
>          folio->_folio_nr_pages = order ? 1U << order : 0;
> 
> Does this comment work?
> 
> 

I will change the comment from referencing 
__destroy_compound_gigantic_page()
to __destroy_compound_gigantic_folio(), although 
__prep_compound_gigantic_folio() is another user of 
folio_set_compound_order(folio, 0). Should the sentence just be "An 
exception is hugetlb, which passes in a zero order"?
  
Mike Kravetz Dec. 7, 2022, 7:25 p.m. UTC | #8
On 12/07/22 11:05, Sidhartha Kumar wrote:
> On 12/7/22 10:49 AM, Sidhartha Kumar wrote:
> > On 12/7/22 10:12 AM, Mike Kravetz wrote:
> > > On 12/07/22 12:11, Muchun Song wrote:
> > > > > On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > > On 12/07/22 11:34, Muchun Song wrote:
> > > > 
> > > > Agree. It has confused me a lot. I suggest changing the code to the
> > > > followings. The folio_test_large() check is still to avoid unexpected
> > > > users for OOB.
> > > > 
> > > > static inline void folio_set_compound_order(struct folio *folio,
> > > >                         unsigned int order)
> > > > {
> > > >     VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> > > >     // or
> > > >     // if (!folio_test_large(folio))
> > > >     //     return;
> > > > 
> > > >     folio->_folio_order = order;
> > > > #ifdef CONFIG_64BIT
> > > >     folio->_folio_nr_pages = order ? 1U << order : 0;
> > > > #endif
> > > > }
> > > 
> > > I think the VM_BUG_ON_FOLIO is appropriate as it would at least flag
> > > data corruption.
> > > 
> > As Mike pointed out, my intention with supporting the 0 case was to
> > cleanup the __destroy_compound_gigantic_page code by moving the ifdef
> > CONFIG_64BIT lines to folio_set_compound_order(). I'll add the
> > VM_BUG_ON_FOLIO line as well as a comment to make it clear it is not
> > normally supported.
> > 
> > > Thinking about this some more, it seems that hugetlb is the only caller
> > > that abuses folio_set_compound_order (and previously set_compound_order)
> > > by passing in a zero order.  Since it is unlikely that anyone knows of
> > > this abuse, it might be good to add a comment to the routine to note
> > > why it handles the zero case.  This might help prevent changes which
> > > would potentially break hugetlb.
> > 
> > +/*
> > + * _folio_nr_pages and _folio_order are invalid for
> > + * order-zero pages. An exception is hugetlb, which passes
> > + * in a zero order in __destroy_compound_gigantic_page().
> > + */
> >   static inline void folio_set_compound_order(struct folio *folio,
> >                  unsigned int order)
> >   {
> > +       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> > +
> >          folio->_folio_order = order;
> >   #ifdef CONFIG_64BIT
> >          folio->_folio_nr_pages = order ? 1U << order : 0;
> > 
> > Does this comment work?
> > 
> > 
> 
> I will change the comment from referencing
> __destory_compound_gigantic_page()
> to __destroy_compound_gigantic_folio, although
> __prep_compound_gigantic_folio() is another user of
> folio_set_compound_order(folio, 0). Should the sentence just be "An
> exception is hugetlb, which passes in a zero order"?

How about a comment like this?

/*
 * folio_set_compound_order is generally passed a non-zero order to
 * set up/create a large folio.  However, hugetlb code abuses this by
 * passing in zero when 'dissolving' a large folio.
 */

My only concern is that someone may modify the routine such that it no
longer works when passed zero order.  It is not likely as anyone should
notice the special case for zero, and look for callers.
  
Muchun Song Dec. 8, 2022, 2:19 a.m. UTC | #9
> On Dec 8, 2022, at 03:25, Mike Kravetz <mike.kravetz@oracle.com> wrote:
> 
> On 12/07/22 11:05, Sidhartha Kumar wrote:
>> On 12/7/22 10:49 AM, Sidhartha Kumar wrote:
>>> On 12/7/22 10:12 AM, Mike Kravetz wrote:
>>>> On 12/07/22 12:11, Muchun Song wrote:
>>>>>> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>>> On 12/07/22 11:34, Muchun Song wrote:
>>>>> 
>>>>> Agree. It has confused me a lot. I suggest changing the code to the
>>>>> followings. The folio_test_large() check is still to avoid unexpected
>>>>> users for OOB.
>>>>> 
>>>>> static inline void folio_set_compound_order(struct folio *folio,
>>>>>                         unsigned int order)
>>>>> {
>>>>>     VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>>>>     // or
>>>>>     // if (!folio_test_large(folio))
>>>>>     //     return;
>>>>> 
>>>>>     folio->_folio_order = order;
>>>>> #ifdef CONFIG_64BIT
>>>>>     folio->_folio_nr_pages = order ? 1U << order : 0;
>>>>> #endif
>>>>> }
>>>> 
>>>> I think the VM_BUG_ON_FOLIO is appropriate as it would at least flag
>>>> data corruption.
>>>> 
>>> As Mike pointed out, my intention with supporting the 0 case was to
>>> cleanup the __destroy_compound_gigantic_page code by moving the ifdef
>>> CONFIG_64BIT lines to folio_set_compound_order(). I'll add the
>>> VM_BUG_ON_FOLIO line as well as a comment to make it clear it is not
>>> normally supported.
>>> 
>>>> Thinking about this some more, it seems that hugetlb is the only caller
>>>> that abuses folio_set_compound_order (and previously set_compound_order)
>>>> by passing in a zero order.  Since it is unlikely that anyone knows of
>>>> this abuse, it might be good to add a comment to the routine to note
>>>> why it handles the zero case.  This might help prevent changes which
>>>> would potentially break hugetlb.
>>> 
>>> +/*
>>> + * _folio_nr_pages and _folio_order are invalid for
>>> + * order-zero pages. An exception is hugetlb, which passes
>>> + * in a zero order in __destroy_compound_gigantic_page().
>>> + */
>>>  static inline void folio_set_compound_order(struct folio *folio,
>>>                 unsigned int order)
>>>  {
>>> +       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>> +
>>>         folio->_folio_order = order;
>>>  #ifdef CONFIG_64BIT
>>>         folio->_folio_nr_pages = order ? 1U << order : 0;
>>> 
>>> Does this comment work?
>>> 
>>> 
>> 
>> I will change the comment from referencing
>>> __destroy_compound_gigantic_page()
>> to __destroy_compound_gigantic_folio, although
>> __prep_compound_gigantic_folio() is another user of
>> folio_set_compound_order(folio, 0). Should the sentence just be "An
>> exception is hugetlb, which passes in a zero order"?
> 
> How about a comment like this?
> 
> /*
> * folio_set_compound_order is generally passed a non-zero order to
> * set up/create a large folio.  However, hugetlb code abuses this by
> * passing in zero when 'dissolving' a large folio.
> */

How about adding a new helper like "folio_dissolve_compound(struct folio *folio)"?
then it may be unnecessary to add a comment.

Thanks.

> 
> My only concern is that someone may modify the routine such that it no
> longer works when passed zero order.  It is not likely as anyone should
> notice the special case for zero, and look for callers.
> -- 
> Mike Kravetz
  
John Hubbard Dec. 8, 2022, 2:31 a.m. UTC | #10
On 12/7/22 18:19, Muchun Song wrote:
> How about adding a new helper like "folio_dissolve_compound(struct folio *folio)"?
> then it may be unnecessary to add a comment.
> 

Yes, and in fact, I just finished spelling out exactly how to do that, in [1].


[1] https://lore.kernel.org/all/d17530ad-8e12-8069-d619-a2d72fe80e15@nvidia.com/


thanks,
  
Muchun Song Dec. 8, 2022, 4:44 a.m. UTC | #11
> On Dec 8, 2022, at 10:31, John Hubbard <jhubbard@nvidia.com> wrote:
> 
> On 12/7/22 18:19, Muchun Song wrote:
>> How about adding a new helper like "folio_dissolve_compound(struct folio *folio)"?
>> then it may be unnecessary to add a comment.
> 
> Yes, and in fact, I just finished spelling out exactly how to do that, in [1].

Thanks for your guidance. I have replied in thread [1].

> 
> 
> [1] https://lore.kernel.org/all/d17530ad-8e12-8069-d619-a2d72fe80e15@nvidia.com/
> 
> 
> thanks,
> -- 
> John Hubbard
> NVIDIA
  
David Hildenbrand Dec. 12, 2022, 6:34 p.m. UTC | #12
On 07.12.22 05:11, Muchun Song wrote:
> 
> 
>> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>
>> On 12/07/22 11:34, Muchun Song wrote:
>>>
>>>
>>>> On Nov 30, 2022, at 06:50, Sidhartha Kumar <sidhartha.kumar@oracle.com> wrote:
>>>>
>>>> Add folio equivalents for set_compound_order() and set_compound_page_dtor().
>>>>
>>>> Also remove extra new-lines introduced by mm/hugetlb: convert
>>>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
>>>> hugetlb_cgroup_uncharge_page() to folios.
>>>>
>>>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
>>>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
>>>> ---
>>>> include/linux/mm.h | 16 ++++++++++++++++
>>>> mm/hugetlb.c       |  4 +---
>>>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index a48c5ad16a5e..2bdef8a5298a 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -972,6 +972,13 @@ static inline void set_compound_page_dtor(struct page *page,
>>>> page[1].compound_dtor = compound_dtor;
>>>> }
>>>>
>>>> +static inline void folio_set_compound_dtor(struct folio *folio,
>>>> + enum compound_dtor_id compound_dtor)
>>>> +{
>>>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
>>>> + folio->_folio_dtor = compound_dtor;
>>>> +}
>>>> +
>>>> void destroy_large_folio(struct folio *folio);
>>>>
>>>> static inline int head_compound_pincount(struct page *head)
>>>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct page *page, unsigned int order)
>>>> #endif
>>>> }
>>>>
>>>> +static inline void folio_set_compound_order(struct folio *folio,
>>>> + unsigned int order)
>>>> +{
>>>> + folio->_folio_order = order;
>>>> +#ifdef CONFIG_64BIT
>>>> + folio->_folio_nr_pages = order ? 1U << order : 0;
>>>
>>> It seems that you think the user could pass 0 to order. However,
>>> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 pages.
>>> You should not touch it. So this should be:
>>>
>>> static inline void folio_set_compound_order(struct folio *folio,
>>>      unsigned int order)
>>> {
>>> 	if (!folio_test_large(folio))
>>> 		return;
>>>
>>> 	folio->_folio_order = order;
>>> #ifdef CONFIG_64BIT
>>> 	folio->_folio_nr_pages = 1U << order;
>>> #endif
>>> }
>>
>> I believe this was changed to accommodate the code in
>> __destroy_compound_gigantic_page().  It is used in a subsequent patch.
>> Here is the v6.0 version of the routine.
> 
> Thanks for your clarification.
> 
>>
>> static void __destroy_compound_gigantic_page(struct page *page,
>> unsigned int order, bool demote)
>> {
>> 	int i;
>> 	int nr_pages = 1 << order;
>> 	struct page *p = page + 1;
>>
>> 	atomic_set(compound_mapcount_ptr(page), 0);
>> 	atomic_set(compound_pincount_ptr(page), 0);
>>
>> 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
>> 		p->mapping = NULL;
>> 		clear_compound_head(p);
>> 		if (!demote)
>> 			set_page_refcounted(p);
>> 	}
>>
>> 	set_compound_order(page, 0);
>> #ifdef CONFIG_64BIT
>> 	page[1].compound_nr = 0;
>> #endif
>> 	__ClearPageHead(page);
>> }
>>
>>
>> Might have been better to change this set_compound_order call to
>> folio_set_compound_order in this patch.
>>
> 
> Agree. It has confused me a lot. I suggest changing the code to the
> followings. The folio_test_large() check is still to avoid unexpected
> users for OOB.
> 
> static inline void folio_set_compound_order(struct folio *folio,
> 					    unsigned int order)
> {
> 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> 	// or
> 	// if (!folio_test_large(folio))
> 	// 	return;
> 
> 	folio->_folio_order = order;
> #ifdef CONFIG_64BIT
> 	folio->_folio_nr_pages = order ? 1U << order : 0;
> #endif
> }
> 
> Thanks.

On mm-stable, dynamically allocating gigantic pages gives me:

[   23.625105] page:00000000ae27bd2a refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1800
[   23.635117] flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff)
[   23.641969] raw: 0017fffc00000000 ffffa4edc489bb58 fffff784c6000048 0000000000000000
[   23.650141] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
[   23.657907] page dumped because: VM_BUG_ON_FOLIO(!folio_test_large(folio))
[   23.663955] ------------[ cut here ]------------
[   23.667680] kernel BUG at include/linux/mm.h:1030!
[   23.673378] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[   23.681610] CPU: 3 PID: 1220 Comm: bash Not tainted 6.1.0-rc4+ #13
[   23.690281] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014
[   23.699837] RIP: 0010:__prep_compound_gigantic_folio+0x115/0x120
[   23.706587] Code: c7 44 24 5c 00 00 00 00 5b 44 89 d0 5d 41 5c 41 5d 41 5e c3 cc cc cc cc 48 83 e8 09
[   23.725954] RSP: 0018:ffffa4edc489bc90 EFLAGS: 00010296
[   23.731001] RAX: 000000000000003e RBX: ffffffffae39a720 RCX: 0000000000000000
[   23.736065] RDX: 0000000000000001 RSI: ffffffffad522c25 RDI: 00000000ffffffff
[   23.740866] RBP: 0000000000040000 R08: 0000000000000000 R09: ffffa4edc489bb58
[   23.745385] R10: 0000000000000003 R11: ffff926d7ff4b028 R12: fffff784c6000000
[   23.749464] R13: 0000000000040000 R14: fffff784c6000000 R15: ffff9266039fb900
[   23.753176] FS:  00007f69a9703740(0000) GS:ffff92696f8c0000(0000) knlGS:0000000000000000
[   23.758299] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   23.761275] CR2: 0000556789080640 CR3: 0000000105568005 CR4: 0000000000770ee0
[   23.764929] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   23.768572] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   23.772221] PKRU: 55555554
[   23.773696] Call Trace:
[   23.776170]  <TASK>
[   23.777258]  alloc_fresh_hugetlb_folio+0x14b/0x220
[   23.779406]  alloc_pool_huge_page+0x7c/0x110
[   23.781273]  __nr_hugepages_store_common+0x191/0x400
[   23.783369]  ? __mod_memcg_lruvec_state+0x93/0x110
[   23.785343]  ? __mod_lruvec_page_state+0xa6/0x1a0
[   23.787223]  ? _copy_from_iter+0x8c/0x540
[   23.788788]  ? __kmem_cache_alloc_node+0x13b/0x2a0
[   23.790577]  ? kernfs_fop_write_iter+0x174/0x1f0
[   23.792313]  nr_hugepages_store+0x57/0x70
[   23.793754]  kernfs_fop_write_iter+0x11b/0x1f0
[   23.795337]  vfs_write+0x1e1/0x3a0
[   23.796531]  ksys_write+0x53/0xd0
[   23.797690]  do_syscall_64+0x37/0x90
[   23.798947]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   23.800602] RIP: 0033:0x7f69a9501977
[   23.801798] Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 14
[   23.807322] RSP: 002b:00007ffce87f3338 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   23.809460] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f69a9501977
[   23.811485] RDX: 0000000000000002 RSI: 00005652a970e5e0 RDI: 0000000000000001
[   23.813443] RBP: 00005652a970e5e0 R08: 0000000000000000 R09: 0000000000000073
[   23.815385] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000002
[   23.817289] R13: 00007f69a95f9780 R14: 0000000000000002 R15: 00007f69a95f49e0
[   23.819176]  </TASK>
[   23.819792] Modules linked in: intel_rapl_msr intel_rapl_common intel_uncore_frequency_common isst_ir
[   23.829056] ---[ end trace 0000000000000000 ]---
[   23.830413] RIP: 0010:__prep_compound_gigantic_folio+0x115/0x120
[   23.832390] Code: c7 44 24 5c 00 00 00 00 5b 44 89 d0 5d 41 5c 41 5d 41 5e c3 cc cc cc cc 48 83 e8 09
[   23.836787] RSP: 0018:ffffa4edc489bc90 EFLAGS: 00010296
[   23.838083] RAX: 000000000000003e RBX: ffffffffae39a720 RCX: 0000000000000000
[   23.839764] RDX: 0000000000000001 RSI: ffffffffad522c25 RDI: 00000000ffffffff
[   23.841456] RBP: 0000000000040000 R08: 0000000000000000 R09: ffffa4edc489bb58
[   23.843150] R10: 0000000000000003 R11: ffff926d7ff4b028 R12: fffff784c6000000
[   23.844836] R13: 0000000000040000 R14: fffff784c6000000 R15: ffff9266039fb900
[   23.846521] FS:  00007f69a9703740(0000) GS:ffff92696f8c0000(0000) knlGS:0000000000000000
[   23.848417] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   23.849797] CR2: 0000556789080640 CR3: 0000000105568005 CR4: 0000000000770ee0
[   23.851491] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   23.853181] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   23.854861] PKRU: 55555554


Something's broken.
  
Sidhartha Kumar Dec. 12, 2022, 6:50 p.m. UTC | #13
On 12/12/22 10:34 AM, David Hildenbrand wrote:
> On 07.12.22 05:11, Muchun Song wrote:
>>
>>
>>> On Dec 7, 2022, at 11:42, Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>
>>> On 12/07/22 11:34, Muchun Song wrote:
>>>>
>>>>
>>>>> On Nov 30, 2022, at 06:50, Sidhartha Kumar 
>>>>> <sidhartha.kumar@oracle.com> wrote:
>>>>>
>>>>> Add folio equivalents for set_compound_order() and 
>>>>> set_compound_page_dtor().
>>>>>
>>>>> Also remove extra new-lines introduced by mm/hugetlb: convert
>>>>> move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert
>>>>> hugetlb_cgroup_uncharge_page() to folios.
>>>>>
>>>>> Suggested-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>> Suggested-by: Muchun Song <songmuchun@bytedance.com>
>>>>> Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
>>>>> ---
>>>>> include/linux/mm.h | 16 ++++++++++++++++
>>>>> mm/hugetlb.c       |  4 +---
>>>>> 2 files changed, 17 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>> index a48c5ad16a5e..2bdef8a5298a 100644
>>>>> --- a/include/linux/mm.h
>>>>> +++ b/include/linux/mm.h
>>>>> @@ -972,6 +972,13 @@ static inline void 
>>>>> set_compound_page_dtor(struct page *page,
>>>>> page[1].compound_dtor = compound_dtor;
>>>>> }
>>>>>
>>>>> +static inline void folio_set_compound_dtor(struct folio *folio,
>>>>> + enum compound_dtor_id compound_dtor)
>>>>> +{
>>>>> + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
>>>>> + folio->_folio_dtor = compound_dtor;
>>>>> +}
>>>>> +
>>>>> void destroy_large_folio(struct folio *folio);
>>>>>
>>>>> static inline int head_compound_pincount(struct page *head)
>>>>> @@ -987,6 +994,15 @@ static inline void set_compound_order(struct 
>>>>> page *page, unsigned int order)
>>>>> #endif
>>>>> }
>>>>>
>>>>> +static inline void folio_set_compound_order(struct folio *folio,
>>>>> + unsigned int order)
>>>>> +{
>>>>> + folio->_folio_order = order;
>>>>> +#ifdef CONFIG_64BIT
>>>>> + folio->_folio_nr_pages = order ? 1U << order : 0;
>>>>
>>>> It seems that you think the user could pass 0 to order. However,
>>>> ->_folio_nr_pages and ->_folio_order fields are invalid for order-0 
>>>> pages.
>>>> You should not touch it. So this should be:
>>>>
>>>> static inline void folio_set_compound_order(struct folio *folio,
>>>>      unsigned int order)
>>>> {
>>>>     if (!folio_test_large(folio))
>>>>         return;
>>>>
>>>>     folio->_folio_order = order;
>>>> #ifdef CONFIG_64BIT
>>>>     folio->_folio_nr_pages = 1U << order;
>>>> #endif
>>>> }
>>>
>>> I believe this was changed to accommodate the code in
>>> __destroy_compound_gigantic_page().  It is used in a subsequent patch.
>>> Here is the v6.0 version of the routine.
>>
>> Thanks for your clarification.
>>
>>>
>>> static void __destroy_compound_gigantic_page(struct page *page,
>>> unsigned int order, bool demote)
>>> {
>>>     int i;
>>>     int nr_pages = 1 << order;
>>>     struct page *p = page + 1;
>>>
>>>     atomic_set(compound_mapcount_ptr(page), 0);
>>>     atomic_set(compound_pincount_ptr(page), 0);
>>>
>>>     for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
>>>         p->mapping = NULL;
>>>         clear_compound_head(p);
>>>         if (!demote)
>>>             set_page_refcounted(p);
>>>     }
>>>
>>>     set_compound_order(page, 0);
>>> #ifdef CONFIG_64BIT
>>>     page[1].compound_nr = 0;
>>> #endif
>>>     __ClearPageHead(page);
>>> }
>>>
>>>
>>> Might have been better to change this set_compound_order call to
>>> folio_set_compound_order in this patch.
>>>
>>
>> Agree. It has confused me a lot. I suggest changing the code to the
>> followings. The folio_test_large() check is still to avoid unexpected
>> users for OOB.
>>
>> static inline void folio_set_compound_order(struct folio *folio,
>>                         unsigned int order)
>> {
>>     VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>     // or
>>     // if (!folio_test_large(folio))
>>     //     return;
>>
>>     folio->_folio_order = order;
>> #ifdef CONFIG_64BIT
>>     folio->_folio_nr_pages = order ? 1U << order : 0;
>> #endif
>> }
>>
>> Thanks.
> 
> On mm-stable, dynamically allocating gigantic pages gives me:
> 
> [   23.625105] page:00000000ae27bd2a refcount:1 mapcount:0 
> mapping:0000000000000000 index:0x0 pfn:0x1800
> [   23.635117] flags: 0x17fffc00000000(node=0|zone=2|lastcpupid=0x1ffff)
> [   23.641969] raw: 0017fffc00000000 ffffa4edc489bb58 fffff784c6000048 
> 0000000000000000
> [   23.650141] raw: 0000000000000000 0000000000000000 00000001ffffffff 
> 0000000000000000
> [   23.657907] page dumped because: 
> VM_BUG_ON_FOLIO(!folio_test_large(folio))


static bool __prep_compound_gigantic_folio(struct folio *folio,
					unsigned int order, bool demote)
{
	int i, j;
	int nr_pages = 1 << order;
	struct page *p;

	/* we rely on prep_new_hugetlb_folio to set the destructor */
	folio_set_compound_order(folio, order);
	__folio_clear_reserved(folio);
	__folio_set_head(folio);

The problem looks to be because the head flag is set after the call to 
folio_set_compound_order() which looks for PG_head. I will test the fix 
of moving folio_set_compound_order() to after the __folio_set_head() call.

> [   23.663955] ------------[ cut here ]------------
> [   23.667680] kernel BUG at include/linux/mm.h:1030!
> [   23.673378] invalid opcode: 0000 [#1] PREEMPT SMP PTI
> [   23.681610] CPU: 3 PID: 1220 Comm: bash Not tainted 6.1.0-rc4+ #13
> [   23.690281] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.16.0-1.fc36 04/01/2014
> [   23.699837] RIP: 0010:__prep_compound_gigantic_folio+0x115/0x120
> [   23.706587] Code: c7 44 24 5c 00 00 00 00 5b 44 89 d0 5d 41 5c 41 5d 
> 41 5e c3 cc cc cc cc 48 83 e8 09
> [   23.725954] RSP: 0018:ffffa4edc489bc90 EFLAGS: 00010296
> [   23.731001] RAX: 000000000000003e RBX: ffffffffae39a720 RCX: 
> 0000000000000000
> [   23.736065] RDX: 0000000000000001 RSI: ffffffffad522c25 RDI: 
> 00000000ffffffff
> [   23.740866] RBP: 0000000000040000 R08: 0000000000000000 R09: 
> ffffa4edc489bb58
> [   23.745385] R10: 0000000000000003 R11: ffff926d7ff4b028 R12: 
> fffff784c6000000
> [   23.749464] R13: 0000000000040000 R14: fffff784c6000000 R15: 
> ffff9266039fb900
> [   23.753176] FS:  00007f69a9703740(0000) GS:ffff92696f8c0000(0000) 
> knlGS:0000000000000000
> [   23.758299] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   23.761275] CR2: 0000556789080640 CR3: 0000000105568005 CR4: 
> 0000000000770ee0
> [   23.764929] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [   23.768572] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> [   23.772221] PKRU: 55555554
> [   23.773696] Call Trace:
> [   23.776170]  <TASK>
> [   23.777258]  alloc_fresh_hugetlb_folio+0x14b/0x220
> [   23.779406]  alloc_pool_huge_page+0x7c/0x110
> [   23.781273]  __nr_hugepages_store_common+0x191/0x400
> [   23.783369]  ? __mod_memcg_lruvec_state+0x93/0x110
> [   23.785343]  ? __mod_lruvec_page_state+0xa6/0x1a0
> [   23.787223]  ? _copy_from_iter+0x8c/0x540
> [   23.788788]  ? __kmem_cache_alloc_node+0x13b/0x2a0
> [   23.790577]  ? kernfs_fop_write_iter+0x174/0x1f0
> [   23.792313]  nr_hugepages_store+0x57/0x70
> [   23.793754]  kernfs_fop_write_iter+0x11b/0x1f0
> [   23.795337]  vfs_write+0x1e1/0x3a0
> [   23.796531]  ksys_write+0x53/0xd0
> [   23.797690]  do_syscall_64+0x37/0x90
> [   23.798947]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> [   23.800602] RIP: 0033:0x7f69a9501977
> [   23.801798] Code: 0f 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 
> 1f 00 f3 0f 1e fa 64 8b 04 25 14
> [   23.807322] RSP: 002b:00007ffce87f3338 EFLAGS: 00000246 ORIG_RAX: 
> 0000000000000001
> [   23.809460] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 
> 00007f69a9501977
> [   23.811485] RDX: 0000000000000002 RSI: 00005652a970e5e0 RDI: 
> 0000000000000001
> [   23.813443] RBP: 00005652a970e5e0 R08: 0000000000000000 R09: 
> 0000000000000073
> [   23.815385] R10: 0000000000001000 R11: 0000000000000246 R12: 
> 0000000000000002
> [   23.817289] R13: 00007f69a95f9780 R14: 0000000000000002 R15: 
> 00007f69a95f49e0
> [   23.819176]  </TASK>
> [   23.819792] Modules linked in: intel_rapl_msr intel_rapl_common 
> intel_uncore_frequency_common isst_ir
> [   23.829056] ---[ end trace 0000000000000000 ]---
> [   23.830413] RIP: 0010:__prep_compound_gigantic_folio+0x115/0x120
> [   23.832390] Code: c7 44 24 5c 00 00 00 00 5b 44 89 d0 5d 41 5c 41 5d 
> 41 5e c3 cc cc cc cc 48 83 e8 09
> [   23.836787] RSP: 0018:ffffa4edc489bc90 EFLAGS: 00010296
> [   23.838083] RAX: 000000000000003e RBX: ffffffffae39a720 RCX: 
> 0000000000000000
> [   23.839764] RDX: 0000000000000001 RSI: ffffffffad522c25 RDI: 
> 00000000ffffffff
> [   23.841456] RBP: 0000000000040000 R08: 0000000000000000 R09: 
> ffffa4edc489bb58
> [   23.843150] R10: 0000000000000003 R11: ffff926d7ff4b028 R12: 
> fffff784c6000000
> [   23.844836] R13: 0000000000040000 R14: fffff784c6000000 R15: 
> ffff9266039fb900
> [   23.846521] FS:  00007f69a9703740(0000) GS:ffff92696f8c0000(0000) 
> knlGS:0000000000000000
> [   23.848417] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   23.849797] CR2: 0000556789080640 CR3: 0000000105568005 CR4: 
> 0000000000770ee0
> [   23.851491] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [   23.853181] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
> 0000000000000400
> [   23.854861] PKRU: 55555554
> 
> 
> Something's broken.
>
  

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a48c5ad16a5e..2bdef8a5298a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -972,6 +972,13 @@  static inline void set_compound_page_dtor(struct page *page,
 	page[1].compound_dtor = compound_dtor;
 }
 
+static inline void folio_set_compound_dtor(struct folio *folio,
+		enum compound_dtor_id compound_dtor)
+{
+	VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio);
+	folio->_folio_dtor = compound_dtor;
+}
+
 void destroy_large_folio(struct folio *folio);
 
 static inline int head_compound_pincount(struct page *head)
@@ -987,6 +994,15 @@  static inline void set_compound_order(struct page *page, unsigned int order)
 #endif
 }
 
+static inline void folio_set_compound_order(struct folio *folio,
+		unsigned int order)
+{
+	folio->_folio_order = order;
+#ifdef CONFIG_64BIT
+	folio->_folio_nr_pages = order ? 1U << order : 0;
+#endif
+}
+
 /* Returns the number of pages in this potentially compound page. */
 static inline unsigned long compound_nr(struct page *page)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c2fe7fec475..6390de8975c5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1780,7 +1780,7 @@  static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
 {
 	hugetlb_vmemmap_optimize(h, &folio->page);
 	INIT_LIST_HEAD(&folio->lru);
-	folio->_folio_dtor = HUGETLB_PAGE_DTOR;
+	folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR);
 	hugetlb_set_folio_subpool(folio, NULL);
 	set_hugetlb_cgroup(folio, NULL);
 	set_hugetlb_cgroup_rsvd(folio, NULL);
@@ -2938,7 +2938,6 @@  struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * a reservation exists for the allocation.
 	 */
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
-
 	if (!page) {
 		spin_unlock_irq(&hugetlb_lock);
 		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
@@ -7351,7 +7350,6 @@  void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 		int old_nid = folio_nid(old_folio);
 		int new_nid = folio_nid(new_folio);
 
-
 		folio_set_hugetlb_temporary(old_folio);
 		folio_clear_hugetlb_temporary(new_folio);