[1/2] mm: pgtable: add missing flag and statistics for kernel PTE page

Message ID f023a6687b9f2109401e7522b727aa4708dc05f1.1706774109.git.zhengqi.arch@bytedance.com
State New
Headers
Series [1/2] mm: pgtable: add missing flag and statistics for kernel PTE page |

Commit Message

Qi Zheng Feb. 1, 2024, 8:05 a.m. UTC
  For kernel PTE page, we do not need to allocate and initialize its split
ptlock, but as a page table page, it's still necessary to add PG_table
flag and NR_PAGETABLE statistics for it.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 include/asm-generic/pgalloc.h |  7 ++++++-
 include/linux/mm.h            | 21 ++++++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)
  

Comments

Muchun Song Feb. 2, 2024, 2:47 a.m. UTC | #1
> On Feb 1, 2024, at 16:05, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
> For kernel PTE page, we do not need to allocate and initialize its split
> ptlock, but as a page table page, it's still necessary to add PG_table
> flag and NR_PAGETABLE statistics for it.
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>

Reviewed-by: Muchun Song <muchun.song@linux.dev>

Thanks.
  
Mike Rapoport Feb. 4, 2024, 10:58 a.m. UTC | #2
On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
> For kernel PTE page, we do not need to allocate and initialize its split
> ptlock, but as a page table page, it's still necessary to add PG_table
> flag and NR_PAGETABLE statistics for it.
> 
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>  include/asm-generic/pgalloc.h |  7 ++++++-
>  include/linux/mm.h            | 21 ++++++++++++++++-----
>  2 files changed, 22 insertions(+), 6 deletions(-)

This should also update the architectures that define
__HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, otherwise the NR_PAGETABLE counts will
be wrong.

Another related thing is that many architectures have custom allocations
for early page tables and these would also benefit from NR_PAGETABLE
accounting.
 
> diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
> index 879e5f8aa5e9..908bd9140ac2 100644
> --- a/include/asm-generic/pgalloc.h
> +++ b/include/asm-generic/pgalloc.h
> @@ -23,6 +23,8 @@ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
>  
>  	if (!ptdesc)
>  		return NULL;
> +
> +	__pagetable_pte_ctor(ptdesc);
>  	return ptdesc_address(ptdesc);
>  }
>  
> @@ -46,7 +48,10 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
>   */
>  static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
>  {
> -	pagetable_free(virt_to_ptdesc(pte));
> +	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
> +
> +	__pagetable_pte_dtor(ptdesc);
> +	pagetable_free(ptdesc);
>  }
>  
>  /**
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e442fd0efdd9..e37db032764e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2922,26 +2922,37 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
>  static inline void ptlock_free(struct ptdesc *ptdesc) {}
>  #endif /* USE_SPLIT_PTE_PTLOCKS */
>  
> -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
> +static inline void __pagetable_pte_ctor(struct ptdesc *ptdesc)
>  {
>  	struct folio *folio = ptdesc_folio(ptdesc);
>  
> -	if (!ptlock_init(ptdesc))
> -		return false;
>  	__folio_set_pgtable(folio);
>  	lruvec_stat_add_folio(folio, NR_PAGETABLE);
> +}
> +
> +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
> +{
> +	if (!ptlock_init(ptdesc))
> +		return false;
> +
> +	__pagetable_pte_ctor(ptdesc);
>  	return true;
>  }
>  
> -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
> +static inline void __pagetable_pte_dtor(struct ptdesc *ptdesc)
>  {
>  	struct folio *folio = ptdesc_folio(ptdesc);
>  
> -	ptlock_free(ptdesc);
>  	__folio_clear_pgtable(folio);
>  	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
>  }
>  
> +static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
> +{
> +	ptlock_free(ptdesc);
> +	__pagetable_pte_dtor(ptdesc);
> +}
> +
>  pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
>  static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
>  {
> -- 
> 2.30.2
> 
>
  
Qi Zheng Feb. 4, 2024, 11:39 a.m. UTC | #3
Hi Mike,

On 2024/2/4 18:58, Mike Rapoport wrote:
> On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
>> For kernel PTE page, we do not need to allocate and initialize its split
>> ptlock, but as a page table page, it's still necessary to add PG_table
>> flag and NR_PAGETABLE statistics for it.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   include/asm-generic/pgalloc.h |  7 ++++++-
>>   include/linux/mm.h            | 21 ++++++++++++++++-----
>>   2 files changed, 22 insertions(+), 6 deletions(-)
> 
> This should also update the architectures that define
> __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, otherwise NR_PAGETABLE counts will get
> wrong.

Yes, this patchset only focuses on the generic implementation. For those
architectures that define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, some reuse
the generic __pte_alloc_one_kernel(), but some have their own customized
implementations, which indeed need to be fixed.

I wasn't familiar with those architectures and didn't investigate why
they couldn't reuse the generic __pte_alloc_one_kernel(), so I didn't
fix them. It would be better if the maintainers of those architectures
could help fix them. After all, they have a better understanding of the
historical background and have a testing environment. ;)

> 
> Another related thing is that many architectures have custom allocations
> for early page tables and these would also benefit from NR_PAGETABLE
> accounting.

Indeed, this is also a point that can be optimized.

Thanks.

>   
>> diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
>> index 879e5f8aa5e9..908bd9140ac2 100644
>> --- a/include/asm-generic/pgalloc.h
>> +++ b/include/asm-generic/pgalloc.h
>> @@ -23,6 +23,8 @@ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
>>   
>>   	if (!ptdesc)
>>   		return NULL;
>> +
>> +	__pagetable_pte_ctor(ptdesc);
>>   	return ptdesc_address(ptdesc);
>>   }
>>   
>> @@ -46,7 +48,10 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
>>    */
>>   static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
>>   {
>> -	pagetable_free(virt_to_ptdesc(pte));
>> +	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
>> +
>> +	__pagetable_pte_dtor(ptdesc);
>> +	pagetable_free(ptdesc);
>>   }
>>   
>>   /**
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index e442fd0efdd9..e37db032764e 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -2922,26 +2922,37 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
>>   static inline void ptlock_free(struct ptdesc *ptdesc) {}
>>   #endif /* USE_SPLIT_PTE_PTLOCKS */
>>   
>> -static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
>> +static inline void __pagetable_pte_ctor(struct ptdesc *ptdesc)
>>   {
>>   	struct folio *folio = ptdesc_folio(ptdesc);
>>   
>> -	if (!ptlock_init(ptdesc))
>> -		return false;
>>   	__folio_set_pgtable(folio);
>>   	lruvec_stat_add_folio(folio, NR_PAGETABLE);
>> +}
>> +
>> +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
>> +{
>> +	if (!ptlock_init(ptdesc))
>> +		return false;
>> +
>> +	__pagetable_pte_ctor(ptdesc);
>>   	return true;
>>   }
>>   
>> -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
>> +static inline void __pagetable_pte_dtor(struct ptdesc *ptdesc)
>>   {
>>   	struct folio *folio = ptdesc_folio(ptdesc);
>>   
>> -	ptlock_free(ptdesc);
>>   	__folio_clear_pgtable(folio);
>>   	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
>>   }
>>   
>> +static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
>> +{
>> +	ptlock_free(ptdesc);
>> +	__pagetable_pte_dtor(ptdesc);
>> +}
>> +
>>   pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
>>   static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
>>   {
>> -- 
>> 2.30.2
>>
>>
>
  
Mike Rapoport Feb. 4, 2024, 12:15 p.m. UTC | #4
On Sun, Feb 04, 2024 at 07:39:38PM +0800, Qi Zheng wrote:
> Hi Mike,
> 
> On 2024/2/4 18:58, Mike Rapoport wrote:
> > On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
> > > For kernel PTE page, we do not need to allocate and initialize its split
> > > ptlock, but as a page table page, it's still necessary to add PG_table
> > > flag and NR_PAGETABLE statistics for it.
> > > 
> > > Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> > > ---
> > >   include/asm-generic/pgalloc.h |  7 ++++++-
> > >   include/linux/mm.h            | 21 ++++++++++++++++-----
> > >   2 files changed, 22 insertions(+), 6 deletions(-)
> > 
> > This should also update the architectures that define
> > __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, otherwise NR_PAGETABLE counts will get
> > wrong.
> 
> Yes, this patchset only focuses on the generic implementation. For those
> architectures that define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, some reuse
> the generic __pte_alloc_one_kernel(), but some have their own customized
> implementations, which indeed need to be fixed.
> 
> I wasn't familiar with those architectures and didn't investigate why
> they couldn't reuse the generic __pte_alloc_one_kernel(), so I didn't
> fix them.

But with your patch NR_PAGETABLE will underflow e.g. on arm and it'd be a
regression for no good reason.

> It would be better if there are maintainers corresponding to
> the architecture who can help fix it. After all, they have a better
> understanding of the historical background and have a testing
> environment. ;)
  
Qi Zheng Feb. 4, 2024, 4:26 p.m. UTC | #5
On 2024/2/4 20:15, Mike Rapoport wrote:
> On Sun, Feb 04, 2024 at 07:39:38PM +0800, Qi Zheng wrote:
>> Hi Mike,
>>
>> On 2024/2/4 18:58, Mike Rapoport wrote:
>>> On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
>>>> For kernel PTE page, we do not need to allocate and initialize its split
>>>> ptlock, but as a page table page, it's still necessary to add PG_table
>>>> flag and NR_PAGETABLE statistics for it.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>>    include/asm-generic/pgalloc.h |  7 ++++++-
>>>>    include/linux/mm.h            | 21 ++++++++++++++++-----
>>>>    2 files changed, 22 insertions(+), 6 deletions(-)
>>>
>>> This should also update the architectures that define
>>> __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, otherwise NR_PAGETABLE counts will get
>>> wrong.
>>
>> Yes, this patchset only focuses on the generic implementation. For those
>> architectures that define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL, some reuse
>> the generic __pte_alloc_one_kernel(), but some have their own customized
>> implementations, which indeed need to be fixed.
>>
>> I wasn't familiar with those architectures and didn't investigate why
>> they couldn't reuse the generic __pte_alloc_one_kernel(), so I didn't
>> fix them.
> 
> But with your patch NR_PAGETABLE will underflow e.g. on arm and it'd be a
> regression for no good reason.

Oh, I see. Some architectures implement their own
pte_alloc_one_kernel() and do not call the generic
__pte_alloc_one_kernel(), but still reuse the generic pte_free_kernel().
So they need to be fixed together.

I will try to fix them and send a v2. But since I'm currently on
vacation, updates may not be quick.

Hi Andrew, please temporarily remove this patchset from
mm-unstable.

Thanks!

> 
>> It would be better if there are maintainers corresponding to
>> the architecture who can help fix it. After all, they have a better
>> understanding of the historical background and have a testing
>> environment. ;)
>
  
Matthew Wilcox Feb. 4, 2024, 6:51 p.m. UTC | #6
On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
> For kernel PTE page, we do not need to allocate and initialize its split
> ptlock, but as a page table page, it's still necessary to add PG_table
> flag and NR_PAGETABLE statistics for it.

No, this is wrong.

We do not account _kernel_ page tables to the _user_.  Just because
the kernel, say, called vmalloc() doesn't mean we should charge the
task for it.  Moreover, one task may call vmalloc() and a different task
would then call vfree().

This is a can of worms you don't want to open.  Why did you want to do
this?
  
Qi Zheng Feb. 5, 2024, 2:05 a.m. UTC | #7
Hi Matthew,

On 2024/2/5 02:51, Matthew Wilcox wrote:
> On Thu, Feb 01, 2024 at 04:05:40PM +0800, Qi Zheng wrote:
>> For kernel PTE page, we do not need to allocate and initialize its split
>> ptlock, but as a page table page, it's still necessary to add PG_table
>> flag and NR_PAGETABLE statistics for it.
> 
> No, this is wrong.
> 
> We do not account _kernel_ page tables to the _user_.  Just because
> the kernel, say, called vmalloc() doesn't mean we should charge the
> task for it.  Moreover, one task may call vmalloc() and a different task
> would then call vfree().
> 

Got it. Thanks for providing this information!

> This is a can of worms you don't want to open.  Why did you want to do
> this?

Ah, just because generic {pmd|pud}_alloc_one() has opened it. ;) And
when I looked through the commits (e.g. commit 1d40a5ea01d5), I couldn't
find the information you provided above. And that is why I CC'd you to
double check this, in case I might have overlooked some important
background information.

So we should actually fix generic {pmd|pud}_alloc_one() (and maybe some
implementation in the arch), right? And it would be better to add some
comments to clarify.

Thanks.
  

Patch

diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 879e5f8aa5e9..908bd9140ac2 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -23,6 +23,8 @@  static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
 
 	if (!ptdesc)
 		return NULL;
+
+	__pagetable_pte_ctor(ptdesc);
 	return ptdesc_address(ptdesc);
 }
 
@@ -46,7 +48,10 @@  static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
  */
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
-	pagetable_free(virt_to_ptdesc(pte));
+	struct ptdesc *ptdesc = virt_to_ptdesc(pte);
+
+	__pagetable_pte_dtor(ptdesc);
+	pagetable_free(ptdesc);
 }
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e442fd0efdd9..e37db032764e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2922,26 +2922,37 @@  static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
 static inline void ptlock_free(struct ptdesc *ptdesc) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
-static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+static inline void __pagetable_pte_ctor(struct ptdesc *ptdesc)
 {
 	struct folio *folio = ptdesc_folio(ptdesc);
 
-	if (!ptlock_init(ptdesc))
-		return false;
 	__folio_set_pgtable(folio);
 	lruvec_stat_add_folio(folio, NR_PAGETABLE);
+}
+
+static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
+{
+	if (!ptlock_init(ptdesc))
+		return false;
+
+	__pagetable_pte_ctor(ptdesc);
 	return true;
 }
 
-static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+static inline void __pagetable_pte_dtor(struct ptdesc *ptdesc)
 {
 	struct folio *folio = ptdesc_folio(ptdesc);
 
-	ptlock_free(ptdesc);
 	__folio_clear_pgtable(folio);
 	lruvec_stat_sub_folio(folio, NR_PAGETABLE);
 }
 
+static inline void pagetable_pte_dtor(struct ptdesc *ptdesc)
+{
+	ptlock_free(ptdesc);
+	__pagetable_pte_dtor(ptdesc);
+}
+
 pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
 static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
 {