[v4,35/36] mm: Convert do_set_pte() to set_pte_range()

Message ID 20230315051444.3229621-36-willy@infradead.org
State: New
Series: New page table range API

Commit Message

Matthew Wilcox March 15, 2023, 5:14 a.m. UTC
  From: Yin Fengwei <fengwei.yin@intel.com>

set_pte_range() allows the caller to set up page table entries for a
specific range.  It takes advantage of the batched rmap update for large
folios and now takes care of calling update_mmu_cache_range().

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 Documentation/filesystems/locking.rst |  2 +-
 include/linux/mm.h                    |  3 ++-
 mm/filemap.c                          |  3 +--
 mm/memory.c                           | 27 +++++++++++++++------------
 4 files changed, 19 insertions(+), 16 deletions(-)
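
In short, each call site's do_set_pte() + update_mmu_cache() pair becomes a single
set_pte_range() call covering "nr" consecutive pages of one folio. A minimal sketch
of the conversion (the wrapper function and its name are hypothetical; only the
set_pte_range() call and the commented-out old pattern come from the diff quoted
below, which still passes nr = 1 per page for now):

	/*
	 * Hypothetical helper: map 'nr' consecutive pages of 'folio',
	 * starting at 'page'/'addr'.  Assumes vmf->pte already points at
	 * the PTE slot for 'addr' and the page table lock is held, as in
	 * filemap_map_folio_range().
	 */
	static void example_map_folio_run(struct vm_fault *vmf, struct folio *folio,
					  struct page *page, unsigned int nr,
					  unsigned long addr)
	{
		/*
		 * Old pattern, one page at a time:
		 *
		 *	do_set_pte(vmf, page, addr);
		 *	update_mmu_cache(vmf->vma, addr, vmf->pte);
		 *
		 * New pattern: one call sets 'nr' PTEs, batches the rmap
		 * update for the folio and calls update_mmu_cache_range()
		 * itself.
		 */
		set_pte_range(vmf, folio, page, nr, addr);
	}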
  

Comments

Ryan Roberts March 15, 2023, 3:26 p.m. UTC | #1
On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
> From: Yin Fengwei <fengwei.yin@intel.com>
> 
> set_pte_range() allows to setup page table entries for a specific
> range.  It takes advantage of batched rmap update for large folio.
> It now takes care of calling update_mmu_cache_range().
> 
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  Documentation/filesystems/locking.rst |  2 +-
>  include/linux/mm.h                    |  3 ++-
>  mm/filemap.c                          |  3 +--
>  mm/memory.c                           | 27 +++++++++++++++------------
>  4 files changed, 19 insertions(+), 16 deletions(-)
> 
> diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
> index 7de7a7272a5e..922886fefb7f 100644
> --- a/Documentation/filesystems/locking.rst
> +++ b/Documentation/filesystems/locking.rst
> @@ -663,7 +663,7 @@ locked. The VM will unlock the page.
>  Filesystem should find and map pages associated with offsets from "start_pgoff"
>  till "end_pgoff". ->map_pages() is called with page table locked and must
>  not block.  If it's not possible to reach a page without blocking,
> -filesystem should skip it. Filesystem should use do_set_pte() to setup
> +filesystem should skip it. Filesystem should use set_pte_range() to setup
>  page table entry. Pointer to entry associated with the page is passed in
>  "pte" field in vm_fault structure. Pointers to entries for other offsets
>  should be calculated relative to "pte".
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ee755bb4e1c1..81788c985a8c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1299,7 +1299,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  }
>  
>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> +void set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		struct page *page, unsigned int nr, unsigned long addr);
>  
>  vm_fault_t finish_fault(struct vm_fault *vmf);
>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 6e2b0778db45..e2317623dcbf 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3504,8 +3504,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>  			ret = VM_FAULT_NOPAGE;
>  
>  		ref_count++;
> -		do_set_pte(vmf, page, addr);
> -		update_mmu_cache(vma, addr, vmf->pte);
> +		set_pte_range(vmf, folio, page, 1, addr);
>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>  
>  	/* Restore the vmf->pte */
> diff --git a/mm/memory.c b/mm/memory.c
> index 6aa21e8f3753..9a654802f104 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4274,7 +4274,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  }
>  #endif
>  
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		struct page *page, unsigned int nr, unsigned long addr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
> @@ -4282,7 +4283,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  	bool prefault = vmf->address != addr;

I think you are changing behavior here - is this intentional? Previously this
would be evaluated per page, now it's evaluated once for the whole range. The
intention below is that directly faulted pages are mapped young and prefaulted
pages are mapped old. But now the whole range will be mapped the same.

Thanks,
Ryan

>  	pte_t entry;
>  
> -	flush_icache_page(vma, page);
> +	flush_icache_pages(vma, page, nr);
>  	entry = mk_pte(page, vma->vm_page_prot);
>  
>  	if (prefault && arch_wants_old_prefaulted_pte())
> @@ -4296,14 +4297,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  		entry = pte_mkuffd_wp(entry);
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
> -		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
> -		page_add_new_anon_rmap(page, vma, addr);
> -		lru_cache_add_inactive_or_unevictable(page, vma);
> +		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
> +		VM_BUG_ON_FOLIO(nr != 1, folio);
> +		folio_add_new_anon_rmap(folio, vma, addr);
> +		folio_add_lru_vma(folio, vma);
>  	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +		folio_add_file_rmap_range(folio, page, nr, vma, false);
>  	}
> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
> +	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
> +
> +	/* no need to invalidate: a not-present page won't be cached */
> +	update_mmu_cache_range(vma, addr, vmf->pte, nr);
>  }
>  
>  static bool vmf_pte_changed(struct vm_fault *vmf)
> @@ -4376,11 +4381,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  
>  	/* Re-check under ptl */
>  	if (likely(!vmf_pte_changed(vmf))) {
> -		do_set_pte(vmf, page, vmf->address);
> -
> -		/* no need to invalidate: a not-present page won't be cached */
> -		update_mmu_cache(vma, vmf->address, vmf->pte);
> +		struct folio *folio = page_folio(page);
>  
> +		set_pte_range(vmf, folio, page, 1, vmf->address);
>  		ret = 0;
>  	} else {
>  		update_mmu_tlb(vma, vmf->address, vmf->pte);
  
Yin Fengwei March 16, 2023, 4:23 p.m. UTC | #2
On 3/15/2023 11:26 PM, Ryan Roberts wrote:
> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>> [...]
>> @@ -4282,7 +4283,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>  	bool prefault = vmf->address != addr;
> 
> I think you are changing behavior here - is this intentional? Previously this
> would be evaluated per page, now its evaluated once for the whole range. The
> intention below is that directly faulted pages are mapped young and prefaulted
> pages are mapped old. But now a whole range will be mapped the same.

Yes. You are right here.

Looking at prefault and cpu_has_hw_af for ARM64, it looks like we
can avoid handling vmf->address == addr specially. It's OK to
drop prefault and change the logic here a little bit to:
  if (arch_wants_old_prefaulted_pte())
      entry = pte_mkold(entry);
  else
      entry = pte_sw_mkyoung(entry);

It's not necessary to use pte_sw_mkyoung() for vmf->address == addr
because HW will set the ACCESS bit in the page table entry.

Adding Will Deacon in case I missed something here. Thanks.


Regards
Yin, Fengwei
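
[For clarity, a sketch of how the relevant part of set_pte_range() would look with
the change Yin suggests above folded in; illustration only, reusing the existing
mk_pte()/pte_mkold()/pte_sw_mkyoung()/arch_wants_old_prefaulted_pte() helpers rather
than anything from this patch.]

	entry = mk_pte(page, vma->vm_page_prot);
	/*
	 * No special-casing of vmf->address == addr: on arches with a cheap
	 * hardware access flag, map the whole batch old and let the hardware
	 * mark the accessed entries young; otherwise map everything young up
	 * front to avoid extra "old" -> "young" faults.
	 */
	if (arch_wants_old_prefaulted_pte())
		entry = pte_mkold(entry);
	else
		entry = pte_sw_mkyoung(entry);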

> 
> Thanks,
> Ryan
> 
  
Ryan Roberts March 16, 2023, 4:38 p.m. UTC | #3
On 16/03/2023 16:23, Yin, Fengwei wrote:
> 
> 
> On 3/15/2023 11:26 PM, Ryan Roberts wrote:
>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>> [...]
>>> @@ -4282,7 +4283,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>  	bool prefault = vmf->address != addr;
>>
>> I think you are changing behavior here - is this intentional? Previously this
>> would be evaluated per page, now its evaluated once for the whole range. The
>> intention below is that directly faulted pages are mapped young and prefaulted
>> pages are mapped old. But now a whole range will be mapped the same.
> 
> Yes. You are right here.
> 
> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
> can avoid to handle vmf->address == addr specially. It's OK to 
> drop prefault and change the logic here a little bit to:
>   if (arch_wants_old_prefaulted_pte())
>       entry = pte_mkold(entry);
>   else
>       entry = pte_sw_mkyong(entry);
> 
> It's not necessary to use pte_sw_mkyong for vmf->address == addr
> because HW will set the ACCESS bit in page table entry.
> 
> Add Will Deacon in case I missed something here. Thanks.

I'll defer to Will's response, but not all arm HW supports HW access flag
management. In that case it's done by SW, so I would imagine that by setting
this to old initially, we will get a second fault to set the access bit, which
will slow things down. I wonder if you will need to split this into (up to) 3
calls to set_ptes()?

> 
> 
> Regards
> Yin, Fengwei
> 
>>
>> Thanks,
>> Ryan
>>
  
Yin Fengwei March 16, 2023, 4:41 p.m. UTC | #4
On 3/17/2023 12:38 AM, Ryan Roberts wrote:
> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>
>>
>> On 3/15/2023 11:26 PM, Ryan Roberts wrote:
>>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>>> [...]
>>>> @@ -4282,7 +4283,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>>  	bool prefault = vmf->address != addr;
>>>
>>> I think you are changing behavior here - is this intentional? Previously this
>>> would be evaluated per page, now its evaluated once for the whole range. The
>>> intention below is that directly faulted pages are mapped young and prefaulted
>>> pages are mapped old. But now a whole range will be mapped the same.
>>
>> Yes. You are right here.
>>
>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>> can avoid to handle vmf->address == addr specially. It's OK to 
>> drop prefault and change the logic here a little bit to:
>>   if (arch_wants_old_prefaulted_pte())
>>       entry = pte_mkold(entry);
>>   else
>>       entry = pte_sw_mkyong(entry);
>>
>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>> because HW will set the ACCESS bit in page table entry.
>>
>> Add Will Deacon in case I missed something here. Thanks.
> 
> I'll defer to Will's response, but not all arm HW supports HW access flag
> management. In that case it's done by SW, so I would imagine that by setting
> this to old initially, we will get a second fault to set the access bit, which
> will slow things down. I wonder if you will need to split this into (up to) 3
> calls to set_ptes()?
If there is no HW access flag, arch_wants_old_prefaulted_pte() will return false,
so the path will go to pte_sw_mkyoung(entry). Right?


Regards
Yin, Fengwei

> 
>>
>>
>> Regards
>> Yin, Fengwei
>>
>>>
>>> Thanks,
>>> Ryan
>>>
  
Ryan Roberts March 16, 2023, 4:50 p.m. UTC | #5
On 16/03/2023 16:41, Yin, Fengwei wrote:
> 
> 
> On 3/17/2023 12:38 AM, Ryan Roberts wrote:
>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>
>>>
>>> On 3/15/2023 11:26 PM, Ryan Roberts wrote:
>>>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>>>> [...]
>>>>> @@ -4282,7 +4283,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>>>  	bool prefault = vmf->address != addr;
>>>>
>>>> I think you are changing behavior here - is this intentional? Previously this
>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>
>>> Yes. You are right here.
>>>
>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>> drop prefault and change the logic here a little bit to:
>>>   if (arch_wants_old_prefaulted_pte())
>>>       entry = pte_mkold(entry);
>>>   else
>>>       entry = pte_sw_mkyong(entry);
>>>
>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>> because HW will set the ACCESS bit in page table entry.
>>>
>>> Add Will Deacon in case I missed something here. Thanks.
>>
>> I'll defer to Will's response, but not all arm HW supports HW access flag
>> management. In that case it's done by SW, so I would imagine that by setting
>> this to old initially, we will get a second fault to set the access bit, which
>> will slow things down. I wonder if you will need to split this into (up to) 3
>> calls to set_ptes()?
> If no HW access flag, arch_wants_old_prefaulted_pte() will return false. So
> path will goto pte_sw_mkyong(entry). Right?

Oops... yes, I agree with you - disregard my previous comment.

> 
> 
> Regards
> Yin, Fengwei
> 
>>
>>>
>>>
>>> Regards
>>> Yin, Fengwei
>>>
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
  
Matthew Wilcox March 16, 2023, 5:52 p.m. UTC | #6
On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
> On 16/03/2023 16:23, Yin, Fengwei wrote:
> >> I think you are changing behavior here - is this intentional? Previously this
> >> would be evaluated per page, now its evaluated once for the whole range. The
> >> intention below is that directly faulted pages are mapped young and prefaulted
> >> pages are mapped old. But now a whole range will be mapped the same.
> > 
> > Yes. You are right here.
> > 
> > Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
> > can avoid to handle vmf->address == addr specially. It's OK to 
> > drop prefault and change the logic here a little bit to:
> >   if (arch_wants_old_prefaulted_pte())
> >       entry = pte_mkold(entry);
> >   else
> >       entry = pte_sw_mkyong(entry);
> > 
> > It's not necessary to use pte_sw_mkyong for vmf->address == addr
> > because HW will set the ACCESS bit in page table entry.
> > 
> > Add Will Deacon in case I missed something here. Thanks.
> 
> I'll defer to Will's response, but not all arm HW supports HW access flag
> management. In that case it's done by SW, so I would imagine that by setting
> this to old initially, we will get a second fault to set the access bit, which
> will slow things down. I wonder if you will need to split this into (up to) 3
> calls to set_ptes()?

I don't think we should do that.  The limited information I have from
various microarchitectures is that the PTEs must differ only in their
PFN bits in order to use larger TLB entries.  That includes the Accessed
bit (or equivalent).  So we should mkyoung all the PTEs in the same
folio, at least initially.

That said, we should still do this conditionally.  We'll prefault some
other folios too.  So I think this should be:

        bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
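
[Note: addr is a byte address while nr counts pages, so with the scaling made
explicit the intended check is presumably "the faulting address does not fall
inside this batch", something like the sketch below; this is an illustration of
the suggestion above, not final code.]

	bool prefault = vmf->address < addr ||
			vmf->address >= addr + nr * PAGE_SIZE;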
  
Yin Fengwei March 17, 2023, 1:58 a.m. UTC | #7
On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>> I think you are changing behavior here - is this intentional? Previously this
>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>
>>> Yes. You are right here.
>>>
>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>> drop prefault and change the logic here a little bit to:
>>>   if (arch_wants_old_prefaulted_pte())
>>>       entry = pte_mkold(entry);
>>>   else
>>>       entry = pte_sw_mkyong(entry);
>>>
>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>> because HW will set the ACCESS bit in page table entry.
>>>
>>> Add Will Deacon in case I missed something here. Thanks.
>>
>> I'll defer to Will's response, but not all arm HW supports HW access flag
>> management. In that case it's done by SW, so I would imagine that by setting
>> this to old initially, we will get a second fault to set the access bit, which
>> will slow things down. I wonder if you will need to split this into (up to) 3
>> calls to set_ptes()?
> 
> I don't think we should do that.  The limited information I have from
> various microarchitectures is that the PTEs must differ only in their
> PFN bits in order to use larger TLB entries.  That includes the Accessed
> bit (or equivalent).  So we should mkyoung all the PTEs in the same
> folio, at least initially.
> 
> That said, we should still do this conditionally.  We'll prefault some
> other folios too.  So I think this should be:
> 
>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
> 
According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if the hardware access
flag is supported on ARM64, there is a benefit to setting prefaulted PTEs "old".
If we compute prefault as above, those PTEs are set "young", which loses that benefit
on ARM64 with the hardware access flag.

OTOH, if going from "old" to "young" is cheap, why not leave all PTEs of the folio "old"
and let the hardware update them to "young"?

Regards
Yin, Fengwei
  
Matthew Wilcox March 17, 2023, 3:44 a.m. UTC | #8
On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
> 
> 
> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
> > On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
> >> On 16/03/2023 16:23, Yin, Fengwei wrote:
> >>>> I think you are changing behavior here - is this intentional? Previously this
> >>>> would be evaluated per page, now its evaluated once for the whole range. The
> >>>> intention below is that directly faulted pages are mapped young and prefaulted
> >>>> pages are mapped old. But now a whole range will be mapped the same.
> >>>
> >>> Yes. You are right here.
> >>>
> >>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
> >>> can avoid to handle vmf->address == addr specially. It's OK to 
> >>> drop prefault and change the logic here a little bit to:
> >>>   if (arch_wants_old_prefaulted_pte())
> >>>       entry = pte_mkold(entry);
> >>>   else
> >>>       entry = pte_sw_mkyong(entry);
> >>>
> >>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
> >>> because HW will set the ACCESS bit in page table entry.
> >>>
> >>> Add Will Deacon in case I missed something here. Thanks.
> >>
> >> I'll defer to Will's response, but not all arm HW supports HW access flag
> >> management. In that case it's done by SW, so I would imagine that by setting
> >> this to old initially, we will get a second fault to set the access bit, which
> >> will slow things down. I wonder if you will need to split this into (up to) 3
> >> calls to set_ptes()?
> > 
> > I don't think we should do that.  The limited information I have from
> > various microarchitectures is that the PTEs must differ only in their
> > PFN bits in order to use larger TLB entries.  That includes the Accessed
> > bit (or equivalent).  So we should mkyoung all the PTEs in the same
> > folio, at least initially.
> > 
> > That said, we should still do this conditionally.  We'll prefault some
> > other folios too.  So I think this should be:
> > 
> >         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
> > 
> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
> If we change prefault like above, the PTEs is set as "yong" which loose benefit
> on ARM64 with hardware access flag.
> 
> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
> and let hardware to update it to "yong"?

Because we're tracking the entire folio as a single entity.  So we're
better off avoiding the extra pagefaults to update the accessed bit,
which won't actually give us any information (vmscan needs to know "were
any of the accessed bits set", not "how many of them were set").

Anyway, hopefully Ryan can test this and let us know if it fixes the
regression he sees.
  
Yin Fengwei March 17, 2023, 6:33 a.m. UTC | #9
On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>
>>
>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>
>>>>> Yes. You are right here.
>>>>>
>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>> drop prefault and change the logic here a little bit to:
>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>       entry = pte_mkold(entry);
>>>>>   else
>>>>>       entry = pte_sw_mkyong(entry);
>>>>>
>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>
>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>
>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>> this to old initially, we will get a second fault to set the access bit, which
>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>> calls to set_ptes()?
>>>
>>> I don't think we should do that.  The limited information I have from
>>> various microarchitectures is that the PTEs must differ only in their
>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>> folio, at least initially.
>>>
>>> That said, we should still do this conditionally.  We'll prefault some
>>> other folios too.  So I think this should be:
>>>
>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>
>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>> on ARM64 with hardware access flag.
>>
>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>> and let hardware to update it to "yong"?
> 
> Because we're tracking the entire folio as a single entity.  So we're
> better off avoiding the extra pagefaults to update the accessed bit,
> which won't actually give us any information (vmscan needs to know "were
> any of the accessed bits set", not "how many of them were set").
There are no extra pagefaults to update the accessed bit. There are three cases here:
1. hardware supports the access flag and "old" to "young" is cheap, with no extra fault
2. hardware supports the access flag and "old" to "young" is expensive, with no extra fault
3. no hardware access flag support (extra pagefaults from "old" to "young"; expensive)

For #2 and #3, going from "old" to "young" is expensive, so we always set PTEs "young" at
page fault time.
For #1, going from "old" to "young" is cheap, so it's OK to set PTEs "old" at page fault
time and let the hardware set them "young" on access. Actually, ARM64 with the hardware
access bit requires setting PTEs "old".

> 
> Anyway, hopefully Ryan can test this and let us know if it fixes the
> regression he sees.
I highly suspect the regression Ryan saw is not related to this but to some other
stupid work of mine. I will send out the testing patch soon. Thanks.


Regards
Yin, Fengwei
  
Ryan Roberts March 17, 2023, 8 a.m. UTC | #10
On 17/03/2023 06:33, Yin, Fengwei wrote:
> 
> 
> On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
>> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>>
>>>
>>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>>
>>>>>> Yes. You are right here.
>>>>>>
>>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>>> drop prefault and change the logic here a little bit to:
>>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>>       entry = pte_mkold(entry);
>>>>>>   else
>>>>>>       entry = pte_sw_mkyong(entry);
>>>>>>
>>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>>
>>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>>
>>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>>> this to old initially, we will get a second fault to set the access bit, which
>>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>>> calls to set_ptes()?
>>>>
>>>> I don't think we should do that.  The limited information I have from
>>>> various microarchitectures is that the PTEs must differ only in their
>>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>>> folio, at least initially.
>>>>
>>>> That said, we should still do this conditionally.  We'll prefault some
>>>> other folios too.  So I think this should be:
>>>>
>>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>>
>>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>>> on ARM64 with hardware access flag.
>>>
>>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>>> and let hardware to update it to "yong"?
>>
>> Because we're tracking the entire folio as a single entity.  So we're
>> better off avoiding the extra pagefaults to update the accessed bit,
>> which won't actually give us any information (vmscan needs to know "were
>> any of the accessed bits set", not "how many of them were set").
> There is no extra pagefaults to update the accessed bit. There are three cases here:
> 1. hardware support access flag and cheap from "old" to "yong" without extra fault
> 2. hardware support access flag and expensive from "old" to "yong" without extra fault
> 3. no hardware support access flag (extra pagefaults from "old" to "yong". Expensive)
> 
> For #2 and #3, it's expensive from "old" to "yong", so we always set PTEs "yong" in
> page fault.
> For #1, It's cheap from "old" to "yong", so it's OK to set PTEs "old" in page fault.
> And hardware will set it to "yong" when access memory. Actually, ARM64 with hardware
> access bit requires to set PTEs "old".

Your logic makes sense, but it doesn't take into account the HPA
micro-architectural feature present in some ARM CPUs. HPA can transparently
coalesce multiple pages into a single TLB entry when certain conditions are met
(roughly: up to 4 pages physically and virtually contiguous and all within a
4-page natural alignment). But as Matthew says, this works out better when all
pte attributes (including access and dirty) match. Given that the reason for setting
the prefault pages to old is so that vmscan can do a better job of finding cold
pages, and given that vmscan will now be looking for folios and not individual pages
(I assume?), I agree with Matthew that we should make whole folios young or old.
It will marginally increase our chances of the access and dirty bits being
consistent across the whole 4-page block that the HW tries to coalesce. If we
unconditionally make everything old, the HW will set the accessed bit only for the
single page that faulted, and we therefore don't have consistency for that 4-page block.

> 
>>
>> Anyway, hopefully Ryan can test this and let us know if it fixes the
>> regression he sees.
> I highly suspect the regression Ryan saw is not related with this but another my
> stupid work. I will send out the testing patch soon. Thanks.

I tested a version of this where I made everything unconditionally young,
thinking it might be the source of the perf regression, before I reported it. It
doesn't make any difference. So I agree the regression is somewhere else.

Thanks,
Ryan

> 
> 
> Regards
> Yin, Fengwei
  
Yin Fengwei March 17, 2023, 8:19 a.m. UTC | #11
On 3/17/2023 4:00 PM, Ryan Roberts wrote:
> On 17/03/2023 06:33, Yin, Fengwei wrote:
>>
>>
>> On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
>>> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>>>
>>>>
>>>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>>>
>>>>>>> Yes. You are right here.
>>>>>>>
>>>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>>>> drop prefault and change the logic here a little bit to:
>>>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>>>       entry = pte_mkold(entry);
>>>>>>>   else
>>>>>>>       entry = pte_sw_mkyong(entry);
>>>>>>>
>>>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>>>
>>>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>>>
>>>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>>>> this to old initially, we will get a second fault to set the access bit, which
>>>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>>>> calls to set_ptes()?
>>>>>
>>>>> I don't think we should do that.  The limited information I have from
>>>>> various microarchitectures is that the PTEs must differ only in their
>>>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>>>> folio, at least initially.
>>>>>
>>>>> That said, we should still do this conditionally.  We'll prefault some
>>>>> other folios too.  So I think this should be:
>>>>>
>>>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>>>
>>>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>>>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>>>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>>>> on ARM64 with hardware access flag.
>>>>
>>>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>>>> and let hardware to update it to "yong"?
>>>
>>> Because we're tracking the entire folio as a single entity.  So we're
>>> better off avoiding the extra pagefaults to update the accessed bit,
>>> which won't actually give us any information (vmscan needs to know "were
>>> any of the accessed bits set", not "how many of them were set").
>> There is no extra pagefaults to update the accessed bit. There are three cases here:
>> 1. hardware support access flag and cheap from "old" to "yong" without extra fault
>> 2. hardware support access flag and expensive from "old" to "yong" without extra fault
>> 3. no hardware support access flag (extra pagefaults from "old" to "yong". Expensive)
>>
>> For #2 and #3, it's expensive from "old" to "yong", so we always set PTEs "yong" in
>> page fault.
>> For #1, It's cheap from "old" to "yong", so it's OK to set PTEs "old" in page fault.
>> And hardware will set it to "yong" when access memory. Actually, ARM64 with hardware
>> access bit requires to set PTEs "old".
> 
> Your logic makes sense, but it doesn't take into account the HPA
> micro-architectural feature present in some ARM CPUs. HPA can transparently
> coalesce multiple pages into a single TLB entry when certain conditions are met
> (roughly; upto 4 pages physically and virtually contiguous and all within a
> 4-page natural alignment). But as Matthew says, this works out better when all
> pte attributes (including access and dirty) match. Given the reason for setting
> the prefault pages to old is so that vmscan can do a better job of finding cold
> pages, and given vmscan will now be looking for folios and not individual pages
> (I assume?), I agree with Matthew that we should make whole folios young or old.
> It will marginally increase our chances of the access and dirty bits being
> consistent across the whole 4-page block that the HW tries to coalesce. If we
> unconditionally make everything old, the hw will set accessed for the single
> page that faulted, and we therefore don't have consistency for that 4-page block.
My concern was that the benefit of "old" PTEs on ARM64 with the hardware access bit
will be lost. The workloads (application launch latency and direct reclaim, according
to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80) could show a regression with this
series. Thanks.

BTW, with the TLB coalescing feature, does the hardware update the access bits of the
coalesced pages together? Otherwise, only the access bit of the one page that was
touched may end up being set by the hardware.

Regards
Yin, Fengwei

> 
>>
>>>
>>> Anyway, hopefully Ryan can test this and let us know if it fixes the
>>> regression he sees.
>> I highly suspect the regression Ryan saw is not related with this but another my
>> stupid work. I will send out the testing patch soon. Thanks.
> 
> I tested a version of this where I made everything unconditionally young,
> thinking it might be the source of the perf regression, before I reported it. It
> doesn't make any difference. So I agree the regression is somewhere else.
> 
> Thanks,
> Ryan
> 
>>
>>
>> Regards
>> Yin, Fengwei
>
  
Ryan Roberts March 17, 2023, 1 p.m. UTC | #12
On 17/03/2023 08:19, Yin, Fengwei wrote:
> 
> 
> On 3/17/2023 4:00 PM, Ryan Roberts wrote:
>> On 17/03/2023 06:33, Yin, Fengwei wrote:
>>>
>>>
>>> On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
>>>> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>>>>
>>>>>
>>>>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>>>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>>>>
>>>>>>>> Yes. You are right here.
>>>>>>>>
>>>>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>>>>> drop prefault and change the logic here a little bit to:
>>>>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>>>>       entry = pte_mkold(entry);
>>>>>>>>   else
>>>>>>>>       entry = pte_sw_mkyong(entry);
>>>>>>>>
>>>>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>>>>
>>>>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>>>>
>>>>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>>>>> this to old initially, we will get a second fault to set the access bit, which
>>>>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>>>>> calls to set_ptes()?
>>>>>>
>>>>>> I don't think we should do that.  The limited information I have from
>>>>>> various microarchitectures is that the PTEs must differ only in their
>>>>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>>>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>>>>> folio, at least initially.
>>>>>>
>>>>>> That said, we should still do this conditionally.  We'll prefault some
>>>>>> other folios too.  So I think this should be:
>>>>>>
>>>>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>>>>
>>>>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>>>>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>>>>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>>>>> on ARM64 with hardware access flag.
>>>>>
>>>>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>>>>> and let hardware to update it to "yong"?
>>>>
>>>> Because we're tracking the entire folio as a single entity.  So we're
>>>> better off avoiding the extra pagefaults to update the accessed bit,
>>>> which won't actually give us any information (vmscan needs to know "were
>>>> any of the accessed bits set", not "how many of them were set").
>>> There is no extra pagefaults to update the accessed bit. There are three cases here:
>>> 1. hardware support access flag and cheap from "old" to "yong" without extra fault
>>> 2. hardware support access flag and expensive from "old" to "yong" without extra fault
>>> 3. no hardware support access flag (extra pagefaults from "old" to "yong". Expensive)
>>>
>>> For #2 and #3, it's expensive from "old" to "yong", so we always set PTEs "yong" in
>>> page fault.
>>> For #1, It's cheap from "old" to "yong", so it's OK to set PTEs "old" in page fault.
>>> And hardware will set it to "yong" when access memory. Actually, ARM64 with hardware
>>> access bit requires to set PTEs "old".
>>
>> Your logic makes sense, but it doesn't take into account the HPA
>> micro-architectural feature present in some ARM CPUs. HPA can transparently
>> coalesce multiple pages into a single TLB entry when certain conditions are met
>> (roughly; upto 4 pages physically and virtually contiguous and all within a
>> 4-page natural alignment). But as Matthew says, this works out better when all
>> pte attributes (including access and dirty) match. Given the reason for setting
>> the prefault pages to old is so that vmscan can do a better job of finding cold
>> pages, and given vmscan will now be looking for folios and not individual pages
>> (I assume?), I agree with Matthew that we should make whole folios young or old.
>> It will marginally increase our chances of the access and dirty bits being
>> consistent across the whole 4-page block that the HW tries to coalesce. If we
>> unconditionally make everything old, the hw will set accessed for the single
>> page that faulted, and we therefore don't have consistency for that 4-page block.
> My concern was that the benefit of "old" PTEs for ARM64 with hardware access bit
> will be lost. The workloads (application launch latency and direct reclaim according
> to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80) can show regression with this
> series. Thanks.

My (potentially incorrect) understanding is that the prefaulted ptes were
marked old because it made it easier/quicker for vmscan to identify those
prefaulted pages and reclaim them under memory pressure. I
_assume_ now that we have large folios, that vmscan will be trying to pick
folios for reclaim, not individual subpages within the folio? In which case,
vmscan will only consider the folio as old if _all_ pages within are old. So
marking all the pages of a folio young vs marking 1 page in the folio young
won't make a difference from this perspective. But it will make a difference
from the perspective of HPA. (Please Matthew or somebody else, correct me if my
understanding is incorrect!)

> 
> BTW, with TLB merge feature, should hardware update coalesce multiple pages access
> bit together? otherwise, it's avoidable that only one page access is set by hardware
> finally.

No, the HW will only update the access flag for the single page that is
accessed. So yes, in the long run the value of the flags across the 4-page block
will diverge - that's why I said "marginal" above.
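
To make the "4-page block" condition concrete, a rough model of the check --
illustrative only, not a real kernel interface, and the exact
micro-architectural rules are implementation-defined -- might look like:

static bool hpa_block_coalescible(unsigned long addr, pte_t *ptep)
{
        unsigned long pfn = pte_pfn(ptep[0]);
        int i;

        /* 4-page natural alignment (virtual contiguity is implied here) */
        if (addr & (4 * PAGE_SIZE - 1))
                return false;

        for (i = 1; i < 4; i++) {
                /* physically contiguous */
                if (pte_pfn(ptep[i]) != pfn + i)
                        return false;
                /* attributes, including access and dirty, must match */
                if (pte_young(ptep[i]) != pte_young(ptep[0]) ||
                    pte_dirty(ptep[i]) != pte_dirty(ptep[0]))
                        return false;
        }
        return true;
}

Since hardware sets the access flag only for the page actually touched, the
pte_young() comparison above is what eventually stops holding.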

> 
> Regards
> Yin, Fengwei
> 
>>
>>>
>>>>
>>>> Anyway, hopefully Ryan can test this and let us know if it fixes the
>>>> regression he sees.
>>> I highly suspect the regression Ryan saw is not related with this but another my
>>> stupid work. I will send out the testing patch soon. Thanks.
>>
>> I tested a version of this where I made everything unconditionally young,
>> thinking it might be the source of the perf regression, before I reported it. It
>> doesn't make any difference. So I agree the regression is somewhere else.
>>
>> Thanks,
>> Ryan
>>
>>>
>>>
>>> Regards
>>> Yin, Fengwei
>>
  
Yin Fengwei March 17, 2023, 1:44 p.m. UTC | #13
On 3/17/2023 9:00 PM, Ryan Roberts wrote:
> On 17/03/2023 08:19, Yin, Fengwei wrote:
>>
>>
>> On 3/17/2023 4:00 PM, Ryan Roberts wrote:
>>> On 17/03/2023 06:33, Yin, Fengwei wrote:
>>>>
>>>>
>>>> On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
>>>>> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>>>>>
>>>>>>
>>>>>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>>>>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>>>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>>>>>
>>>>>>>>> Yes. You are right here.
>>>>>>>>>
>>>>>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>>>>>> drop prefault and change the logic here a little bit to:
>>>>>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>>>>>       entry = pte_mkold(entry);
>>>>>>>>>   else
>>>>>>>>>       entry = pte_sw_mkyong(entry);
>>>>>>>>>
>>>>>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>>>>>
>>>>>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>>>>>
>>>>>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>>>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>>>>>> this to old initially, we will get a second fault to set the access bit, which
>>>>>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>>>>>> calls to set_ptes()?
>>>>>>>
>>>>>>> I don't think we should do that.  The limited information I have from
>>>>>>> various microarchitectures is that the PTEs must differ only in their
>>>>>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>>>>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>>>>>> folio, at least initially.
>>>>>>>
>>>>>>> That said, we should still do this conditionally.  We'll prefault some
>>>>>>> other folios too.  So I think this should be:
>>>>>>>
>>>>>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>>>>>
>>>>>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>>>>>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>>>>>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>>>>>> on ARM64 with hardware access flag.
>>>>>>
>>>>>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>>>>>> and let hardware to update it to "yong"?
>>>>>
>>>>> Because we're tracking the entire folio as a single entity.  So we're
>>>>> better off avoiding the extra pagefaults to update the accessed bit,
>>>>> which won't actually give us any information (vmscan needs to know "were
>>>>> any of the accessed bits set", not "how many of them were set").
>>>> There is no extra pagefaults to update the accessed bit. There are three cases here:
>>>> 1. hardware support access flag and cheap from "old" to "yong" without extra fault
>>>> 2. hardware support access flag and expensive from "old" to "yong" without extra fault
>>>> 3. no hardware support access flag (extra pagefaults from "old" to "yong". Expensive)
>>>>
>>>> For #2 and #3, it's expensive from "old" to "yong", so we always set PTEs "yong" in
>>>> page fault.
>>>> For #1, It's cheap from "old" to "yong", so it's OK to set PTEs "old" in page fault.
>>>> And hardware will set it to "yong" when access memory. Actually, ARM64 with hardware
>>>> access bit requires to set PTEs "old".
>>>
>>> Your logic makes sense, but it doesn't take into account the HPA
>>> micro-architectural feature present in some ARM CPUs. HPA can transparently
>>> coalesce multiple pages into a single TLB entry when certain conditions are met
>>> (roughly; upto 4 pages physically and virtually contiguous and all within a
>>> 4-page natural alignment). But as Matthew says, this works out better when all
>>> pte attributes (including access and dirty) match. Given the reason for setting
>>> the prefault pages to old is so that vmscan can do a better job of finding cold
>>> pages, and given vmscan will now be looking for folios and not individual pages
>>> (I assume?), I agree with Matthew that we should make whole folios young or old.
>>> It will marginally increase our chances of the access and dirty bits being
>>> consistent across the whole 4-page block that the HW tries to coalesce. If we
>>> unconditionally make everything old, the hw will set accessed for the single
>>> page that faulted, and we therefore don't have consistency for that 4-page block.
>> My concern was that the benefit of "old" PTEs for ARM64 with hardware access bit
>> will be lost. The workloads (application launch latency and direct reclaim according
>> to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80) can show regression with this
>> series. Thanks.
> 
> My (potentially incorrect) understanding of the reason that marking the
> prefaulted ptes as old was because it made it easier/quicker for vmscan to
> identify those prefaulted pages and reclaim them under memory pressure. I
> _assume_ now that we have large folios, that vmscan will be trying to pick
> folios for reclaim, not individual subpages within the folio? In which case,
> vmscan will only consider the folio as old if _all_ pages within are old. So
> marking all the pages of a folio young vs marking 1 page in the folio young
> won't make a difference from this perspective. But it will make a difference
> from the perspective a HPA. (Please Matthew or somebody else, correct me if my
> understanding is incorrect!)
Thanks a lot for your patient explanation. I got the point here. For the first
access, we mark all the PTEs of the folio "young", so later accesses can
benefit from a coalesced (large) TLB entry.


Regards
Yin, Fengwei

> 
>>
>> BTW, with TLB merge feature, should hardware update coalesce multiple pages access
>> bit together? otherwise, it's avoidable that only one page access is set by hardware
>> finally.
> 
> No, the HW will only update the access flag for the single page that is
> accessed. So yes, in the long run the value of the flags across the 4-page block
> will diverge - that's why I said "marginal" above.
> 
>>
>> Regards
>> Yin, Fengwei
>>
>>>
>>>>
>>>>>
>>>>> Anyway, hopefully Ryan can test this and let us know if it fixes the
>>>>> regression he sees.
>>>> I highly suspect the regression Ryan saw is not related with this but another my
>>>> stupid work. I will send out the testing patch soon. Thanks.
>>>
>>> I tested a version of this where I made everything unconditionally young,
>>> thinking it might be the source of the perf regression, before I reported it. It
>>> doesn't make any difference. So I agree the regression is somewhere else.
>>>
>>> Thanks,
>>> Ryan
>>>
>>>>
>>>>
>>>> Regards
>>>> Yin, Fengwei
>>>
>
  
Yin Fengwei March 20, 2023, 1:38 p.m. UTC | #14
Hi Matthew,

On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
>>
>>
>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
>>>>>> I think you are changing behavior here - is this intentional? Previously this
>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
>>>>>> pages are mapped old. But now a whole range will be mapped the same.
>>>>>
>>>>> Yes. You are right here.
>>>>>
>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
>>>>> drop prefault and change the logic here a little bit to:
>>>>>   if (arch_wants_old_prefaulted_pte())
>>>>>       entry = pte_mkold(entry);
>>>>>   else
>>>>>       entry = pte_sw_mkyong(entry);
>>>>>
>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
>>>>> because HW will set the ACCESS bit in page table entry.
>>>>>
>>>>> Add Will Deacon in case I missed something here. Thanks.
>>>>
>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
>>>> management. In that case it's done by SW, so I would imagine that by setting
>>>> this to old initially, we will get a second fault to set the access bit, which
>>>> will slow things down. I wonder if you will need to split this into (up to) 3
>>>> calls to set_ptes()?
>>>
>>> I don't think we should do that.  The limited information I have from
>>> various microarchitectures is that the PTEs must differ only in their
>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
>>> folio, at least initially.
>>>
>>> That said, we should still do this conditionally.  We'll prefault some
>>> other folios too.  So I think this should be:
>>>
>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
>>>
>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
>> on ARM64 with hardware access flag.
>>
>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
>> and let hardware to update it to "yong"?
> 
> Because we're tracking the entire folio as a single entity.  So we're
> better off avoiding the extra pagefaults to update the accessed bit,
> which won't actually give us any information (vmscan needs to know "were
> any of the accessed bits set", not "how many of them were set").
> 
> Anyway, hopefully Ryan can test this and let us know if it fixes the
> regression he sees.

Thanks a lot to Ryan for helping to test the debug patch I made.

Ryan confirmed that the following change could fix the kernel build regression:
diff --git a/mm/filemap.c b/mm/filemap.c
index db86e459dde6..343d6ff36b2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3557,7 +3557,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,

                ret |= filemap_map_folio_range(vmf, folio,
                                xas.xa_index - folio->index, addr, nr_pages);
-               xas.xa_index += nr_pages;
+               xas.xa_index += folio_test_large(folio) ? nr_pages : 0;

                folio_unlock(folio);
                folio_put(folio);

I will make the upstream-able change "xas.xa_index += nr_pages - 1;".
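
For reference, the upstream-able variant would presumably be the same hunk
with only the increment changed (if I read the xas iteration right, the
following xas_next_entry() already advances the index by one slot, so adding
the full nr_pages skips the next entry for order-0 folios):

@@ -3557,7 +3557,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,

                ret |= filemap_map_folio_range(vmf, folio,
                                xas.xa_index - folio->index, addr, nr_pages);
-               xas.xa_index += nr_pages;
+               xas.xa_index += nr_pages - 1;

                folio_unlock(folio);
                folio_put(folio);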

Ryan and I also identified some other changes that are needed. I am not sure
how to integrate those changes into this series. Maybe as add-on patches after
this series? Thanks.

Regards
Yin, Fengwei
  
Matthew Wilcox March 20, 2023, 2:08 p.m. UTC | #15
On Mon, Mar 20, 2023 at 09:38:55PM +0800, Yin, Fengwei wrote:
> Thanks a lot to Ryan for helping to test the debug patch I made.
> 
> Ryan confirmed that the following change could fix the kernel build regression:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index db86e459dde6..343d6ff36b2c 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3557,7 +3557,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
> 
>                 ret |= filemap_map_folio_range(vmf, folio,
>                                 xas.xa_index - folio->index, addr, nr_pages);
> -               xas.xa_index += nr_pages;
> +               xas.xa_index += folio_test_large(folio) ? nr_pages : 0;
> 
>                 folio_unlock(folio);
>                 folio_put(folio);
> 
> I will make upstream-able change as "xas.xa_index += nr_pages - 1;"

Thanks to both of you!

Really, we shouldn't need to interfere with xas.xa_index at all.
Does this work?

diff --git a/mm/filemap.c b/mm/filemap.c
index 8e4f95c5b65a..e40c967dde5f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3420,10 +3420,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
 	return false;
 }
 
-static struct folio *next_uptodate_page(struct folio *folio,
-				       struct address_space *mapping,
-				       struct xa_state *xas, pgoff_t end_pgoff)
+static struct folio *next_uptodate_folio(struct xa_state *xas,
+		struct address_space *mapping, pgoff_t end_pgoff)
 {
+	struct folio *folio = xas_next_entry(xas, end_pgoff);
 	unsigned long max_idx;
 
 	do {
@@ -3461,22 +3461,6 @@ static struct folio *next_uptodate_page(struct folio *folio,
 	return NULL;
 }
 
-static inline struct folio *first_map_page(struct address_space *mapping,
-					  struct xa_state *xas,
-					  pgoff_t end_pgoff)
-{
-	return next_uptodate_page(xas_find(xas, end_pgoff),
-				  mapping, xas, end_pgoff);
-}
-
-static inline struct folio *next_map_page(struct address_space *mapping,
-					 struct xa_state *xas,
-					 pgoff_t end_pgoff)
-{
-	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
-				  mapping, xas, end_pgoff);
-}
-
 /*
  * Map page range [start_page, start_page + nr_pages) of folio.
  * start_page is gotten from start by folio_page(folio, start)
@@ -3552,7 +3536,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	int nr_pages = 0;
 
 	rcu_read_lock();
-	folio = first_map_page(mapping, &xas, end_pgoff);
+	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
 	if (!folio)
 		goto out;
 
@@ -3574,11 +3558,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 
 		ret |= filemap_map_folio_range(vmf, folio,
 				xas.xa_index - folio->index, addr, nr_pages);
-		xas.xa_index += nr_pages;
 
 		folio_unlock(folio);
 		folio_put(folio);
-	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+		folio = next_uptodate_folio(&xas, mapping, end_pgoff);
+	} while (folio);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
 	rcu_read_unlock();

> Ryan and I also identify some other changes needed. I am not sure how to
> integrate those changes to this series. Maybe an add-on patch after this
> series? Thanks.

Up to you; I'm happy to integrate fixup patches into the current series
or add on new ones.
  
Yin Fengwei March 21, 2023, 1:58 a.m. UTC | #16
On 3/20/2023 10:08 PM, Matthew Wilcox wrote:
> On Mon, Mar 20, 2023 at 09:38:55PM +0800, Yin, Fengwei wrote:
>> Thanks a lot to Ryan for helping to test the debug patch I made.
>>
>> Ryan confirmed that the following change could fix the kernel build regression:
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index db86e459dde6..343d6ff36b2c 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3557,7 +3557,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>>
>>                 ret |= filemap_map_folio_range(vmf, folio,
>>                                 xas.xa_index - folio->index, addr, nr_pages);
>> -               xas.xa_index += nr_pages;
>> +               xas.xa_index += folio_test_large(folio) ? nr_pages : 0;
>>
>>                 folio_unlock(folio);
>>                 folio_put(folio);
>>
>> I will make upstream-able change as "xas.xa_index += nr_pages - 1;"
> 
> Thanks to both of you!
> 
> Really, we shouldn't need to interfere with xas.xa_index at all.
> Does this work?
I will give this a try and let you know the result.

> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8e4f95c5b65a..e40c967dde5f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3420,10 +3420,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
>  	return false;
>  }
>  
> -static struct folio *next_uptodate_page(struct folio *folio,
> -				       struct address_space *mapping,
> -				       struct xa_state *xas, pgoff_t end_pgoff)
> +static struct folio *next_uptodate_folio(struct xa_state *xas,
> +		struct address_space *mapping, pgoff_t end_pgoff)
>  {
> +	struct folio *folio = xas_next_entry(xas, end_pgoff);
>  	unsigned long max_idx;
>  
>  	do {
> @@ -3461,22 +3461,6 @@ static struct folio *next_uptodate_page(struct folio *folio,
>  	return NULL;
>  }
>  
> -static inline struct folio *first_map_page(struct address_space *mapping,
> -					  struct xa_state *xas,
> -					  pgoff_t end_pgoff)
> -{
> -	return next_uptodate_page(xas_find(xas, end_pgoff),
> -				  mapping, xas, end_pgoff);
> -}
> -
> -static inline struct folio *next_map_page(struct address_space *mapping,
> -					 struct xa_state *xas,
> -					 pgoff_t end_pgoff)
> -{
> -	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
> -				  mapping, xas, end_pgoff);
> -}
> -
>  /*
>   * Map page range [start_page, start_page + nr_pages) of folio.
>   * start_page is gotten from start by folio_page(folio, start)
> @@ -3552,7 +3536,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>  	int nr_pages = 0;
>  
>  	rcu_read_lock();
> -	folio = first_map_page(mapping, &xas, end_pgoff);
> +	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
>  	if (!folio)
>  		goto out;
>  
> @@ -3574,11 +3558,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>  
>  		ret |= filemap_map_folio_range(vmf, folio,
>  				xas.xa_index - folio->index, addr, nr_pages);
> -		xas.xa_index += nr_pages;
>  
>  		folio_unlock(folio);
>  		folio_put(folio);
> -	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
> +		folio = next_uptodate_folio(&xas, mapping, end_pgoff);
> +	} while (folio);
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  out:
>  	rcu_read_unlock();
> 
>> Ryan and I also identify some other changes needed. I am not sure how to
>> integrate those changes to this series. Maybe an add-on patch after this
>> series? Thanks.
> 
> Up to you; I'm happy to integrate fixup patches into the current series
> or add on new ones.
Integrating them into the current series should be better, as it doesn't
impact bisection. I will share the changes Ryan and I have after verifying
the above change you proposed. Thanks.


Regards
Yin, Fengwei
  
Yin Fengwei March 21, 2023, 5:13 a.m. UTC | #17
On 3/20/23 22:08, Matthew Wilcox wrote:
> On Mon, Mar 20, 2023 at 09:38:55PM +0800, Yin, Fengwei wrote:
>> Thanks a lot to Ryan for helping to test the debug patch I made.
>>
>> Ryan confirmed that the following change could fix the kernel build regression:
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index db86e459dde6..343d6ff36b2c 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3557,7 +3557,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>>
>>                  ret |= filemap_map_folio_range(vmf, folio,
>>                                  xas.xa_index - folio->index, addr, nr_pages);
>> -               xas.xa_index += nr_pages;
>> +               xas.xa_index += folio_test_large(folio) ? nr_pages : 0;
>>
>>                  folio_unlock(folio);
>>                  folio_put(folio);
>>
>> I will make upstream-able change as "xas.xa_index += nr_pages - 1;"
> 
> Thanks to both of you!
> 
> Really, we shouldn't need to interfere with xas.xa_index at all.
> Does this work?
Yes. This works perfectly on my side. Thanks.

Regards
Yin, Fengwei

> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 8e4f95c5b65a..e40c967dde5f 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3420,10 +3420,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
>   	return false;
>   }
>   
> -static struct folio *next_uptodate_page(struct folio *folio,
> -				       struct address_space *mapping,
> -				       struct xa_state *xas, pgoff_t end_pgoff)
> +static struct folio *next_uptodate_folio(struct xa_state *xas,
> +		struct address_space *mapping, pgoff_t end_pgoff)
>   {
> +	struct folio *folio = xas_next_entry(xas, end_pgoff);
>   	unsigned long max_idx;
>   
>   	do {
> @@ -3461,22 +3461,6 @@ static struct folio *next_uptodate_page(struct folio *folio,
>   	return NULL;
>   }
>   
> -static inline struct folio *first_map_page(struct address_space *mapping,
> -					  struct xa_state *xas,
> -					  pgoff_t end_pgoff)
> -{
> -	return next_uptodate_page(xas_find(xas, end_pgoff),
> -				  mapping, xas, end_pgoff);
> -}
> -
> -static inline struct folio *next_map_page(struct address_space *mapping,
> -					 struct xa_state *xas,
> -					 pgoff_t end_pgoff)
> -{
> -	return next_uptodate_page(xas_next_entry(xas, end_pgoff),
> -				  mapping, xas, end_pgoff);
> -}
> -
>   /*
>    * Map page range [start_page, start_page + nr_pages) of folio.
>    * start_page is gotten from start by folio_page(folio, start)
> @@ -3552,7 +3536,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>   	int nr_pages = 0;
>   
>   	rcu_read_lock();
> -	folio = first_map_page(mapping, &xas, end_pgoff);
> +	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
>   	if (!folio)
>   		goto out;
>   
> @@ -3574,11 +3558,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
>   
>   		ret |= filemap_map_folio_range(vmf, folio,
>   				xas.xa_index - folio->index, addr, nr_pages);
> -		xas.xa_index += nr_pages;
>   
>   		folio_unlock(folio);
>   		folio_put(folio);
> -	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
> +		folio = next_uptodate_folio(&xas, mapping, end_pgoff);
> +	} while (folio);
>   	pte_unmap_unlock(vmf->pte, vmf->ptl);
>   out:
>   	rcu_read_unlock();
> 
>> Ryan and I also identify some other changes needed. I am not sure how to
>> integrate those changes to this series. Maybe an add-on patch after this
>> series? Thanks.
> 
> Up to you; I'm happy to integrate fixup patches into the current series
> or add on new ones.
  
Will Deacon March 24, 2023, 2:58 p.m. UTC | #18
On Fri, Mar 17, 2023 at 04:19:44PM +0800, Yin, Fengwei wrote:
> 
> 
> On 3/17/2023 4:00 PM, Ryan Roberts wrote:
> > On 17/03/2023 06:33, Yin, Fengwei wrote:
> >>
> >>
> >> On 3/17/2023 11:44 AM, Matthew Wilcox wrote:
> >>> On Fri, Mar 17, 2023 at 09:58:17AM +0800, Yin, Fengwei wrote:
> >>>>
> >>>>
> >>>> On 3/17/2023 1:52 AM, Matthew Wilcox wrote:
> >>>>> On Thu, Mar 16, 2023 at 04:38:58PM +0000, Ryan Roberts wrote:
> >>>>>> On 16/03/2023 16:23, Yin, Fengwei wrote:
> >>>>>>>> I think you are changing behavior here - is this intentional? Previously this
> >>>>>>>> would be evaluated per page, now its evaluated once for the whole range. The
> >>>>>>>> intention below is that directly faulted pages are mapped young and prefaulted
> >>>>>>>> pages are mapped old. But now a whole range will be mapped the same.
> >>>>>>>
> >>>>>>> Yes. You are right here.
> >>>>>>>
> >>>>>>> Look at the prefault and cpu_has_hw_af for ARM64, it looks like we
> >>>>>>> can avoid to handle vmf->address == addr specially. It's OK to 
> >>>>>>> drop prefault and change the logic here a little bit to:
> >>>>>>>   if (arch_wants_old_prefaulted_pte())
> >>>>>>>       entry = pte_mkold(entry);
> >>>>>>>   else
> >>>>>>>       entry = pte_sw_mkyong(entry);
> >>>>>>>
> >>>>>>> It's not necessary to use pte_sw_mkyong for vmf->address == addr
> >>>>>>> because HW will set the ACCESS bit in page table entry.
> >>>>>>>
> >>>>>>> Add Will Deacon in case I missed something here. Thanks.
> >>>>>>
> >>>>>> I'll defer to Will's response, but not all arm HW supports HW access flag
> >>>>>> management. In that case it's done by SW, so I would imagine that by setting
> >>>>>> this to old initially, we will get a second fault to set the access bit, which
> >>>>>> will slow things down. I wonder if you will need to split this into (up to) 3
> >>>>>> calls to set_ptes()?
> >>>>>
> >>>>> I don't think we should do that.  The limited information I have from
> >>>>> various microarchitectures is that the PTEs must differ only in their
> >>>>> PFN bits in order to use larger TLB entries.  That includes the Accessed
> >>>>> bit (or equivalent).  So we should mkyoung all the PTEs in the same
> >>>>> folio, at least initially.
> >>>>>
> >>>>> That said, we should still do this conditionally.  We'll prefault some
> >>>>> other folios too.  So I think this should be:
> >>>>>
> >>>>>         bool prefault = (addr > vmf->address) || ((addr + nr) < vmf->address);
> >>>>>
> >>>> According to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80, if hardware access
> >>>> flag is supported on ARM64, there is benefit if prefault PTEs is set as "old".
> >>>> If we change prefault like above, the PTEs is set as "yong" which loose benefit
> >>>> on ARM64 with hardware access flag.
> >>>>
> >>>> ITOH, if from "old" to "yong" is cheap, why not leave all PTEs of folio as "old"
> >>>> and let hardware to update it to "yong"?
> >>>
> >>> Because we're tracking the entire folio as a single entity.  So we're
> >>> better off avoiding the extra pagefaults to update the accessed bit,
> >>> which won't actually give us any information (vmscan needs to know "were
> >>> any of the accessed bits set", not "how many of them were set").
> >> There is no extra pagefaults to update the accessed bit. There are three cases here:
> >> 1. hardware support access flag and cheap from "old" to "yong" without extra fault
> >> 2. hardware support access flag and expensive from "old" to "yong" without extra fault
> >> 3. no hardware support access flag (extra pagefaults from "old" to "yong". Expensive)
> >>
> >> For #2 and #3, it's expensive from "old" to "yong", so we always set PTEs "yong" in
> >> page fault.
> >> For #1, It's cheap from "old" to "yong", so it's OK to set PTEs "old" in page fault.
> >> And hardware will set it to "yong" when access memory. Actually, ARM64 with hardware
> >> access bit requires to set PTEs "old".
> > 
> > Your logic makes sense, but it doesn't take into account the HPA
> > micro-architectural feature present in some ARM CPUs. HPA can transparently
> > coalesce multiple pages into a single TLB entry when certain conditions are met
> > (roughly; upto 4 pages physically and virtually contiguous and all within a
> > 4-page natural alignment). But as Matthew says, this works out better when all
> > pte attributes (including access and dirty) match. Given the reason for setting
> > the prefault pages to old is so that vmscan can do a better job of finding cold
> > pages, and given vmscan will now be looking for folios and not individual pages
> > (I assume?), I agree with Matthew that we should make whole folios young or old.
> > It will marginally increase our chances of the access and dirty bits being
> > consistent across the whole 4-page block that the HW tries to coalesce. If we
> > unconditionally make everything old, the hw will set accessed for the single
> > page that faulted, and we therefore don't have consistency for that 4-page block.
> My concern was that the benefit of "old" PTEs for ARM64 with hardware access bit
> will be lost. The workloads (application launch latency and direct reclaim according
> to commit 46bdb4277f98e70d0c91f4289897ade533fe9e80) can show regression with this
> series. Thanks.

Yes, please don't fault everything in as young as it has caused horrible
vmscan behaviour leading to app-startup slowdown in the past:

https://lore.kernel.org/all/20210111140149.GB7642@willie-the-truck/

If we have to use the same value for all the ptes, then just base them
all on arch_wants_old_prefaulted_pte() as iirc hardware AF was pretty
cheap in practice for us.
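
For reference, my recollection of the relevant definitions (roughly): arm64
wires this up to the hardware access-flag check, while the generic fallback
keeps prefaulted entries young, so basing the decision on
arch_wants_old_prefaulted_pte() leaves x86 behaviour unchanged:

/* arch/arm64/include/asm/pgtable.h */
#define arch_wants_old_prefaulted_pte   cpu_has_hw_af

/* mm/memory.c fallback for everyone else */
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
        /*
         * Transitioning a PTE from 'old' to 'young' can be expensive on
         * some architectures, even if it's performed in hardware. By
         * default, "false" means prefaulted entries will be 'young'.
         */
        return false;
}
#endif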

Cheers,

Will
  
Matthew Wilcox March 24, 2023, 3:11 p.m. UTC | #19
On Fri, Mar 24, 2023 at 02:58:29PM +0000, Will Deacon wrote:
> Yes, please don't fault everything in as young as it has caused horrible
> vmscan behaviour leading to app-startup slowdown in the past:
> 
> https://lore.kernel.org/all/20210111140149.GB7642@willie-the-truck/
> 
> If we have to use the same value for all the ptes, then just base them
> all on arch_wants_old_prefaulted_pte() as iirc hardware AF was pretty
> cheap in practice for us.

I think that's wrong, because this is a different scenario.

Before:

We faulted in N single-page folios.  Each page/folio is tracked
independently.  That's N entries on whatever LRU list it ends up on.
The prefaulted ones _should_ be marked old -- they haven't been
accessed; we've just decided to put them in the page tables to
speed up faultaround.  The unaccessed pages need to fall off the LRU
list as quickly as possible; keeping them around only hurts if the
workload has no locality of reference.

After:

We fault in N folios, some possibly consisting of multiple pages.
Each folio is tracked separately, but individual pages in the folio
are not tracked; they belong to their folio.  In this scenario, if
the other PTEs for pages in the same folio are marked as young or old
doesn't matter; the entire folio will be tracked as young, because we
referenced one of the pages in this folio.  Marking the other PTEs as
young actually helps because we don't take pagefaults on them (whether
we have a HW or SW accessed bit).

(can i just say that i dislike how we mix up our old/young accessed/not
terminology here?)

We should still mark the PTEs referencing unaccessed folios as old.
No argument there, and this patch does that.  But it's fine for all the
PTEs referencing the accessed folio to have the young bit, at least as
far as I can tell.
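
A minimal sketch of what that could mean for set_pte_range(), assuming the
young/old decision is taken once for the whole call (the byte scaling of "nr"
below is my reading of the "(addr + nr) < vmf->address" suggestion quoted
earlier, not necessarily the final form in the series):

        /*
         * The folio counts as accessed only if the faulting address falls
         * inside the range being mapped; anything else was merely
         * prefaulted around and can start out old where that is cheap.
         */
        bool prefault = vmf->address < addr ||
                        vmf->address >= addr + nr * PAGE_SIZE;

        if (prefault && arch_wants_old_prefaulted_pte())
                entry = pte_mkold(entry);       /* whole folio old */
        else
                entry = pte_sw_mkyoung(entry);  /* whole folio young */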
  
Will Deacon March 24, 2023, 5:23 p.m. UTC | #20
On Fri, Mar 24, 2023 at 03:11:00PM +0000, Matthew Wilcox wrote:
> On Fri, Mar 24, 2023 at 02:58:29PM +0000, Will Deacon wrote:
> > Yes, please don't fault everything in as young as it has caused horrible
> > vmscan behaviour leading to app-startup slowdown in the past:
> > 
> > https://lore.kernel.org/all/20210111140149.GB7642@willie-the-truck/
> > 
> > If we have to use the same value for all the ptes, then just base them
> > all on arch_wants_old_prefaulted_pte() as iirc hardware AF was pretty
> > cheap in practice for us.
> 
> I think that's wrong, because this is a different scenario.
> 
> Before:
> 
> We faulted in N single-page folios.  Each page/folio is tracked
> independently.  That's N entries on whatever LRU list it ends up on.
> The prefaulted ones _should_ be marked old -- they haven't been
> accessed; we've just decided to put them in the page tables to
> speed up faultaround.  The unaccessed pages need to fall off the LRU
> list as quickly as possible; keeping them around only hurts if the
> workload has no locality of reference.
> 
> After:
> 
> We fault in N folios, some possibly consisting of multiple pages.
> Each folio is tracked separately, but individual pages in the folio
> are not tracked; they belong to their folio.  In this scenario, if
> the other PTEs for pages in the same folio are marked as young or old
> doesn't matter; the entire folio will be tracked as young, because we
> referenced one of the pages in this folio.  Marking the other PTEs as
> young actually helps because we don't take pagefaults on them (whether
> we have a HW or SW accessed bit).
> 
> (can i just say that i dislike how we mix up our old/young accessed/not
> terminology here?)
> 
> We should still mark the PTEs referencing unaccessed folios as old.
> No argument there, and this patch does that.  But it's fine for all the
> PTEs referencing the accessed folio to have the young bit, at least as
> far as I can tell.

Ok, thanks for the explanation. So as long as
arch_wants_old_prefaulted_pte() is taken into account for the unaccessed
folios, then I think we should be good? Unconditionally marking those
PTEs as old probably hurts x86.

Will
  
Yin Fengwei March 27, 2023, 1:23 a.m. UTC | #21
On 3/25/23 01:23, Will Deacon wrote:
> On Fri, Mar 24, 2023 at 03:11:00PM +0000, Matthew Wilcox wrote:
>> On Fri, Mar 24, 2023 at 02:58:29PM +0000, Will Deacon wrote:
>>> Yes, please don't fault everything in as young as it has caused horrible
>>> vmscan behaviour leading to app-startup slowdown in the past:
>>>
>>> https://lore.kernel.org/all/20210111140149.GB7642@willie-the-truck/
>>>
>>> If we have to use the same value for all the ptes, then just base them
>>> all on arch_wants_old_prefaulted_pte() as iirc hardware AF was pretty
>>> cheap in practice for us.
>>
>> I think that's wrong, because this is a different scenario.
>>
>> Before:
>>
>> We faulted in N single-page folios.  Each page/folio is tracked
>> independently.  That's N entries on whatever LRU list it ends up on.
>> The prefaulted ones _should_ be marked old -- they haven't been
>> accessed; we've just decided to put them in the page tables to
>> speed up faultaround.  The unaccessed pages need to fall off the LRU
>> list as quickly as possible; keeping them around only hurts if the
>> workload has no locality of reference.
>>
>> After:
>>
>> We fault in N folios, some possibly consisting of multiple pages.
>> Each folio is tracked separately, but individual pages in the folio
>> are not tracked; they belong to their folio.  In this scenario, if
>> the other PTEs for pages in the same folio are marked as young or old
>> doesn't matter; the entire folio will be tracked as young, because we
>> referenced one of the pages in this folio.  Marking the other PTEs as
>> young actually helps because we don't take pagefaults on them (whether
>> we have a HW or SW accessed bit).
>>
>> (can i just say that i dislike how we mix up our old/young accessed/not
>> terminology here?)
>>
>> We should still mark the PTEs referencing unaccessed folios as old.
>> No argument there, and this patch does that.  But it's fine for all the
>> PTEs referencing the accessed folio to have the young bit, at least as
>> far as I can tell.
> 
> Ok, thanks for the explanation. So as long as
> arch_wants_old_prefaulted_pte() is taken into account for the unaccessed
> folios, then I think we should be good? Unconditionally marking those
> PTEs as old probably hurts x86.
Yes. We only mark the PTEs old on arch_wants_old_prefaulted_pte()
systems. Thanks.


Regards
Yin, Fengwei

> 
> Will
  

Patch

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 7de7a7272a5e..922886fefb7f 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -663,7 +663,7 @@  locked. The VM will unlock the page.
 Filesystem should find and map pages associated with offsets from "start_pgoff"
 till "end_pgoff". ->map_pages() is called with page table locked and must
 not block.  If it's not possible to reach a page without blocking,
-filesystem should skip it. Filesystem should use do_set_pte() to setup
+filesystem should skip it. Filesystem should use set_pte_range() to setup
 page table entry. Pointer to entry associated with the page is passed in
 "pte" field in vm_fault structure. Pointers to entries for other offsets
 should be calculated relative to "pte".
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ee755bb4e1c1..81788c985a8c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1299,7 +1299,8 @@  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 }
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		struct page *page, unsigned int nr, unsigned long addr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
diff --git a/mm/filemap.c b/mm/filemap.c
index 6e2b0778db45..e2317623dcbf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3504,8 +3504,7 @@  static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			ret = VM_FAULT_NOPAGE;
 
 		ref_count++;
-		do_set_pte(vmf, page, addr);
-		update_mmu_cache(vma, addr, vmf->pte);
+		set_pte_range(vmf, folio, page, 1, addr);
 	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
 
 	/* Restore the vmf->pte */
diff --git a/mm/memory.c b/mm/memory.c
index 6aa21e8f3753..9a654802f104 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4274,7 +4274,8 @@  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+void set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		struct page *page, unsigned int nr, unsigned long addr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
@@ -4282,7 +4283,7 @@  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 	bool prefault = vmf->address != addr;
 	pte_t entry;
 
-	flush_icache_page(vma, page);
+	flush_icache_pages(vma, page, nr);
 	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (prefault && arch_wants_old_prefaulted_pte())
@@ -4296,14 +4297,18 @@  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = pte_mkuffd_wp(entry);
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-		page_add_new_anon_rmap(page, vma, addr);
-		lru_cache_add_inactive_or_unevictable(page, vma);
+		add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr);
+		VM_BUG_ON_FOLIO(nr != 1, folio);
+		folio_add_new_anon_rmap(folio, vma, addr);
+		folio_add_lru_vma(folio, vma);
 	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+		folio_add_file_rmap_range(folio, page, nr, vma, false);
 	}
-	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
+
+	/* no need to invalidate: a not-present page won't be cached */
+	update_mmu_cache_range(vma, addr, vmf->pte, nr);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4376,11 +4381,9 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 
 	/* Re-check under ptl */
 	if (likely(!vmf_pte_changed(vmf))) {
-		do_set_pte(vmf, page, vmf->address);
-
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
+		struct folio *folio = page_folio(page);
 
+		set_pte_range(vmf, folio, page, 1, vmf->address);
 		ret = 0;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);