[V2] arm64: properly define SOFT_DIRTY functionality

Message ID 20230704133633.1918147-1-npache@redhat.com
State New
Headers
Series [V2] arm64: properly define SOFT_DIRTY functionality |

Commit Message

Nico Pache July 4, 2023, 1:36 p.m. UTC
  ARM64 has a soft-dirty bit (software dirty) but never properly defines
CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
introduces the ability to set/clear the soft dirty bit in a similar
manner as the other arches that utilize it.

However, we must be careful... there are cases where the DBM bit is not
available and the software dirty bit plays a essential role in determining
whether or not a page is dirty. In these cases we must not allow the
user to clear the software dirty bit. We can check for these cases by
utilizing the arch_has_hw_pte_young() function which tests the availability
of DBM.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Liu Shixin <liushixin2@huawei.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 arch/arm64/Kconfig               |   1 +
 arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
 2 files changed, 90 insertions(+), 15 deletions(-)
  

Comments

Mark Rutland July 4, 2023, 2:08 p.m. UTC | #1
On Tue, Jul 04, 2023 at 09:36:33AM -0400, Nico Pache wrote:
> ARM64 has a soft-dirty bit (software dirty) but never properly defines
> CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
> introduces the ability to set/clear the soft dirty bit in a similar
> manner as the other arches that utilize it.

Anshuman already explained that this is not correct -- to enable
CONFIG_ARCH_HAS_SOFT_DIRTY, you need *another* PTE bit. Please don't send
another version following this approach.

Despite its name, pte_sw_dirty() has nothing to do with
CONFIG_ARCH_HAS_SOFT_DIRTY. We have pte_hw_dirty() and pte_sw_dirty() because
with Hardware Dirty bit management the HW dirty bit is *also* the write
permission bit, and to have a dirty non-writeable PTE state we have to use a SW
bit, which is what pte_sw_dirty() handles. Both pte_hw_dirty() and
pte_sw_dirty() comprise the regular dirty state.

That's *very* different from CONFIG_ARCH_HAS_SOFT_DIRTY, which is about having
a *separate* software dirty state that can be used for longer-term dirty
tracking (whether the page was last touched since some management SW
manipulated the page).

> However, we must be careful... there are cases where the DBM bit is not
> available and the software dirty bit plays a essential role in determining
> whether or not a page is dirty. In these cases we must not allow the
> user to clear the software dirty bit. We can check for these cases by
> utilizing the arch_has_hw_pte_young() function which tests the availability
> of DBM.

Regardless of the above, this doesn't seem to have been thought through. why
would it be ok for this to work or not work dependent on DBM?

Thanks,
Mark.

> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> Cc: Liu Shixin <liushixin2@huawei.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Yu Zhao <yuzhao@google.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  arch/arm64/Kconfig               |   1 +
>  arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
>  2 files changed, 90 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7856c3a3e35a..6ea73b8148c5 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -173,6 +173,7 @@ config ARM64
>  	select HAVE_ARCH_PREL32_RELOCATIONS
>  	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
>  	select HAVE_ARCH_SECCOMP_FILTER
> +	select HAVE_ARCH_SOFT_DIRTY
>  	select HAVE_ARCH_STACKLEAK
>  	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
>  	select HAVE_ARCH_TRACEHOOK
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 0bd18de9fd97..c4970c9ed114 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -51,6 +51,20 @@ static inline bool arch_thp_swp_supported(void)
>  }
>  #define arch_thp_swp_supported arch_thp_swp_supported
>  
> +/*
> + * On arm64 without hardware Access Flag, copying from user will fail because
> + * the pte is old and cannot be marked young. So we always end up with zeroed
> + * page after fork() + CoW for pfn mappings. We don't always have a
> + * hardware-managed access flag on arm64.
> + */
> +#define arch_has_hw_pte_young		cpu_has_hw_af
> +
> +/*
> + * Experimentally, it's cheap to set the access flag in hardware and we
> + * benefit from prefaulting mappings as 'old' to start with.
> + */
> +#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
> +
>  /*
>   * Outside of a few very special situations (e.g. hibernation), we always
>   * use broadcast TLB invalidation instructions, therefore a spurious page
> @@ -121,8 +135,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
>  })
>  
>  #define pte_hw_dirty(pte)	(pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> -#define pte_sw_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))
> -#define pte_dirty(pte)		(pte_sw_dirty(pte) || pte_hw_dirty(pte))
> +#define pte_soft_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))
> +#define pte_dirty(pte)		(pte_soft_dirty(pte) || pte_hw_dirty(pte))
> +#define pte_swp_soft_dirty(pte)	pte_soft_dirty(pte)
>  
>  #define pte_valid(pte)		(!!(pte_val(pte) & PTE_VALID))
>  /*
> @@ -189,7 +204,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
>  
>  static inline pte_t pte_mkclean(pte_t pte)
>  {
> -	pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> +	if (!arch_has_hw_pte_young())
> +		pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
>  	pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
>  
>  	return pte;
> @@ -1077,25 +1093,83 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
>  #define phys_to_ttbr(addr)	(addr)
>  #endif
>  
> -/*
> - * On arm64 without hardware Access Flag, copying from user will fail because
> - * the pte is old and cannot be marked young. So we always end up with zeroed
> - * page after fork() + CoW for pfn mappings. We don't always have a
> - * hardware-managed access flag on arm64.
> - */
> -#define arch_has_hw_pte_young		cpu_has_hw_af
> +static inline bool pud_sect_supported(void)
> +{
> +	return PAGE_SIZE == SZ_4K;
> +}
>  
> +#ifdef CONFIG_ARM64_HW_AFDBM
>  /*
> - * Experimentally, it's cheap to set the access flag in hardware and we
> - * benefit from prefaulting mappings as 'old' to start with.
> + * if we have the DBM bit we can utilize the software dirty bit as
> + * a mechanism to introduce the soft_dirty functionality; however, without
> + * it this bit is crucial to determining if a entry is dirty and we cannot
> + * clear it via software. DBM can also be disabled or broken on some early
> + * armv8 devices, so check its availability before modifying it.
>   */
> -#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
> +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> +{
> +	if (!arch_has_hw_pte_young())
> +		return pte;
>  
> -static inline bool pud_sect_supported(void)
> +	return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> +}
> +
> +static inline pte_t pte_mksoft_dirty(pte_t pte)
>  {
> -	return PAGE_SIZE == SZ_4K;
> +	if (!arch_has_hw_pte_young())
> +		return pte;
> +
> +	return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> +}
> +
> +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> +{
> +	if (!arch_has_hw_pte_young())
> +		return pte;
> +
> +	return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> +}
> +
> +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> +{
> +	if (!arch_has_hw_pte_young())
> +		return pte;
> +
> +	return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> +}
> +
> +static inline int pmd_soft_dirty(pmd_t pmd)
> +{
> +	return pte_soft_dirty(pmd_pte(pmd));
> +}
> +
> +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> +}
> +
> +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> +{
> +	return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
>  }
>  
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> +{
> +	return pmd_soft_dirty(pmd);
> +}
> +
> +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> +{
> +	return pmd_clear_soft_dirty(pmd);
> +}
> +
> +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> +{
> +	return pmd_mksoft_dirty(pmd);
> +}
> +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> +#endif /* CONFIG_ARM64_HW_AFDBM */
>  
>  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
>  #define ptep_modify_prot_start ptep_modify_prot_start
> -- 
> 2.41.0
>
  
Nico Pache July 4, 2023, 2:49 p.m. UTC | #2
Hi Mark,

On Tue, Jul 4, 2023 at 10:19 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Jul 04, 2023 at 09:36:33AM -0400, Nico Pache wrote:
> > ARM64 has a soft-dirty bit (software dirty) but never properly defines
> > CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
> > introduces the ability to set/clear the soft dirty bit in a similar
> > manner as the other arches that utilize it.
>
> Anshuman already explained that this is not correct -- to enable
> CONFIG_ARCH_HAS_SOFT_DIRTY, you need *another* PTE bit. Please don't send
> another version following this approach.
>
> Despite its name, pte_sw_dirty() has nothing to do with
> CONFIG_ARCH_HAS_SOFT_DIRTY. We have pte_hw_dirty() and pte_sw_dirty() because
> with Hardware Dirty bit management the HW dirty bit is *also* the write
> permission bit, and to have a dirty non-writeable PTE state we have to use a SW
> bit, which is what pte_sw_dirty() handles. Both pte_hw_dirty() and
> pte_sw_dirty() comprise the regular dirty state.
>
> That's *very* different from CONFIG_ARCH_HAS_SOFT_DIRTY, which is about having
> a *separate* software dirty state that can be used for longer-term dirty
> tracking (whether the page was last touched since some management SW
> manipulated the page).
>
> > However, we must be careful... there are cases where the DBM bit is not
> > available and the software dirty bit plays a essential role in determining
> > whether or not a page is dirty. In these cases we must not allow the
> > user to clear the software dirty bit. We can check for these cases by
> > utilizing the arch_has_hw_pte_young() function which tests the availability
> > of DBM.
>
> Regardless of the above, this doesn't seem to have been thought through. why
> would it be ok for this to work or not work dependent on DBM?
It was from my understanding of both reading the code, and the
following chart that the PTE_DIRTY bit was only used in the absence of
the DBM bit to determine the dirty state of a page.
/*
 * PTE bits configuration in the presence of hardware Dirty Bit Management
 * (PTE_WRITE == PTE_DBM):
 *
 * Dirty  Writable | PTE_RDONLY  PTE_WRITE  PTE_DIRTY (sw)
 *   0      0      |   1           0          0
 *   0      1      |   1           1          0
 *   1      0      |   1           0          1
 *   1      1      |   0           1          x
 *
 * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via
 * the page fault mechanism. Checking the dirty status of a pte becomes:
 *
 *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
 */

So from my understanding it seems that when DBM is present, it acts as
the PTE_WRITE bit, and the AF bit is the HW dirty bit. This gives me
the impression that the PTE_DIRTY bit is redundant; however, When DBM
is not present PTE_DIRTY becomes crucial in determining the dirty
state.

-- Nico
>
> Thanks,
> Mark.
>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: David Hildenbrand <david@redhat.com>
> > Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> > Cc: Liu Shixin <liushixin2@huawei.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Yu Zhao <yuzhao@google.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  arch/arm64/Kconfig               |   1 +
> >  arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
> >  2 files changed, 90 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 7856c3a3e35a..6ea73b8148c5 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -173,6 +173,7 @@ config ARM64
> >       select HAVE_ARCH_PREL32_RELOCATIONS
> >       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> >       select HAVE_ARCH_SECCOMP_FILTER
> > +     select HAVE_ARCH_SOFT_DIRTY
> >       select HAVE_ARCH_STACKLEAK
> >       select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> >       select HAVE_ARCH_TRACEHOOK
> > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > index 0bd18de9fd97..c4970c9ed114 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -51,6 +51,20 @@ static inline bool arch_thp_swp_supported(void)
> >  }
> >  #define arch_thp_swp_supported arch_thp_swp_supported
> >
> > +/*
> > + * On arm64 without hardware Access Flag, copying from user will fail because
> > + * the pte is old and cannot be marked young. So we always end up with zeroed
> > + * page after fork() + CoW for pfn mappings. We don't always have a
> > + * hardware-managed access flag on arm64.
> > + */
> > +#define arch_has_hw_pte_young                cpu_has_hw_af
> > +
> > +/*
> > + * Experimentally, it's cheap to set the access flag in hardware and we
> > + * benefit from prefaulting mappings as 'old' to start with.
> > + */
> > +#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > +
> >  /*
> >   * Outside of a few very special situations (e.g. hibernation), we always
> >   * use broadcast TLB invalidation instructions, therefore a spurious page
> > @@ -121,8 +135,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> >  })
> >
> >  #define pte_hw_dirty(pte)    (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> > -#define pte_sw_dirty(pte)    (!!(pte_val(pte) & PTE_DIRTY))
> > -#define pte_dirty(pte)               (pte_sw_dirty(pte) || pte_hw_dirty(pte))
> > +#define pte_soft_dirty(pte)  (!!(pte_val(pte) & PTE_DIRTY))
> > +#define pte_dirty(pte)               (pte_soft_dirty(pte) || pte_hw_dirty(pte))
> > +#define pte_swp_soft_dirty(pte)      pte_soft_dirty(pte)
> >
> >  #define pte_valid(pte)               (!!(pte_val(pte) & PTE_VALID))
> >  /*
> > @@ -189,7 +204,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
> >
> >  static inline pte_t pte_mkclean(pte_t pte)
> >  {
> > -     pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > +     if (!arch_has_hw_pte_young())
> > +             pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> >       pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> >
> >       return pte;
> > @@ -1077,25 +1093,83 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
> >  #define phys_to_ttbr(addr)   (addr)
> >  #endif
> >
> > -/*
> > - * On arm64 without hardware Access Flag, copying from user will fail because
> > - * the pte is old and cannot be marked young. So we always end up with zeroed
> > - * page after fork() + CoW for pfn mappings. We don't always have a
> > - * hardware-managed access flag on arm64.
> > - */
> > -#define arch_has_hw_pte_young                cpu_has_hw_af
> > +static inline bool pud_sect_supported(void)
> > +{
> > +     return PAGE_SIZE == SZ_4K;
> > +}
> >
> > +#ifdef CONFIG_ARM64_HW_AFDBM
> >  /*
> > - * Experimentally, it's cheap to set the access flag in hardware and we
> > - * benefit from prefaulting mappings as 'old' to start with.
> > + * if we have the DBM bit we can utilize the software dirty bit as
> > + * a mechanism to introduce the soft_dirty functionality; however, without
> > + * it this bit is crucial to determining if a entry is dirty and we cannot
> > + * clear it via software. DBM can also be disabled or broken on some early
> > + * armv8 devices, so check its availability before modifying it.
> >   */
> > -#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > +{
> > +     if (!arch_has_hw_pte_young())
> > +             return pte;
> >
> > -static inline bool pud_sect_supported(void)
> > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > +}
> > +
> > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> >  {
> > -     return PAGE_SIZE == SZ_4K;
> > +     if (!arch_has_hw_pte_young())
> > +             return pte;
> > +
> > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > +}
> > +
> > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> > +{
> > +     if (!arch_has_hw_pte_young())
> > +             return pte;
> > +
> > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > +}
> > +
> > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> > +{
> > +     if (!arch_has_hw_pte_young())
> > +             return pte;
> > +
> > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > +}
> > +
> > +static inline int pmd_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_soft_dirty(pmd_pte(pmd));
> > +}
> > +
> > +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> > +}
> > +
> > +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> > +{
> > +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> >  }
> >
> > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> > +{
> > +     return pmd_soft_dirty(pmd);
> > +}
> > +
> > +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> > +{
> > +     return pmd_clear_soft_dirty(pmd);
> > +}
> > +
> > +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> > +{
> > +     return pmd_mksoft_dirty(pmd);
> > +}
> > +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> > +#endif /* CONFIG_ARM64_HW_AFDBM */
> >
> >  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
> >  #define ptep_modify_prot_start ptep_modify_prot_start
> > --
> > 2.41.0
> >
>
  
Mark Rutland July 4, 2023, 3:18 p.m. UTC | #3
On Tue, Jul 04, 2023 at 10:49:06AM -0400, Nico Pache wrote:
> Hi Mark,
> 
> On Tue, Jul 4, 2023 at 10:19 AM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > On Tue, Jul 04, 2023 at 09:36:33AM -0400, Nico Pache wrote:
> > > ARM64 has a soft-dirty bit (software dirty) but never properly defines
> > > CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
> > > introduces the ability to set/clear the soft dirty bit in a similar
> > > manner as the other arches that utilize it.
> >
> > Anshuman already explained that this is not correct -- to enable
> > CONFIG_ARCH_HAS_SOFT_DIRTY, you need *another* PTE bit. Please don't send
> > another version following this approach.
> >
> > Despite its name, pte_sw_dirty() has nothing to do with
> > CONFIG_ARCH_HAS_SOFT_DIRTY. We have pte_hw_dirty() and pte_sw_dirty() because
> > with Hardware Dirty bit management the HW dirty bit is *also* the write
> > permission bit, and to have a dirty non-writeable PTE state we have to use a SW
> > bit, which is what pte_sw_dirty() handles. Both pte_hw_dirty() and
> > pte_sw_dirty() comprise the regular dirty state.
> >
> > That's *very* different from CONFIG_ARCH_HAS_SOFT_DIRTY, which is about having
> > a *separate* software dirty state that can be used for longer-term dirty
> > tracking (whether the page was last touched since some management SW
> > manipulated the page).
> >
> > > However, we must be careful... there are cases where the DBM bit is not
> > > available and the software dirty bit plays a essential role in determining
> > > whether or not a page is dirty. In these cases we must not allow the
> > > user to clear the software dirty bit. We can check for these cases by
> > > utilizing the arch_has_hw_pte_young() function which tests the availability
> > > of DBM.
> >
> > Regardless of the above, this doesn't seem to have been thought through. why
> > would it be ok for this to work or not work dependent on DBM?
> It was from my understanding of both reading the code, and the
> following chart that the PTE_DIRTY bit was only used in the absence of
> the DBM bit to determine the dirty state of a page.

The PTE_DIRTY bit is used regardless of DBM, for example, in the case I
mentioned of a dirty non-writeable page. Without PTE_DIRTY we'd have no way to
represent a write-protected dirty page.

See pte_wrprotect(), which copies moves HW dirty bit into the PTE_DIRTY bit
when removing write permission:

| static inline pte_t pte_wrprotect(pte_t pte) 
| {
|         /*   
|          * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
|          * clear), set the PTE_DIRTY bit.
|          */
|         if (pte_hw_dirty(pte))
|                 pte = pte_mkdirty(pte);
| 
|         pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
|         pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
|         return pte; 
| }

... where pte_mkdirty() is:

| static inline pte_t pte_mkdirty(pte_t pte)
| {
|         pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
| 
|         if (pte_write(pte))
|                 pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
| 
|         return pte;
| }

> /*
>  * PTE bits configuration in the presence of hardware Dirty Bit Management
>  * (PTE_WRITE == PTE_DBM):
>  *
>  * Dirty  Writable | PTE_RDONLY  PTE_WRITE  PTE_DIRTY (sw)
>  *   0      0      |   1           0          0
>  *   0      1      |   1           1          0
>  *   1      0      |   1           0          1
>  *   1      1      |   0           1          x
>  *
>  * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via
>  * the page fault mechanism. Checking the dirty status of a pte becomes:
>  *
>  *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
>  */
> 
> So from my understanding it seems that when DBM is present, it acts as
> the PTE_WRITE bit, and the AF bit is the HW dirty bit. This gives me
> the impression that the PTE_DIRTY bit is redundant; however, When DBM
> is not present PTE_DIRTY becomes crucial in determining the dirty
> state.

As above, PTE_DIRTY is not redundant; regardless of DBM we need the PTE_DIRTY
bit for the regular dirty state. It distinguishes the first and third rows in
that table.

Thanks,
Mark.

> 
> -- Nico
> >
> > Thanks,
> > Mark.
> >
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: David Hildenbrand <david@redhat.com>
> > > Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> > > Cc: Liu Shixin <liushixin2@huawei.com>
> > > Cc: Will Deacon <will@kernel.org>
> > > Cc: Yu Zhao <yuzhao@google.com>
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > >  arch/arm64/Kconfig               |   1 +
> > >  arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
> > >  2 files changed, 90 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index 7856c3a3e35a..6ea73b8148c5 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -173,6 +173,7 @@ config ARM64
> > >       select HAVE_ARCH_PREL32_RELOCATIONS
> > >       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> > >       select HAVE_ARCH_SECCOMP_FILTER
> > > +     select HAVE_ARCH_SOFT_DIRTY
> > >       select HAVE_ARCH_STACKLEAK
> > >       select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> > >       select HAVE_ARCH_TRACEHOOK
> > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > index 0bd18de9fd97..c4970c9ed114 100644
> > > --- a/arch/arm64/include/asm/pgtable.h
> > > +++ b/arch/arm64/include/asm/pgtable.h
> > > @@ -51,6 +51,20 @@ static inline bool arch_thp_swp_supported(void)
> > >  }
> > >  #define arch_thp_swp_supported arch_thp_swp_supported
> > >
> > > +/*
> > > + * On arm64 without hardware Access Flag, copying from user will fail because
> > > + * the pte is old and cannot be marked young. So we always end up with zeroed
> > > + * page after fork() + CoW for pfn mappings. We don't always have a
> > > + * hardware-managed access flag on arm64.
> > > + */
> > > +#define arch_has_hw_pte_young                cpu_has_hw_af
> > > +
> > > +/*
> > > + * Experimentally, it's cheap to set the access flag in hardware and we
> > > + * benefit from prefaulting mappings as 'old' to start with.
> > > + */
> > > +#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > +
> > >  /*
> > >   * Outside of a few very special situations (e.g. hibernation), we always
> > >   * use broadcast TLB invalidation instructions, therefore a spurious page
> > > @@ -121,8 +135,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> > >  })
> > >
> > >  #define pte_hw_dirty(pte)    (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> > > -#define pte_sw_dirty(pte)    (!!(pte_val(pte) & PTE_DIRTY))
> > > -#define pte_dirty(pte)               (pte_sw_dirty(pte) || pte_hw_dirty(pte))
> > > +#define pte_soft_dirty(pte)  (!!(pte_val(pte) & PTE_DIRTY))
> > > +#define pte_dirty(pte)               (pte_soft_dirty(pte) || pte_hw_dirty(pte))
> > > +#define pte_swp_soft_dirty(pte)      pte_soft_dirty(pte)
> > >
> > >  #define pte_valid(pte)               (!!(pte_val(pte) & PTE_VALID))
> > >  /*
> > > @@ -189,7 +204,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
> > >
> > >  static inline pte_t pte_mkclean(pte_t pte)
> > >  {
> > > -     pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > +     if (!arch_has_hw_pte_young())
> > > +             pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > >       pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> > >
> > >       return pte;
> > > @@ -1077,25 +1093,83 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
> > >  #define phys_to_ttbr(addr)   (addr)
> > >  #endif
> > >
> > > -/*
> > > - * On arm64 without hardware Access Flag, copying from user will fail because
> > > - * the pte is old and cannot be marked young. So we always end up with zeroed
> > > - * page after fork() + CoW for pfn mappings. We don't always have a
> > > - * hardware-managed access flag on arm64.
> > > - */
> > > -#define arch_has_hw_pte_young                cpu_has_hw_af
> > > +static inline bool pud_sect_supported(void)
> > > +{
> > > +     return PAGE_SIZE == SZ_4K;
> > > +}
> > >
> > > +#ifdef CONFIG_ARM64_HW_AFDBM
> > >  /*
> > > - * Experimentally, it's cheap to set the access flag in hardware and we
> > > - * benefit from prefaulting mappings as 'old' to start with.
> > > + * if we have the DBM bit we can utilize the software dirty bit as
> > > + * a mechanism to introduce the soft_dirty functionality; however, without
> > > + * it this bit is crucial to determining if a entry is dirty and we cannot
> > > + * clear it via software. DBM can also be disabled or broken on some early
> > > + * armv8 devices, so check its availability before modifying it.
> > >   */
> > > -#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > > +{
> > > +     if (!arch_has_hw_pte_young())
> > > +             return pte;
> > >
> > > -static inline bool pud_sect_supported(void)
> > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > +}
> > > +
> > > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> > >  {
> > > -     return PAGE_SIZE == SZ_4K;
> > > +     if (!arch_has_hw_pte_young())
> > > +             return pte;
> > > +
> > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > +}
> > > +
> > > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> > > +{
> > > +     if (!arch_has_hw_pte_young())
> > > +             return pte;
> > > +
> > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > +}
> > > +
> > > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> > > +{
> > > +     if (!arch_has_hw_pte_young())
> > > +             return pte;
> > > +
> > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > +}
> > > +
> > > +static inline int pmd_soft_dirty(pmd_t pmd)
> > > +{
> > > +     return pte_soft_dirty(pmd_pte(pmd));
> > > +}
> > > +
> > > +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> > > +{
> > > +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> > > +}
> > > +
> > > +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> > > +{
> > > +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> > >  }
> > >
> > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > > +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> > > +{
> > > +     return pmd_soft_dirty(pmd);
> > > +}
> > > +
> > > +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> > > +{
> > > +     return pmd_clear_soft_dirty(pmd);
> > > +}
> > > +
> > > +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> > > +{
> > > +     return pmd_mksoft_dirty(pmd);
> > > +}
> > > +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> > > +#endif /* CONFIG_ARM64_HW_AFDBM */
> > >
> > >  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
> > >  #define ptep_modify_prot_start ptep_modify_prot_start
> > > --
> > > 2.41.0
> > >
> >
>
  
Nico Pache July 4, 2023, 4:15 p.m. UTC | #4
Thanks Mark, that explanation is very helpful and makes sense.

Sorry I dont normally work this close to hardware, let alone ARM
hardware, so my understanding of all this is still growing. I mistook
Anshuman's point as me missing a corner case, not that it was
downright wrong.

One last thing, could the AF bit be used instead of the PTE_DIRTY to
determine if a page is DIRTY & !WRITE?
ie) pte_dirty(pte) = pte_hw_dirty(pte) || (pte_young(pte) && !pte_write(pte)

or would this create cases of pages being considered dirty when they
have only been read?

Cheers,
-- Nico

On Tue, Jul 4, 2023 at 11:18 AM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Jul 04, 2023 at 10:49:06AM -0400, Nico Pache wrote:
> > Hi Mark,
> >
> > On Tue, Jul 4, 2023 at 10:19 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > >
> > > On Tue, Jul 04, 2023 at 09:36:33AM -0400, Nico Pache wrote:
> > > > ARM64 has a soft-dirty bit (software dirty) but never properly defines
> > > > CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
> > > > introduces the ability to set/clear the soft dirty bit in a similar
> > > > manner as the other arches that utilize it.
> > >
> > > Anshuman already explained that this is not correct -- to enable
> > > CONFIG_ARCH_HAS_SOFT_DIRTY, you need *another* PTE bit. Please don't send
> > > another version following this approach.
> > >
> > > Despite its name, pte_sw_dirty() has nothing to do with
> > > CONFIG_ARCH_HAS_SOFT_DIRTY. We have pte_hw_dirty() and pte_sw_dirty() because
> > > with Hardware Dirty bit management the HW dirty bit is *also* the write
> > > permission bit, and to have a dirty non-writeable PTE state we have to use a SW
> > > bit, which is what pte_sw_dirty() handles. Both pte_hw_dirty() and
> > > pte_sw_dirty() comprise the regular dirty state.
> > >
> > > That's *very* different from CONFIG_ARCH_HAS_SOFT_DIRTY, which is about having
> > > a *separate* software dirty state that can be used for longer-term dirty
> > > tracking (whether the page was last touched since some management SW
> > > manipulated the page).
> > >
> > > > However, we must be careful... there are cases where the DBM bit is not
> > > > available and the software dirty bit plays a essential role in determining
> > > > whether or not a page is dirty. In these cases we must not allow the
> > > > user to clear the software dirty bit. We can check for these cases by
> > > > utilizing the arch_has_hw_pte_young() function which tests the availability
> > > > of DBM.
> > >
> > > Regardless of the above, this doesn't seem to have been thought through. why
> > > would it be ok for this to work or not work dependent on DBM?
> > It was from my understanding of both reading the code, and the
> > following chart that the PTE_DIRTY bit was only used in the absence of
> > the DBM bit to determine the dirty state of a page.
>
> The PTE_DIRTY bit is used regardless of DBM, for example, in the case I
> mentioned of a dirty non-writeable page. Without PTE_DIRTY we'd have no way to
> represent a write-protected dirty page.
>
> See pte_wrprotect(), which copies moves HW dirty bit into the PTE_DIRTY bit
> when removing write permission:
>
> | static inline pte_t pte_wrprotect(pte_t pte)
> | {
> |         /*
> |          * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
> |          * clear), set the PTE_DIRTY bit.
> |          */
> |         if (pte_hw_dirty(pte))
> |                 pte = pte_mkdirty(pte);
> |
> |         pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
> |         pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> |         return pte;
> | }
>
> ... where pte_mkdirty() is:
>
> | static inline pte_t pte_mkdirty(pte_t pte)
> | {
> |         pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
> |
> |         if (pte_write(pte))
> |                 pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
> |
> |         return pte;
> | }
>
> > /*
> >  * PTE bits configuration in the presence of hardware Dirty Bit Management
> >  * (PTE_WRITE == PTE_DBM):
> >  *
> >  * Dirty  Writable | PTE_RDONLY  PTE_WRITE  PTE_DIRTY (sw)
> >  *   0      0      |   1           0          0
> >  *   0      1      |   1           1          0
> >  *   1      0      |   1           0          1
> >  *   1      1      |   0           1          x
> >  *
> >  * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via
> >  * the page fault mechanism. Checking the dirty status of a pte becomes:
> >  *
> >  *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
> >  */
> >
> > So from my understanding it seems that when DBM is present, it acts as
> > the PTE_WRITE bit, and the AF bit is the HW dirty bit. This gives me
> > the impression that the PTE_DIRTY bit is redundant; however, When DBM
> > is not present PTE_DIRTY becomes crucial in determining the dirty
> > state.
>
> As above, PTE_DIRTY is not redundant; regardless of DBM we need the PTE_DIRTY
> bit for the regular dirty state. It distinguishes the first and third rows in
> that table.
>
> Thanks,
> Mark.
>
> >
> > -- Nico
> > >
> > > Thanks,
> > > Mark.
> > >
> > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > Cc: David Hildenbrand <david@redhat.com>
> > > > Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> > > > Cc: Liu Shixin <liushixin2@huawei.com>
> > > > Cc: Will Deacon <will@kernel.org>
> > > > Cc: Yu Zhao <yuzhao@google.com>
> > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > > ---
> > > >  arch/arm64/Kconfig               |   1 +
> > > >  arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
> > > >  2 files changed, 90 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > index 7856c3a3e35a..6ea73b8148c5 100644
> > > > --- a/arch/arm64/Kconfig
> > > > +++ b/arch/arm64/Kconfig
> > > > @@ -173,6 +173,7 @@ config ARM64
> > > >       select HAVE_ARCH_PREL32_RELOCATIONS
> > > >       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> > > >       select HAVE_ARCH_SECCOMP_FILTER
> > > > +     select HAVE_ARCH_SOFT_DIRTY
> > > >       select HAVE_ARCH_STACKLEAK
> > > >       select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> > > >       select HAVE_ARCH_TRACEHOOK
> > > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > > index 0bd18de9fd97..c4970c9ed114 100644
> > > > --- a/arch/arm64/include/asm/pgtable.h
> > > > +++ b/arch/arm64/include/asm/pgtable.h
> > > > @@ -51,6 +51,20 @@ static inline bool arch_thp_swp_supported(void)
> > > >  }
> > > >  #define arch_thp_swp_supported arch_thp_swp_supported
> > > >
> > > > +/*
> > > > + * On arm64 without hardware Access Flag, copying from user will fail because
> > > > + * the pte is old and cannot be marked young. So we always end up with zeroed
> > > > + * page after fork() + CoW for pfn mappings. We don't always have a
> > > > + * hardware-managed access flag on arm64.
> > > > + */
> > > > +#define arch_has_hw_pte_young                cpu_has_hw_af
> > > > +
> > > > +/*
> > > > + * Experimentally, it's cheap to set the access flag in hardware and we
> > > > + * benefit from prefaulting mappings as 'old' to start with.
> > > > + */
> > > > +#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > > +
> > > >  /*
> > > >   * Outside of a few very special situations (e.g. hibernation), we always
> > > >   * use broadcast TLB invalidation instructions, therefore a spurious page
> > > > @@ -121,8 +135,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> > > >  })
> > > >
> > > >  #define pte_hw_dirty(pte)    (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> > > > -#define pte_sw_dirty(pte)    (!!(pte_val(pte) & PTE_DIRTY))
> > > > -#define pte_dirty(pte)               (pte_sw_dirty(pte) || pte_hw_dirty(pte))
> > > > +#define pte_soft_dirty(pte)  (!!(pte_val(pte) & PTE_DIRTY))
> > > > +#define pte_dirty(pte)               (pte_soft_dirty(pte) || pte_hw_dirty(pte))
> > > > +#define pte_swp_soft_dirty(pte)      pte_soft_dirty(pte)
> > > >
> > > >  #define pte_valid(pte)               (!!(pte_val(pte) & PTE_VALID))
> > > >  /*
> > > > @@ -189,7 +204,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
> > > >
> > > >  static inline pte_t pte_mkclean(pte_t pte)
> > > >  {
> > > > -     pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > +     if (!arch_has_hw_pte_young())
> > > > +             pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > >       pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> > > >
> > > >       return pte;
> > > > @@ -1077,25 +1093,83 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
> > > >  #define phys_to_ttbr(addr)   (addr)
> > > >  #endif
> > > >
> > > > -/*
> > > > - * On arm64 without hardware Access Flag, copying from user will fail because
> > > > - * the pte is old and cannot be marked young. So we always end up with zeroed
> > > > - * page after fork() + CoW for pfn mappings. We don't always have a
> > > > - * hardware-managed access flag on arm64.
> > > > - */
> > > > -#define arch_has_hw_pte_young                cpu_has_hw_af
> > > > +static inline bool pud_sect_supported(void)
> > > > +{
> > > > +     return PAGE_SIZE == SZ_4K;
> > > > +}
> > > >
> > > > +#ifdef CONFIG_ARM64_HW_AFDBM
> > > >  /*
> > > > - * Experimentally, it's cheap to set the access flag in hardware and we
> > > > - * benefit from prefaulting mappings as 'old' to start with.
> > > > + * if we have the DBM bit we can utilize the software dirty bit as
> > > > + * a mechanism to introduce the soft_dirty functionality; however, without
> > > > + * it this bit is crucial to determining if a entry is dirty and we cannot
> > > > + * clear it via software. DBM can also be disabled or broken on some early
> > > > + * armv8 devices, so check its availability before modifying it.
> > > >   */
> > > > -#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > > > +{
> > > > +     if (!arch_has_hw_pte_young())
> > > > +             return pte;
> > > >
> > > > -static inline bool pud_sect_supported(void)
> > > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > +}
> > > > +
> > > > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> > > >  {
> > > > -     return PAGE_SIZE == SZ_4K;
> > > > +     if (!arch_has_hw_pte_young())
> > > > +             return pte;
> > > > +
> > > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > +}
> > > > +
> > > > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> > > > +{
> > > > +     if (!arch_has_hw_pte_young())
> > > > +             return pte;
> > > > +
> > > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > +}
> > > > +
> > > > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> > > > +{
> > > > +     if (!arch_has_hw_pte_young())
> > > > +             return pte;
> > > > +
> > > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > +}
> > > > +
> > > > +static inline int pmd_soft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pte_soft_dirty(pmd_pte(pmd));
> > > > +}
> > > > +
> > > > +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> > > > +}
> > > > +
> > > > +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> > > >  }
> > > >
> > > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > > > +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pmd_soft_dirty(pmd);
> > > > +}
> > > > +
> > > > +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pmd_clear_soft_dirty(pmd);
> > > > +}
> > > > +
> > > > +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> > > > +{
> > > > +     return pmd_mksoft_dirty(pmd);
> > > > +}
> > > > +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> > > > +#endif /* CONFIG_ARM64_HW_AFDBM */
> > > >
> > > >  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
> > > >  #define ptep_modify_prot_start ptep_modify_prot_start
> > > > --
> > > > 2.41.0
> > > >
> > >
> >
>
  
Mark Rutland July 4, 2023, 4:27 p.m. UTC | #5
On Tue, Jul 04, 2023 at 12:15:18PM -0400, Nico Pache wrote:
> Thanks Mark, that explanation is very helpful and makes sense.
> 
> Sorry I dont normally work this close to hardware, let alone ARM
> hardware, so my understanding of all this is still growing. I mistook
> Anshuman's point as me missing a corner case, not that it was
> downright wrong.

No problem; thanks for confirming.

> One last thing, could the AF bit be used instead of the PTE_DIRTY to
> determine if a page is DIRTY & !WRITE?
> ie) pte_dirty(pte) = pte_hw_dirty(pte) || (pte_young(pte) && !pte_write(pte)

The AF tracks both reads and write accesses, and we need to be able to clear
that via pte_mkyoung() regardless of whether the page is dirty.

> or would this create cases of pages being considered dirty when they
> have only been read?

Yes -- that would happen, and would be a problem as it would significantly
amplify the set of pages we thought of as dirty (and I suspect might surprise
some code handling pages that are never mapped as writeable, since the dirty
state owuld be unexpected).

Thanks,
Mark.

> 
> Cheers,
> -- Nico
> 
> On Tue, Jul 4, 2023 at 11:18 AM Mark Rutland <mark.rutland@arm.com> wrote:
> >
> > On Tue, Jul 04, 2023 at 10:49:06AM -0400, Nico Pache wrote:
> > > Hi Mark,
> > >
> > > On Tue, Jul 4, 2023 at 10:19 AM Mark Rutland <mark.rutland@arm.com> wrote:
> > > >
> > > > On Tue, Jul 04, 2023 at 09:36:33AM -0400, Nico Pache wrote:
> > > > > ARM64 has a soft-dirty bit (software dirty) but never properly defines
> > > > > CONFIG_ARCH_HAS_SOFT_DIRTY or its necessary functions. This patch
> > > > > introduces the ability to set/clear the soft dirty bit in a similar
> > > > > manner as the other arches that utilize it.
> > > >
> > > > Anshuman already explained that this is not correct -- to enable
> > > > CONFIG_ARCH_HAS_SOFT_DIRTY, you need *another* PTE bit. Please don't send
> > > > another version following this approach.
> > > >
> > > > Despite its name, pte_sw_dirty() has nothing to do with
> > > > CONFIG_ARCH_HAS_SOFT_DIRTY. We have pte_hw_dirty() and pte_sw_dirty() because
> > > > with Hardware Dirty bit management the HW dirty bit is *also* the write
> > > > permission bit, and to have a dirty non-writeable PTE state we have to use a SW
> > > > bit, which is what pte_sw_dirty() handles. Both pte_hw_dirty() and
> > > > pte_sw_dirty() comprise the regular dirty state.
> > > >
> > > > That's *very* different from CONFIG_ARCH_HAS_SOFT_DIRTY, which is about having
> > > > a *separate* software dirty state that can be used for longer-term dirty
> > > > tracking (whether the page was last touched since some management SW
> > > > manipulated the page).
> > > >
> > > > > However, we must be careful... there are cases where the DBM bit is not
> > > > > available and the software dirty bit plays a essential role in determining
> > > > > whether or not a page is dirty. In these cases we must not allow the
> > > > > user to clear the software dirty bit. We can check for these cases by
> > > > > utilizing the arch_has_hw_pte_young() function which tests the availability
> > > > > of DBM.
> > > >
> > > > Regardless of the above, this doesn't seem to have been thought through. why
> > > > would it be ok for this to work or not work dependent on DBM?
> > > It was from my understanding of both reading the code, and the
> > > following chart that the PTE_DIRTY bit was only used in the absence of
> > > the DBM bit to determine the dirty state of a page.
> >
> > The PTE_DIRTY bit is used regardless of DBM, for example, in the case I
> > mentioned of a dirty non-writeable page. Without PTE_DIRTY we'd have no way to
> > represent a write-protected dirty page.
> >
> > See pte_wrprotect(), which copies moves HW dirty bit into the PTE_DIRTY bit
> > when removing write permission:
> >
> > | static inline pte_t pte_wrprotect(pte_t pte)
> > | {
> > |         /*
> > |          * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
> > |          * clear), set the PTE_DIRTY bit.
> > |          */
> > |         if (pte_hw_dirty(pte))
> > |                 pte = pte_mkdirty(pte);
> > |
> > |         pte = clear_pte_bit(pte, __pgprot(PTE_WRITE));
> > |         pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> > |         return pte;
> > | }
> >
> > ... where pte_mkdirty() is:
> >
> > | static inline pte_t pte_mkdirty(pte_t pte)
> > | {
> > |         pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > |
> > |         if (pte_write(pte))
> > |                 pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
> > |
> > |         return pte;
> > | }
> >
> > > /*
> > >  * PTE bits configuration in the presence of hardware Dirty Bit Management
> > >  * (PTE_WRITE == PTE_DBM):
> > >  *
> > >  * Dirty  Writable | PTE_RDONLY  PTE_WRITE  PTE_DIRTY (sw)
> > >  *   0      0      |   1           0          0
> > >  *   0      1      |   1           1          0
> > >  *   1      0      |   1           0          1
> > >  *   1      1      |   0           1          x
> > >  *
> > >  * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via
> > >  * the page fault mechanism. Checking the dirty status of a pte becomes:
> > >  *
> > >  *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
> > >  */
> > >
> > > So from my understanding it seems that when DBM is present, it acts as
> > > the PTE_WRITE bit, and the AF bit is the HW dirty bit. This gives me
> > > the impression that the PTE_DIRTY bit is redundant; however, When DBM
> > > is not present PTE_DIRTY becomes crucial in determining the dirty
> > > state.
> >
> > As above, PTE_DIRTY is not redundant; regardless of DBM we need the PTE_DIRTY
> > bit for the regular dirty state. It distinguishes the first and third rows in
> > that table.
> >
> > Thanks,
> > Mark.
> >
> > >
> > > -- Nico
> > > >
> > > > Thanks,
> > > > Mark.
> > > >
> > > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > > Cc: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > > Cc: David Hildenbrand <david@redhat.com>
> > > > > Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
> > > > > Cc: Liu Shixin <liushixin2@huawei.com>
> > > > > Cc: Will Deacon <will@kernel.org>
> > > > > Cc: Yu Zhao <yuzhao@google.com>
> > > > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > > > ---
> > > > >  arch/arm64/Kconfig               |   1 +
> > > > >  arch/arm64/include/asm/pgtable.h | 104 ++++++++++++++++++++++++++-----
> > > > >  2 files changed, 90 insertions(+), 15 deletions(-)
> > > > >
> > > > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > > > index 7856c3a3e35a..6ea73b8148c5 100644
> > > > > --- a/arch/arm64/Kconfig
> > > > > +++ b/arch/arm64/Kconfig
> > > > > @@ -173,6 +173,7 @@ config ARM64
> > > > >       select HAVE_ARCH_PREL32_RELOCATIONS
> > > > >       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
> > > > >       select HAVE_ARCH_SECCOMP_FILTER
> > > > > +     select HAVE_ARCH_SOFT_DIRTY
> > > > >       select HAVE_ARCH_STACKLEAK
> > > > >       select HAVE_ARCH_THREAD_STRUCT_WHITELIST
> > > > >       select HAVE_ARCH_TRACEHOOK
> > > > > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > > > > index 0bd18de9fd97..c4970c9ed114 100644
> > > > > --- a/arch/arm64/include/asm/pgtable.h
> > > > > +++ b/arch/arm64/include/asm/pgtable.h
> > > > > @@ -51,6 +51,20 @@ static inline bool arch_thp_swp_supported(void)
> > > > >  }
> > > > >  #define arch_thp_swp_supported arch_thp_swp_supported
> > > > >
> > > > > +/*
> > > > > + * On arm64 without hardware Access Flag, copying from user will fail because
> > > > > + * the pte is old and cannot be marked young. So we always end up with zeroed
> > > > > + * page after fork() + CoW for pfn mappings. We don't always have a
> > > > > + * hardware-managed access flag on arm64.
> > > > > + */
> > > > > +#define arch_has_hw_pte_young                cpu_has_hw_af
> > > > > +
> > > > > +/*
> > > > > + * Experimentally, it's cheap to set the access flag in hardware and we
> > > > > + * benefit from prefaulting mappings as 'old' to start with.
> > > > > + */
> > > > > +#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > > > +
> > > > >  /*
> > > > >   * Outside of a few very special situations (e.g. hibernation), we always
> > > > >   * use broadcast TLB invalidation instructions, therefore a spurious page
> > > > > @@ -121,8 +135,9 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
> > > > >  })
> > > > >
> > > > >  #define pte_hw_dirty(pte)    (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> > > > > -#define pte_sw_dirty(pte)    (!!(pte_val(pte) & PTE_DIRTY))
> > > > > -#define pte_dirty(pte)               (pte_sw_dirty(pte) || pte_hw_dirty(pte))
> > > > > +#define pte_soft_dirty(pte)  (!!(pte_val(pte) & PTE_DIRTY))
> > > > > +#define pte_dirty(pte)               (pte_soft_dirty(pte) || pte_hw_dirty(pte))
> > > > > +#define pte_swp_soft_dirty(pte)      pte_soft_dirty(pte)
> > > > >
> > > > >  #define pte_valid(pte)               (!!(pte_val(pte) & PTE_VALID))
> > > > >  /*
> > > > > @@ -189,7 +204,8 @@ static inline pte_t pte_mkwrite(pte_t pte)
> > > > >
> > > > >  static inline pte_t pte_mkclean(pte_t pte)
> > > > >  {
> > > > > -     pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > > +     if (!arch_has_hw_pte_young())
> > > > > +             pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > >       pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
> > > > >
> > > > >       return pte;
> > > > > @@ -1077,25 +1093,83 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
> > > > >  #define phys_to_ttbr(addr)   (addr)
> > > > >  #endif
> > > > >
> > > > > -/*
> > > > > - * On arm64 without hardware Access Flag, copying from user will fail because
> > > > > - * the pte is old and cannot be marked young. So we always end up with zeroed
> > > > > - * page after fork() + CoW for pfn mappings. We don't always have a
> > > > > - * hardware-managed access flag on arm64.
> > > > > - */
> > > > > -#define arch_has_hw_pte_young                cpu_has_hw_af
> > > > > +static inline bool pud_sect_supported(void)
> > > > > +{
> > > > > +     return PAGE_SIZE == SZ_4K;
> > > > > +}
> > > > >
> > > > > +#ifdef CONFIG_ARM64_HW_AFDBM
> > > > >  /*
> > > > > - * Experimentally, it's cheap to set the access flag in hardware and we
> > > > > - * benefit from prefaulting mappings as 'old' to start with.
> > > > > + * if we have the DBM bit we can utilize the software dirty bit as
> > > > > + * a mechanism to introduce the soft_dirty functionality; however, without
> > > > > + * it this bit is crucial to determining if a entry is dirty and we cannot
> > > > > + * clear it via software. DBM can also be disabled or broken on some early
> > > > > + * armv8 devices, so check its availability before modifying it.
> > > > >   */
> > > > > -#define arch_wants_old_prefaulted_pte        cpu_has_hw_af
> > > > > +static inline pte_t pte_clear_soft_dirty(pte_t pte)
> > > > > +{
> > > > > +     if (!arch_has_hw_pte_young())
> > > > > +             return pte;
> > > > >
> > > > > -static inline bool pud_sect_supported(void)
> > > > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > > +}
> > > > > +
> > > > > +static inline pte_t pte_mksoft_dirty(pte_t pte)
> > > > >  {
> > > > > -     return PAGE_SIZE == SZ_4K;
> > > > > +     if (!arch_has_hw_pte_young())
> > > > > +             return pte;
> > > > > +
> > > > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > > +}
> > > > > +
> > > > > +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
> > > > > +{
> > > > > +     if (!arch_has_hw_pte_young())
> > > > > +             return pte;
> > > > > +
> > > > > +     return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > > +}
> > > > > +
> > > > > +static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
> > > > > +{
> > > > > +     if (!arch_has_hw_pte_young())
> > > > > +             return pte;
> > > > > +
> > > > > +     return set_pte_bit(pte, __pgprot(PTE_DIRTY));
> > > > > +}
> > > > > +
> > > > > +static inline int pmd_soft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pte_soft_dirty(pmd_pte(pmd));
> > > > > +}
> > > > > +
> > > > > +static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
> > > > > +}
> > > > > +
> > > > > +static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
> > > > >  }
> > > > >
> > > > > +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> > > > > +static inline int pmd_swp_soft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pmd_soft_dirty(pmd);
> > > > > +}
> > > > > +
> > > > > +static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pmd_clear_soft_dirty(pmd);
> > > > > +}
> > > > > +
> > > > > +static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
> > > > > +{
> > > > > +     return pmd_mksoft_dirty(pmd);
> > > > > +}
> > > > > +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
> > > > > +#endif /* CONFIG_ARM64_HW_AFDBM */
> > > > >
> > > > >  #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
> > > > >  #define ptep_modify_prot_start ptep_modify_prot_start
> > > > > --
> > > > > 2.41.0
> > > > >
> > > >
> > >
> >
>
  
kernel test robot July 11, 2023, 10:28 p.m. UTC | #6
Hi Nico,

kernel test robot noticed the following build errors:

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on arm/for-next arm/fixes kvmarm/next soc/for-next linus/master v6.5-rc1 next-20230711]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nico-Pache/arm64-properly-define-SOFT_DIRTY-functionality/20230704-213758
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link:    https://lore.kernel.org/r/20230704133633.1918147-1-npache%40redhat.com
patch subject: [PATCH V2] arm64: properly define SOFT_DIRTY functionality
config: arm64-randconfig-r001-20230712 (https://download.01.org/0day-ci/archive/20230712/202307120618.qcxXw9mm-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230712/202307120618.qcxXw9mm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202307120618.qcxXw9mm-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from include/linux/mm_inline.h:10,
                    from fs/proc/task_mmu.c:3:
   include/linux/swapops.h: In function 'pte_swp_clear_flags':
   include/linux/swapops.h:77:23: error: implicit declaration of function 'pte_swp_clear_soft_dirty'; did you mean 'pte_swp_soft_dirty'? [-Werror=implicit-function-declaration]
      77 |                 pte = pte_swp_clear_soft_dirty(pte);
         |                       ^~~~~~~~~~~~~~~~~~~~~~~~
         |                       pte_swp_soft_dirty
   include/linux/swapops.h:77:23: error: incompatible types when assigning to type 'pte_t' from type 'int'
   include/linux/swapops.h: In function 'pmd_to_swp_entry':
   include/linux/swapops.h:506:13: error: implicit declaration of function 'pmd_swp_soft_dirty'; did you mean 'pte_swp_soft_dirty'? [-Werror=implicit-function-declaration]
     506 |         if (pmd_swp_soft_dirty(pmd))
         |             ^~~~~~~~~~~~~~~~~~
         |             pte_swp_soft_dirty
   include/linux/swapops.h:507:23: error: implicit declaration of function 'pmd_swp_clear_soft_dirty'; did you mean 'pmd_swp_clear_uffd_wp'? [-Werror=implicit-function-declaration]
     507 |                 pmd = pmd_swp_clear_soft_dirty(pmd);
         |                       ^~~~~~~~~~~~~~~~~~~~~~~~
         |                       pmd_swp_clear_uffd_wp
   include/linux/swapops.h:507:23: error: incompatible types when assigning to type 'pmd_t' from type 'int'
   fs/proc/task_mmu.c: In function 'pagemap_pmd_range':
>> fs/proc/task_mmu.c:1488:29: error: implicit declaration of function 'pmd_soft_dirty'; did you mean 'pte_soft_dirty'? [-Werror=implicit-function-declaration]
    1488 |                         if (pmd_soft_dirty(pmd))
         |                             ^~~~~~~~~~~~~~
         |                             pte_soft_dirty
   cc1: some warnings being treated as errors


vim +1488 fs/proc/task_mmu.c

bcf8039ed45f56 Dave Hansen           2008-06-12  1463  
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1464  static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
2165009bdf63f7 Dave Hansen           2008-06-12  1465  			     struct mm_walk *walk)
85863e475e59af Matt Mackall          2008-02-04  1466  {
f995ece24dfecb Naoya Horiguchi       2015-02-11  1467  	struct vm_area_struct *vma = walk->vma;
2165009bdf63f7 Dave Hansen           2008-06-12  1468  	struct pagemapread *pm = walk->private;
bf929152e9f6c4 Kirill A. Shutemov    2013-11-14  1469  	spinlock_t *ptl;
05fbf357d94152 Konstantin Khlebnikov 2015-02-11  1470  	pte_t *pte, *orig_pte;
85863e475e59af Matt Mackall          2008-02-04  1471  	int err = 0;
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1472  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
24d7275ce27918 Yang Shi              2022-02-11  1473  	bool migration = false;
24d7275ce27918 Yang Shi              2022-02-11  1474  
b6ec57f4b92e9b Kirill A. Shutemov    2016-01-21  1475  	ptl = pmd_trans_huge_lock(pmdp, vma);
b6ec57f4b92e9b Kirill A. Shutemov    2016-01-21  1476  	if (ptl) {
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1477  		u64 flags = 0, frame = 0;
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1478  		pmd_t pmd = *pmdp;
84c3fc4e9c563d Zi Yan                2017-09-08  1479  		struct page *page = NULL;
0f8975ec4db2c8 Pavel Emelyanov       2013-07-03  1480  
b83d7e432399d4 Huang Ying            2017-11-02  1481  		if (vma->vm_flags & VM_SOFTDIRTY)
deb945441b9408 Konstantin Khlebnikov 2015-09-08  1482  			flags |= PM_SOFT_DIRTY;
d9104d1ca96624 Cyrill Gorcunov       2013-09-11  1483  
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1484  		if (pmd_present(pmd)) {
84c3fc4e9c563d Zi Yan                2017-09-08  1485  			page = pmd_page(pmd);
77bb499bb60f4b Konstantin Khlebnikov 2015-09-08  1486  
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1487  			flags |= PM_PRESENT;
b83d7e432399d4 Huang Ying            2017-11-02 @1488  			if (pmd_soft_dirty(pmd))
b83d7e432399d4 Huang Ying            2017-11-02  1489  				flags |= PM_SOFT_DIRTY;
fb8e37f35a2fe1 Peter Xu              2021-06-30  1490  			if (pmd_uffd_wp(pmd))
fb8e37f35a2fe1 Peter Xu              2021-06-30  1491  				flags |= PM_UFFD_WP;
1c90308e7a77af Konstantin Khlebnikov 2015-09-08  1492  			if (pm->show_pfn)
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1493  				frame = pmd_pfn(pmd) +
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1494  					((addr & ~PMD_MASK) >> PAGE_SHIFT);
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1495  		}
84c3fc4e9c563d Zi Yan                2017-09-08  1496  #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
84c3fc4e9c563d Zi Yan                2017-09-08  1497  		else if (is_swap_pmd(pmd)) {
84c3fc4e9c563d Zi Yan                2017-09-08  1498  			swp_entry_t entry = pmd_to_swp_entry(pmd);
ab6ecf247a9321 Huang Ying            2018-06-07  1499  			unsigned long offset;
84c3fc4e9c563d Zi Yan                2017-09-08  1500  
ab6ecf247a9321 Huang Ying            2018-06-07  1501  			if (pm->show_pfn) {
0d206b5d2e0d7d Peter Xu              2022-08-11  1502  				if (is_pfn_swap_entry(entry))
0d206b5d2e0d7d Peter Xu              2022-08-11  1503  					offset = swp_offset_pfn(entry);
0d206b5d2e0d7d Peter Xu              2022-08-11  1504  				else
0d206b5d2e0d7d Peter Xu              2022-08-11  1505  					offset = swp_offset(entry);
0d206b5d2e0d7d Peter Xu              2022-08-11  1506  				offset = offset +
ab6ecf247a9321 Huang Ying            2018-06-07  1507  					((addr & ~PMD_MASK) >> PAGE_SHIFT);
84c3fc4e9c563d Zi Yan                2017-09-08  1508  				frame = swp_type(entry) |
88c28f2469151b Huang Ying            2018-04-20  1509  					(offset << MAX_SWAPFILES_SHIFT);
ab6ecf247a9321 Huang Ying            2018-06-07  1510  			}
84c3fc4e9c563d Zi Yan                2017-09-08  1511  			flags |= PM_SWAP;
b83d7e432399d4 Huang Ying            2017-11-02  1512  			if (pmd_swp_soft_dirty(pmd))
b83d7e432399d4 Huang Ying            2017-11-02  1513  				flags |= PM_SOFT_DIRTY;
fb8e37f35a2fe1 Peter Xu              2021-06-30  1514  			if (pmd_swp_uffd_wp(pmd))
fb8e37f35a2fe1 Peter Xu              2021-06-30  1515  				flags |= PM_UFFD_WP;
84c3fc4e9c563d Zi Yan                2017-09-08  1516  			VM_BUG_ON(!is_pmd_migration_entry(pmd));
24d7275ce27918 Yang Shi              2022-02-11  1517  			migration = is_migration_entry(entry);
af5cdaf82238fb Alistair Popple       2021-06-30  1518  			page = pfn_swap_entry_to_page(entry);
84c3fc4e9c563d Zi Yan                2017-09-08  1519  		}
84c3fc4e9c563d Zi Yan                2017-09-08  1520  #endif
84c3fc4e9c563d Zi Yan                2017-09-08  1521  
24d7275ce27918 Yang Shi              2022-02-11  1522  		if (page && !migration && page_mapcount(page) == 1)
84c3fc4e9c563d Zi Yan                2017-09-08  1523  			flags |= PM_MMAP_EXCLUSIVE;
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1524  
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1525  		for (; addr != end; addr += PAGE_SIZE) {
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1526  			pagemap_entry_t pme = make_pme(frame, flags);
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1527  
092b50bacd1cdb Naoya Horiguchi       2012-03-21  1528  			err = add_to_pagemap(addr, &pme, pm);
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1529  			if (err)
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1530  				break;
ab6ecf247a9321 Huang Ying            2018-06-07  1531  			if (pm->show_pfn) {
ab6ecf247a9321 Huang Ying            2018-06-07  1532  				if (flags & PM_PRESENT)
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1533  					frame++;
88c28f2469151b Huang Ying            2018-04-20  1534  				else if (flags & PM_SWAP)
88c28f2469151b Huang Ying            2018-04-20  1535  					frame += (1 << MAX_SWAPFILES_SHIFT);
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1536  			}
ab6ecf247a9321 Huang Ying            2018-06-07  1537  		}
bf929152e9f6c4 Kirill A. Shutemov    2013-11-14  1538  		spin_unlock(ptl);
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1539  		return err;
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1540  	}
5aaabe831eb527 Naoya Horiguchi       2012-03-21  1541  
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1542  	if (pmd_trans_unstable(pmdp))
45f83cefe3a5f0 Andrea Arcangeli      2012-03-28  1543  		return 0;
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1544  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
bcf8039ed45f56 Dave Hansen           2008-06-12  1545  
81d0fa623c5b8d Peter Feiner          2014-10-09  1546  	/*
f995ece24dfecb Naoya Horiguchi       2015-02-11  1547  	 * We can assume that @vma always points to a valid one and @end never
f995ece24dfecb Naoya Horiguchi       2015-02-11  1548  	 * goes beyond vma->vm_end.
81d0fa623c5b8d Peter Feiner          2014-10-09  1549  	 */
356515e7b64c26 Konstantin Khlebnikov 2015-09-08  1550  	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
f995ece24dfecb Naoya Horiguchi       2015-02-11  1551  	for (; addr < end; pte++, addr += PAGE_SIZE) {
81d0fa623c5b8d Peter Feiner          2014-10-09  1552  		pagemap_entry_t pme;
05fbf357d94152 Konstantin Khlebnikov 2015-02-11  1553  
deb945441b9408 Konstantin Khlebnikov 2015-09-08  1554  		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
092b50bacd1cdb Naoya Horiguchi       2012-03-21  1555  		err = add_to_pagemap(addr, &pme, pm);
85863e475e59af Matt Mackall          2008-02-04  1556  		if (err)
05fbf357d94152 Konstantin Khlebnikov 2015-02-11  1557  			break;
85863e475e59af Matt Mackall          2008-02-04  1558  	}
05fbf357d94152 Konstantin Khlebnikov 2015-02-11  1559  	pte_unmap_unlock(orig_pte, ptl);
05fbf357d94152 Konstantin Khlebnikov 2015-02-11  1560  
85863e475e59af Matt Mackall          2008-02-04  1561  	cond_resched();
85863e475e59af Matt Mackall          2008-02-04  1562  
85863e475e59af Matt Mackall          2008-02-04  1563  	return err;
85863e475e59af Matt Mackall          2008-02-04  1564  }
85863e475e59af Matt Mackall          2008-02-04  1565
  

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7856c3a3e35a..6ea73b8148c5 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -173,6 +173,7 @@  config ARM64
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0bd18de9fd97..c4970c9ed114 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -51,6 +51,20 @@  static inline bool arch_thp_swp_supported(void)
 }
 #define arch_thp_swp_supported arch_thp_swp_supported
 
+/*
+ * On arm64 without hardware Access Flag, copying from user will fail because
+ * the pte is old and cannot be marked young. So we always end up with zeroed
+ * page after fork() + CoW for pfn mappings. We don't always have a
+ * hardware-managed access flag on arm64.
+ */
+#define arch_has_hw_pte_young		cpu_has_hw_af
+
+/*
+ * Experimentally, it's cheap to set the access flag in hardware and we
+ * benefit from prefaulting mappings as 'old' to start with.
+ */
+#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
+
 /*
  * Outside of a few very special situations (e.g. hibernation), we always
  * use broadcast TLB invalidation instructions, therefore a spurious page
@@ -121,8 +135,9 @@  static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 })
 
 #define pte_hw_dirty(pte)	(pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
-#define pte_sw_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))
-#define pte_dirty(pte)		(pte_sw_dirty(pte) || pte_hw_dirty(pte))
+#define pte_soft_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))
+#define pte_dirty(pte)		(pte_soft_dirty(pte) || pte_hw_dirty(pte))
+#define pte_swp_soft_dirty(pte)	pte_soft_dirty(pte)
 
 #define pte_valid(pte)		(!!(pte_val(pte) & PTE_VALID))
 /*
@@ -189,7 +204,8 @@  static inline pte_t pte_mkwrite(pte_t pte)
 
 static inline pte_t pte_mkclean(pte_t pte)
 {
-	pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+	if (!arch_has_hw_pte_young())
+		pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
 	pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
 
 	return pte;
@@ -1077,25 +1093,83 @@  static inline void update_mmu_cache(struct vm_area_struct *vma,
 #define phys_to_ttbr(addr)	(addr)
 #endif
 
-/*
- * On arm64 without hardware Access Flag, copying from user will fail because
- * the pte is old and cannot be marked young. So we always end up with zeroed
- * page after fork() + CoW for pfn mappings. We don't always have a
- * hardware-managed access flag on arm64.
- */
-#define arch_has_hw_pte_young		cpu_has_hw_af
+static inline bool pud_sect_supported(void)
+{
+	return PAGE_SIZE == SZ_4K;
+}
 
+#ifdef CONFIG_ARM64_HW_AFDBM
 /*
- * Experimentally, it's cheap to set the access flag in hardware and we
- * benefit from prefaulting mappings as 'old' to start with.
+ * if we have the DBM bit we can utilize the software dirty bit as
+ * a mechanism to introduce the soft_dirty functionality; however, without
+ * it this bit is crucial to determining if a entry is dirty and we cannot
+ * clear it via software. DBM can also be disabled or broken on some early
+ * armv8 devices, so check its availability before modifying it.
  */
-#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+	if (!arch_has_hw_pte_young())
+		return pte;
 
-static inline bool pud_sect_supported(void)
+	return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+}
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
 {
-	return PAGE_SIZE == SZ_4K;
+	if (!arch_has_hw_pte_young())
+		return pte;
+
+	return set_pte_bit(pte, __pgprot(PTE_DIRTY));
+}
+
+static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
+{
+	if (!arch_has_hw_pte_young())
+		return pte;
+
+	return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+}
+
+static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
+{
+	if (!arch_has_hw_pte_young())
+		return pte;
+
+	return set_pte_bit(pte, __pgprot(PTE_DIRTY));
+}
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return pte_soft_dirty(pmd_pte(pmd));
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
 }
 
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static inline int pmd_swp_soft_dirty(pmd_t pmd)
+{
+	return pmd_soft_dirty(pmd);
+}
+
+static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
+{
+	return pmd_clear_soft_dirty(pmd);
+}
+
+static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
+{
+	return pmd_mksoft_dirty(pmd);
+}
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+#endif /* CONFIG_ARM64_HW_AFDBM */
 
 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
 #define ptep_modify_prot_start ptep_modify_prot_start