[V2] arm64/mm: Intercept pfn changes in set_pte_at()

Message ID 20230109052816.405335-1-anshuman.khandual@arm.com
State New
Series [V2] arm64/mm: Intercept pfn changes in set_pte_at()

Commit Message

Anshuman Khandual Jan. 9, 2023, 5:28 a.m. UTC
Changing the pfn of a user page table mapped entry without first going through
the break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
to intercept such changes, via an updated pgattr_change_is_safe(). This new
check happens via __check_racy_pte_update(), which has now been renamed to
__check_safe_pte_update().

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
This applies on v6.2-rc3. This patch had some test time on an internal CI
system without any issues being reported.

Changes in V1:

https://lore.kernel.org/all/20221116031001.292236-1-anshuman.khandual@arm.com/

 arch/arm64/include/asm/pgtable.h | 8 ++++++--
 arch/arm64/mm/mmu.c              | 8 +++++++-
 2 files changed, 13 insertions(+), 3 deletions(-)
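
For reference, the break-before-make sequence that this check is meant to
enforce for user mappings looks roughly like the sketch below (illustration
only, not part of the patch; the helper name is made up, and locking and MMU
notifier details are ignored):

	static void example_change_pfn(struct vm_area_struct *vma,
				       unsigned long addr,
				       pte_t *ptep, pte_t new_pte)
	{
		/* Break: tear down the old mapping and invalidate its TLB entry */
		ptep_clear_flush(vma, addr, ptep);

		/* Make: only now install the entry pointing at the new pfn */
		set_pte_at(vma->vm_mm, addr, ptep, new_pte);
	}

Going straight from one valid pfn to another with a plain set_pte_at(), i.e.
skipping the "break" step, is what the new VM_WARN_ONCE() check is there to
catch.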
  

Comments

Anshuman Khandual Jan. 24, 2023, 5:41 a.m. UTC | #1
On 1/9/23 10:58, Anshuman Khandual wrote:
> Changing pfn on a user page table mapped entry, without first going through
> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> to intercept such changes, via an updated pgattr_change_is_safe(). This new
> check happens via __check_racy_pte_update(), which has now been renamed as
> __check_safe_pte_update().
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> This applies on v6.2-rc3. This patch had some test time on an internal CI
> system without any issues being reported.

Gentle ping, any updates on this patch? Still any concerns?

> 
> Changes in V1:
> 
> https://lore.kernel.org/all/20221116031001.292236-1-anshuman.khandual@arm.com/
> 
>  arch/arm64/include/asm/pgtable.h | 8 ++++++--
>  arch/arm64/mm/mmu.c              | 8 +++++++-
>  2 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index b4bbeed80fb6..832c9c8fb58f 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -275,6 +275,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
>  }
>  
>  extern void __sync_icache_dcache(pte_t pteval);
> +bool pgattr_change_is_safe(u64 old, u64 new);
>  
>  /*
>   * PTE bits configuration in the presence of hardware Dirty Bit Management
> @@ -292,7 +293,7 @@ extern void __sync_icache_dcache(pte_t pteval);
>   *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
>   */
>  
> -static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
> +static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
>  					   pte_t pte)
>  {
>  	pte_t old_pte;
> @@ -318,6 +319,9 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
>  	VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
>  		     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
>  		     __func__, pte_val(old_pte), pte_val(pte));
> +	VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
> +		     "%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
> +		     __func__, pte_val(old_pte), pte_val(pte));
>  }
>  
>  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
> @@ -346,7 +350,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>  			mte_sync_tags(old_pte, pte);
>  	}
>  
> -	__check_racy_pte_update(mm, ptep, pte);
> +	__check_safe_pte_update(mm, ptep, pte);
>  
>  	set_pte(ptep, pte);
>  }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 14c87e8d69d8..a1d16b35c4f6 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -133,7 +133,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift)
>  	return phys;
>  }
>  
> -static bool pgattr_change_is_safe(u64 old, u64 new)
> +bool pgattr_change_is_safe(u64 old, u64 new)
>  {
>  	/*
>  	 * The following mapping attributes may be updated in live
> @@ -145,6 +145,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
>  	if (old == 0 || new == 0)
>  		return true;
>  
> +	/* If old and new ptes are valid, pfn should not change */
> +	if (pte_valid(__pte(old)) && pte_valid(__pte(new))) {
> +		if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
> +			return false;
> +	}
> +
>  	/* live contiguous mappings may not be manipulated at all */
>  	if ((old | new) & PTE_CONT)
>  		return false;
  
Will Deacon Jan. 26, 2023, 1:33 p.m. UTC | #2
On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
> On 1/9/23 10:58, Anshuman Khandual wrote:
> > Changing pfn on a user page table mapped entry, without first going through
> > break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> > to intercept such changes, via an updated pgattr_change_is_safe(). This new
> > check happens via __check_racy_pte_update(), which has now been renamed as
> > __check_safe_pte_update().
> > 
> > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > Cc: Will Deacon <will@kernel.org>
> > Cc: Mark Rutland <mark.rutland@arm.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: linux-arm-kernel@lists.infradead.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> > ---
> > This applies on v6.2-rc3. This patch had some test time on an internal CI
> > system without any issues being reported.
> 
> Gentle ping, any updates on this patch ? Still any concerns ?

I don't think we really got to the bottom of Mark's concerns with
unreachable ptes on the stack, did we? I also have vague recollections
of somebody (Robin?) running into issues with the vmap code not honouring
BBM.

So I think we should confirm/fix the vmap issue before we enable this check
and also try to get some testing coverage to address Mark's worries. I think
he has a syzkaller instance set up, so that sounds like a good place to
start.

Will
  
Robin Murphy Jan. 27, 2023, 12:43 p.m. UTC | #3
On 2023-01-26 13:33, Will Deacon wrote:
> On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
>> On 1/9/23 10:58, Anshuman Khandual wrote:
>>> Changing pfn on a user page table mapped entry, without first going through
>>> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
>>> to intercept such changes, via an updated pgattr_change_is_safe(). This new
>>> check happens via __check_racy_pte_update(), which has now been renamed as
>>> __check_safe_pte_update().
>>>
>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>> Cc: Will Deacon <will@kernel.org>
>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: linux-arm-kernel@lists.infradead.org
>>> Cc: linux-kernel@vger.kernel.org
>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>> ---
>>> This applies on v6.2-rc3. This patch had some test time on an internal CI
>>> system without any issues being reported.
>>
>> Gentle ping, any updates on this patch ? Still any concerns ?
> 
> I don't think we really got to the bottom of Mark's concerns with
> unreachable ptes on the stack, did we? I also have vague recollections
> of somebody (Robin?) running into issues with the vmap code not honouring
> BBM.

Doesn't ring a bell, so either it wasn't me, or it was many years ago 
and about 5 levels deep into trying to fix something else :/

> So I think we should confirm/fix the vmap issue before we enable this check
> and also try to get some testing coverage to address Mark's worries. I think
> he has a syzkaller instance set up, so that sound like a good place to
> start.

I think we're also missing a subtlety here in that this restriction 
doesn't *always* apply. For instance if someone wants to move a page by 
making the mapping read-only, copying the contents to a new page, then 
pointing the RO mapping at that new page, that should technically not 
require BBM.
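
To make that concrete, the sequence in question would look roughly like the
sketch below (hand-wavy illustration only; the helper name is invented and
locking, MMU notifiers, etc. are ignored). Note that step 3 changes the pfn
of a live, read-only entry without an invalid intermediate, which is exactly
the case the proposed warning would flag:

	static void example_move_page_ro(struct vm_area_struct *vma,
					 unsigned long addr, pte_t *ptep,
					 struct page *new_page)
	{
		pte_t old_pte = READ_ONCE(*ptep);

		/* 1. Make the existing mapping read-only, invalidate the TLB */
		set_pte_at(vma->vm_mm, addr, ptep, pte_wrprotect(old_pte));
		flush_tlb_page(vma, addr);

		/* 2. Copy the contents into the new page */
		copy_highpage(new_page, pte_page(old_pte));

		/* 3. Repoint the still read-only mapping at the new page */
		set_pte_at(vma->vm_mm, addr, ptep,
			   pte_wrprotect(mk_pte(new_page, vma->vm_page_prot)));
		flush_tlb_page(vma, addr);
	}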

Thanks,
Robin.
  
Mark Rutland Jan. 27, 2023, 3:14 p.m. UTC | #4
Hi Anshuman,

On Mon, Jan 09, 2023 at 10:58:16AM +0530, Anshuman Khandual wrote:
> Changing pfn on a user page table mapped entry, without first going through
> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> to intercept such changes, via an updated pgattr_change_is_safe(). This new
> check happens via __check_racy_pte_update(), which has now been renamed as
> __check_safe_pte_update().
> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
> This applies on v6.2-rc3. This patch had some test time on an internal CI
> system without any issues being reported.

Can you elaborate on this a little bit? It's not entirely clear what that
internal CI system has tested. It would be helpful if you could indicate:

* What sort of testing has been done by the CI system? e.g. is this just
  booting, running LTP, something else?

* Has this tried a bunch of configurations and/or machines?

* If any targeted stress tests have been used? e.g. stress-ng's memory system
  tests?

I'm assuming that's hitting LTP on a few machines/configs, which'd be
reasonable. It'd just be nice to confirm exactly what has been tested.

I've added this to my local syzkaller instance's test branch, and I'll shout if
that hits anything over the weekend.

> Changes in V1:
> 
> https://lore.kernel.org/all/20221116031001.292236-1-anshuman.khandual@arm.com/

Did you mean to list some changes here?

> 
>  arch/arm64/include/asm/pgtable.h | 8 ++++++--
>  arch/arm64/mm/mmu.c              | 8 +++++++-
>  2 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index b4bbeed80fb6..832c9c8fb58f 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -275,6 +275,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
>  }
>  
>  extern void __sync_icache_dcache(pte_t pteval);
> +bool pgattr_change_is_safe(u64 old, u64 new);
>  
>  /*
>   * PTE bits configuration in the presence of hardware Dirty Bit Management
> @@ -292,7 +293,7 @@ extern void __sync_icache_dcache(pte_t pteval);
>   *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
>   */
>  
> -static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
> +static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
>  					   pte_t pte)
>  {
>  	pte_t old_pte;
> @@ -318,6 +319,9 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
>  	VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
>  		     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
>  		     __func__, pte_val(old_pte), pte_val(pte));
> +	VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
> +		     "%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
> +		     __func__, pte_val(old_pte), pte_val(pte));
>  }
>  
>  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
> @@ -346,7 +350,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>  			mte_sync_tags(old_pte, pte);
>  	}
>  
> -	__check_racy_pte_update(mm, ptep, pte);
> +	__check_safe_pte_update(mm, ptep, pte);
>  
>  	set_pte(ptep, pte);
>  }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 14c87e8d69d8..a1d16b35c4f6 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -133,7 +133,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift)
>  	return phys;
>  }
>  
> -static bool pgattr_change_is_safe(u64 old, u64 new)
> +bool pgattr_change_is_safe(u64 old, u64 new)
>  {
>  	/*
>  	 * The following mapping attributes may be updated in live
> @@ -145,6 +145,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
>  	if (old == 0 || new == 0)
>  		return true;

These checks above should really use pte_valid(); we were just being lazy when
this was originally written since for the init_*() cases the memory should be
zero initially.

So could you make that:

	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

> +	/* If old and new ptes are valid, pfn should not change */
> +	if (pte_valid(__pte(old)) && pte_valid(__pte(new))) {
> +		if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
> +			return false;
> +	}

With the above change, it's clear that both must be valid to get this far, and
this check can be reduced to:


	/* A live entry's pfn should not change */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

With those changes:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Thanks,
Mark.
  
Mark Rutland Jan. 27, 2023, 3:16 p.m. UTC | #5
On Thu, Jan 26, 2023 at 01:33:22PM +0000, Will Deacon wrote:
> On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
> > On 1/9/23 10:58, Anshuman Khandual wrote:
> > > Changing pfn on a user page table mapped entry, without first going through
> > > break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> > > to intercept such changes, via an updated pgattr_change_is_safe(). This new
> > > check happens via __check_racy_pte_update(), which has now been renamed as
> > > __check_safe_pte_update().
> > > 
> > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > Cc: Will Deacon <will@kernel.org>
> > > Cc: Mark Rutland <mark.rutland@arm.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: linux-arm-kernel@lists.infradead.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> > > ---
> > > This applies on v6.2-rc3. This patch had some test time on an internal CI
> > > system without any issues being reported.
> > 
> > Gentle ping, any updates on this patch ? Still any concerns ?
> 
> I don't think we really got to the bottom of Mark's concerns with
> unreachable ptes on the stack, did we? I also have vague recollections
> of somebody (Robin?) running into issues with the vmap code not honouring
> BBM.
> 
> So I think we should confirm/fix the vmap issue before we enable this check
> and also try to get some testing coverage to address Mark's worries. I think
> he has a syzkaller instance set up, so that sound like a good place to
> start.

I've thrown my Syzkaller instance at this patch; if it doesn't find anything by
Monday I reckon we should pick this up.

That said, I had some minor nits on the patch; I'm not sure if you'd be happy
to apply the suggested changes when applying or if you'd prefer that Anshuman
applies those locally and sends a v3.

Thanks,
Mark.
  
Anshuman Khandual Jan. 30, 2023, 8:16 a.m. UTC | #6
On 1/27/23 20:46, Mark Rutland wrote:
> On Thu, Jan 26, 2023 at 01:33:22PM +0000, Will Deacon wrote:
>> On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
>>> On 1/9/23 10:58, Anshuman Khandual wrote:
>>>> Changing pfn on a user page table mapped entry, without first going through
>>>> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
>>>> to intercept such changes, via an updated pgattr_change_is_safe(). This new
>>>> check happens via __check_racy_pte_update(), which has now been renamed as
>>>> __check_safe_pte_update().
>>>>
>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>> Cc: Will Deacon <will@kernel.org>
>>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>> Cc: linux-arm-kernel@lists.infradead.org
>>>> Cc: linux-kernel@vger.kernel.org
>>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>>> ---
>>>> This applies on v6.2-rc3. This patch had some test time on an internal CI
>>>> system without any issues being reported.
>>>
>>> Gentle ping, any updates on this patch ? Still any concerns ?
>>
>> I don't think we really got to the bottom of Mark's concerns with
>> unreachable ptes on the stack, did we? I also have vague recollections
>> of somebody (Robin?) running into issues with the vmap code not honouring
>> BBM.
>>
>> So I think we should confirm/fix the vmap issue before we enable this check
>> and also try to get some testing coverage to address Mark's worries. I think
>> he has a syzkaller instance set up, so that sound like a good place to
>> start.
> 
> I've thrown my Syzkaller instance at this patch; if it doesn't find anything by
> Monday I reckon we should pick this up.
> 
> That said, I had some minor nits on the patch; I'm not sure if you'd be happy
> to apply the suggested changes when applying or if you'd prefer that Anshuman
> applies those locally and sense a v3.

I could send out a V3 after running some stress-ng based memory tests with the
suggested changes applied to the patch.
  
Mark Rutland Jan. 30, 2023, 10:08 a.m. UTC | #7
On Fri, Jan 27, 2023 at 03:16:52PM +0000, Mark Rutland wrote:
> On Thu, Jan 26, 2023 at 01:33:22PM +0000, Will Deacon wrote:
> > On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
> > > On 1/9/23 10:58, Anshuman Khandual wrote:
> > > > Changing pfn on a user page table mapped entry, without first going through
> > > > break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> > > > to intercept such changes, via an updated pgattr_change_is_safe(). This new
> > > > check happens via __check_racy_pte_update(), which has now been renamed as
> > > > __check_safe_pte_update().
> > > > 
> > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > Cc: Will Deacon <will@kernel.org>
> > > > Cc: Mark Rutland <mark.rutland@arm.com>
> > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > Cc: linux-arm-kernel@lists.infradead.org
> > > > Cc: linux-kernel@vger.kernel.org
> > > > Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > ---
> > > > This applies on v6.2-rc3. This patch had some test time on an internal CI
> > > > system without any issues being reported.
> > > 
> > > Gentle ping, any updates on this patch ? Still any concerns ?
> > 
> > I don't think we really got to the bottom of Mark's concerns with
> > unreachable ptes on the stack, did we? I also have vague recollections
> > of somebody (Robin?) running into issues with the vmap code not honouring
> > BBM.
> > 
> > So I think we should confirm/fix the vmap issue before we enable this check
> > and also try to get some testing coverage to address Mark's worries. I think
> > he has a syzkaller instance set up, so that sound like a good place to
> > start.
> 
> I've thrown my Syzkaller instance at this patch; if it doesn't find anything by
> Monday I reckon we should pick this up.

FWIW, that hasn't hit anything so far.

It would be good if we could explicitly note which mm test suite and/or stress
tests are happy with this, but otherwise this looks good to me.

Thanks,
Mark.

> That said, I had some minor nits on the patch; I'm not sure if you'd be happy
> to apply the suggested changes when applying or if you'd prefer that Anshuman
> applies those locally and sense a v3.
> 
> Thanks,
> Mark.
  
Anshuman Khandual Jan. 31, 2023, 2:57 a.m. UTC | #8
On 1/27/23 20:44, Mark Rutland wrote:
> Hi Annshuman,
> 
> On Mon, Jan 09, 2023 at 10:58:16AM +0530, Anshuman Khandual wrote:
>> Changing pfn on a user page table mapped entry, without first going through
>> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
>> to intercept such changes, via an updated pgattr_change_is_safe(). This new
>> check happens via __check_racy_pte_update(), which has now been renamed as
>> __check_safe_pte_update().
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: Mark Rutland <mark.rutland@arm.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>> This applies on v6.2-rc3. This patch had some test time on an internal CI
>> system without any issues being reported.
> 
> Can you elaborate on this a little bit? It's not entirely clear what that
> internal CI system has tested. It would be helpful if you could indicate:

Please find the details here, as learned from internal CI folks,

> 
> * What sort of testing has been done by the CI system? e.g. is this just
>   booting, running LTP, something else?

Tested on both host and guest, with CONFIG_DEBUG_VM enabled

- Booting
- LTP

> 
> * Has this tried a bunch of configurations and/or machines?

Tested on the following platforms

- LTP test on JUNO	(defconfig)
- LTP test on SOFTIRON 	(debugrun config)
- Kselftests arm64 KVM	(BASEAEM with defconfig)

> 
> * If any targetted stress tests have been used? e.g. stress-ng's memory system
>   tests?

I did run stress-ng memory system tests.

> 
> I'm assuming that's hitting LTP on a few machines/configs, which'd be
> reasonable. It'd just be nice to confirm exactly what has been tested.
> 
> I've added this to my lcoal syzkaller instance's test branch, and I'll shout if
> that hits anything over the weekend.
> 
>> Changes in V1:
>>
>> https://lore.kernel.org/all/20221116031001.292236-1-anshuman.khandual@arm.com/
> 
> Did you mean to list some cahnges here?

Actually there was no change between V1 and V2, other than just rebasing.

> 
>>
>>  arch/arm64/include/asm/pgtable.h | 8 ++++++--
>>  arch/arm64/mm/mmu.c              | 8 +++++++-
>>  2 files changed, 13 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index b4bbeed80fb6..832c9c8fb58f 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -275,6 +275,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
>>  }
>>  
>>  extern void __sync_icache_dcache(pte_t pteval);
>> +bool pgattr_change_is_safe(u64 old, u64 new);
>>  
>>  /*
>>   * PTE bits configuration in the presence of hardware Dirty Bit Management
>> @@ -292,7 +293,7 @@ extern void __sync_icache_dcache(pte_t pteval);
>>   *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
>>   */
>>  
>> -static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
>> +static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
>>  					   pte_t pte)
>>  {
>>  	pte_t old_pte;
>> @@ -318,6 +319,9 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
>>  	VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
>>  		     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
>>  		     __func__, pte_val(old_pte), pte_val(pte));
>> +	VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
>> +		     "%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
>> +		     __func__, pte_val(old_pte), pte_val(pte));
>>  }
>>  
>>  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>> @@ -346,7 +350,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
>>  			mte_sync_tags(old_pte, pte);
>>  	}
>>  
>> -	__check_racy_pte_update(mm, ptep, pte);
>> +	__check_safe_pte_update(mm, ptep, pte);
>>  
>>  	set_pte(ptep, pte);
>>  }
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 14c87e8d69d8..a1d16b35c4f6 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -133,7 +133,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift)
>>  	return phys;
>>  }
>>  
>> -static bool pgattr_change_is_safe(u64 old, u64 new)
>> +bool pgattr_change_is_safe(u64 old, u64 new)
>>  {
>>  	/*
>>  	 * The following mapping attributes may be updated in live
>> @@ -145,6 +145,12 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
>>  	if (old == 0 || new == 0)
>>  		return true;
> 
> These checks above should really use pte_valid(); we were just being lazy when
> this was originally written since for the init_*() cases the memory should be
> zero initially.
> 
> So could you make that:
> 
> 	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
> 		return true;
> 
>> +	/* If old and new ptes are valid, pfn should not change */
>> +	if (pte_valid(__pte(old)) && pte_valid(__pte(new))) {
>> +		if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
>> +			return false;
>> +	}
> 
> With the above change, it's clear that both must be valid to get this far, and
> this check can be reduced to:
> 
> 
> 	/* A live entry's pfn should not change */
> 	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
> 		return false;
> 
> With those changes:
> 
> Acked-by: Mark Rutland <mark.rutland@arm.com>

Sent out the V3 as suggested.

https://lore.kernel.org/all/20230130121457.1607675-1-anshuman.khandual@arm.com/
  
Will Deacon Jan. 31, 2023, 3:49 p.m. UTC | #9
On Fri, Jan 27, 2023 at 12:43:17PM +0000, Robin Murphy wrote:
> On 2023-01-26 13:33, Will Deacon wrote:
> > On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
> > > On 1/9/23 10:58, Anshuman Khandual wrote:
> > > > Changing pfn on a user page table mapped entry, without first going through
> > > > break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> > > > to intercept such changes, via an updated pgattr_change_is_safe(). This new
> > > > check happens via __check_racy_pte_update(), which has now been renamed as
> > > > __check_safe_pte_update().
> > > > 
> > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > Cc: Will Deacon <will@kernel.org>
> > > > Cc: Mark Rutland <mark.rutland@arm.com>
> > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > Cc: linux-arm-kernel@lists.infradead.org
> > > > Cc: linux-kernel@vger.kernel.org
> > > > Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > ---
> > > > This applies on v6.2-rc3. This patch had some test time on an internal CI
> > > > system without any issues being reported.
> > > 
> > > Gentle ping, any updates on this patch ? Still any concerns ?
> > 
> > I don't think we really got to the bottom of Mark's concerns with
> > unreachable ptes on the stack, did we? I also have vague recollections
> > of somebody (Robin?) running into issues with the vmap code not honouring
> > BBM.
> 
> Doesn't ring a bell, so either it wasn't me, or it was many years ago and
> about 5 levels deep into trying to fix something else :/

Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
Catalin -- any clues?

Will
  
Catalin Marinas Feb. 1, 2023, 12:20 p.m. UTC | #10
On Tue, Jan 31, 2023 at 03:49:51PM +0000, Will Deacon wrote:
> On Fri, Jan 27, 2023 at 12:43:17PM +0000, Robin Murphy wrote:
> > On 2023-01-26 13:33, Will Deacon wrote:
> > > On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
> > > > On 1/9/23 10:58, Anshuman Khandual wrote:
> > > > > Changing pfn on a user page table mapped entry, without first going through
> > > > > break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
> > > > > to intercept such changes, via an updated pgattr_change_is_safe(). This new
> > > > > check happens via __check_racy_pte_update(), which has now been renamed as
> > > > > __check_safe_pte_update().
> > > > > 
> > > > > Cc: Catalin Marinas <catalin.marinas@arm.com>
> > > > > Cc: Will Deacon <will@kernel.org>
> > > > > Cc: Mark Rutland <mark.rutland@arm.com>
> > > > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > > > Cc: linux-arm-kernel@lists.infradead.org
> > > > > Cc: linux-kernel@vger.kernel.org
> > > > > Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> > > > > ---
> > > > > This applies on v6.2-rc3. This patch had some test time on an internal CI
> > > > > system without any issues being reported.
> > > > 
> > > > Gentle ping, any updates on this patch ? Still any concerns ?
> > > 
> > > I don't think we really got to the bottom of Mark's concerns with
> > > unreachable ptes on the stack, did we? I also have vague recollections
> > > of somebody (Robin?) running into issues with the vmap code not honouring
> > > BBM.
> > 
> > Doesn't ring a bell, so either it wasn't me, or it was many years ago and
> > about 5 levels deep into trying to fix something else :/
> 
> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.

Indeed. The discussion with Anshuman started from this thread:

https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/

We already trip over the existing checks even without Anshuman's patch,
though only by chance. We are not setting the software PTE_DIRTY on the
new pte (we don't bother with this bit for kernel mappings).

Given that the vmemmap ptes are still live when such a change happens and
no-one has come up with a solution to the break-before-make problem, I propose
we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 27b2592698b0..5263454a5794 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -100,7 +100,6 @@ config ARM64
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
-	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	select ARCH_WANT_LD_ORPHAN_WARN
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
  
Muchun Song Feb. 2, 2023, 9:51 a.m. UTC | #11
> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
> 
> On Tue, Jan 31, 2023 at 03:49:51PM +0000, Will Deacon wrote:
>> On Fri, Jan 27, 2023 at 12:43:17PM +0000, Robin Murphy wrote:
>>> On 2023-01-26 13:33, Will Deacon wrote:
>>>> On Tue, Jan 24, 2023 at 11:11:49AM +0530, Anshuman Khandual wrote:
>>>>> On 1/9/23 10:58, Anshuman Khandual wrote:
>>>>>> Changing pfn on a user page table mapped entry, without first going through
>>>>>> break-before-make (BBM) procedure is unsafe. This just updates set_pte_at()
>>>>>> to intercept such changes, via an updated pgattr_change_is_safe(). This new
>>>>>> check happens via __check_racy_pte_update(), which has now been renamed as
>>>>>> __check_safe_pte_update().
>>>>>> 
>>>>>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>>>>>> Cc: Will Deacon <will@kernel.org>
>>>>>> Cc: Mark Rutland <mark.rutland@arm.com>
>>>>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>>>>> Cc: linux-arm-kernel@lists.infradead.org
>>>>>> Cc: linux-kernel@vger.kernel.org
>>>>>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>>>>>> ---
>>>>>> This applies on v6.2-rc3. This patch had some test time on an internal CI
>>>>>> system without any issues being reported.
>>>>> 
>>>>> Gentle ping, any updates on this patch ? Still any concerns ?
>>>> 
>>>> I don't think we really got to the bottom of Mark's concerns with
>>>> unreachable ptes on the stack, did we? I also have vague recollections
>>>> of somebody (Robin?) running into issues with the vmap code not honouring
>>>> BBM.
>>> 
>>> Doesn't ring a bell, so either it wasn't me, or it was many years ago and
>>> about 5 levels deep into trying to fix something else :/
>> 
>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
> 
> Indeed. The discussion with Anshuman started from this thread:
> 
> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
> 
> We already trip over the existing checks even without Anshuman's patch,
> though only by chance. We are not setting the software PTE_DIRTY on the
> new pte (we don't bother with this bit for kernel mappings).
> 
> Given that the vmemmap ptes are still live when such change happens and
> no-one came with a solution to the break-before-make problem, I propose
> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 27b2592698b0..5263454a5794 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -100,7 +100,6 @@ config ARM64
> 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> 	select ARCH_WANT_FRAME_POINTERS
> 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
> -	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP

Maybe that is a little overkill, as HVO can significantly minimize the
overhead of vmemmap on ARM64 servers for some workloads (like qemu and DPDK),
so I don't think disabling it is a good approach. Indeed, HVO broke BBM, but
the warning does not affect anything since the tail vmemmap pages are
supposed to be read-only. So I suggest skipping the warning when set_pte_at()
is given a vmemmap address. What do you think?

Muchun,
Thanks.

> 	select ARCH_WANT_LD_ORPHAN_WARN
> 	select ARCH_WANTS_NO_INSTR
> 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
> 
> -- 
> Catalin
  
Catalin Marinas Feb. 2, 2023, 10:45 a.m. UTC | #12
On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
> > On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
> > 
> > Indeed. The discussion with Anshuman started from this thread:
> > 
> > https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
> > 
> > We already trip over the existing checks even without Anshuman's patch,
> > though only by chance. We are not setting the software PTE_DIRTY on the
> > new pte (we don't bother with this bit for kernel mappings).
> > 
> > Given that the vmemmap ptes are still live when such change happens and
> > no-one came with a solution to the break-before-make problem, I propose
> > we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
> > cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
> > 
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 27b2592698b0..5263454a5794 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -100,7 +100,6 @@ config ARM64
> > 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> > 	select ARCH_WANT_FRAME_POINTERS
> > 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
> > -	select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> 
> Maybe it is a little overkill for HVO as it can significantly minimize the
> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
> but the waring does not affect anything since the tail vmemmap pages are
> supposed to be read-only. So, I suggest skipping warnings if it is the
> vmemmap address in set_pte_at(). What do you think of?

IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
changes the output address. Architecturally, this needs a BBM sequence.
We can avoid going through an invalid pte if we first make the pte
read-only, TLBI but keeping the same pfn, followed by a change of the
pfn while keeping the pte readonly. This also assumes that the content
of the page pointed at by the pte is the same at both old and new pfn.
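
Roughly something like the sketch below (completely untested, the helper name
is made up, and it still relies on the old and new pages having identical
contents while both can be observed):

	static void example_vmemmap_change_pfn(pte_t *ptep, unsigned long addr,
					       unsigned long new_pfn)
	{
		pte_t old_pte = READ_ONCE(*ptep);

		/* 1. Keep the same pfn but make it read-only, then TLBI */
		set_pte(ptep, pfn_pte(pte_pfn(old_pte), PAGE_KERNEL_RO));
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

		/* 2. Change the pfn while the entry stays read-only */
		set_pte(ptep, pfn_pte(new_pfn, PAGE_KERNEL_RO));
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
	}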
  
Muchun Song Feb. 3, 2023, 2:40 a.m. UTC | #13
> On Feb 2, 2023, at 18:45, Catalin Marinas <catalin.marinas@arm.com> wrote:
> 
> On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
>>> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
>>> 
>>> Indeed. The discussion with Anshuman started from this thread:
>>> 
>>> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
>>> 
>>> We already trip over the existing checks even without Anshuman's patch,
>>> though only by chance. We are not setting the software PTE_DIRTY on the
>>> new pte (we don't bother with this bit for kernel mappings).
>>> 
>>> Given that the vmemmap ptes are still live when such change happens and
>>> no-one came with a solution to the break-before-make problem, I propose
>>> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
>>> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
>>> 
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 27b2592698b0..5263454a5794 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -100,7 +100,6 @@ config ARM64
>>> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
>>> select ARCH_WANT_FRAME_POINTERS
>>> select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>>> - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>> 
>> Maybe it is a little overkill for HVO as it can significantly minimize the
>> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
>> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
>> but the waring does not affect anything since the tail vmemmap pages are
>> supposed to be read-only. So, I suggest skipping warnings if it is the
>> vmemmap address in set_pte_at(). What do you think of?
> 
> IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
> changes the output address. Architecturally, this needs a BBM sequence.
> We can avoid going through an invalid pte if we first make the pte
> read-only, TLBI but keeping the same pfn, followed by a change of the
> pfn while keeping the pte readonly. This also assumes that the content
> of the page pointed at by the pte is the same at both old and new pfn.

Right. I think using BBM is to avoid possibly creating multiple TLB entries
for the same address for an extremely short period. But accessing either the
old page or the new page is fine in this case. Is it acceptable for this
special case without using BBM?

Thanks,
Muchun.
  
Will Deacon Feb. 3, 2023, 10:10 a.m. UTC | #14
On Fri, Feb 03, 2023 at 10:40:18AM +0800, Muchun Song wrote:
> 
> 
> > On Feb 2, 2023, at 18:45, Catalin Marinas <catalin.marinas@arm.com> wrote:
> > 
> > On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
> >>> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
> >>> 
> >>> Indeed. The discussion with Anshuman started from this thread:
> >>> 
> >>> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
> >>> 
> >>> We already trip over the existing checks even without Anshuman's patch,
> >>> though only by chance. We are not setting the software PTE_DIRTY on the
> >>> new pte (we don't bother with this bit for kernel mappings).
> >>> 
> >>> Given that the vmemmap ptes are still live when such change happens and
> >>> no-one came with a solution to the break-before-make problem, I propose
> >>> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
> >>> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
> >>> 
> >>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> >>> index 27b2592698b0..5263454a5794 100644
> >>> --- a/arch/arm64/Kconfig
> >>> +++ b/arch/arm64/Kconfig
> >>> @@ -100,7 +100,6 @@ config ARM64
> >>> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> >>> select ARCH_WANT_FRAME_POINTERS
> >>> select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
> >>> - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> >> 
> >> Maybe it is a little overkill for HVO as it can significantly minimize the
> >> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
> >> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
> >> but the waring does not affect anything since the tail vmemmap pages are
> >> supposed to be read-only. So, I suggest skipping warnings if it is the
> >> vmemmap address in set_pte_at(). What do you think of?
> > 
> > IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
> > changes the output address. Architecturally, this needs a BBM sequence.
> > We can avoid going through an invalid pte if we first make the pte
> > read-only, TLBI but keeping the same pfn, followed by a change of the
> > pfn while keeping the pte readonly. This also assumes that the content
> > of the page pointed at by the pte is the same at both old and new pfn.
> 
> Right. I think using BBM is to avoid possibly creating multiple TLB entries
> for the same address for a extremely short period. But accessing either the
> old page or the new page is fine in this case. Is it acceptable for this
> special case without using BBM?

Sadly, the architecture allows the CPU to conjure up a mapping based on a
combination of the old and the new descriptor (a process known as
"amalgamation") so we _really_ need the BBM sequence.

I'm in favour of disabling the optimisation now and bringing it back once
we've got this fixed.

Will
  
Muchun Song Feb. 6, 2023, 3:28 a.m. UTC | #15
> On Feb 3, 2023, at 18:10, Will Deacon <will@kernel.org> wrote:
> 
> On Fri, Feb 03, 2023 at 10:40:18AM +0800, Muchun Song wrote:
>> 
>> 
>>> On Feb 2, 2023, at 18:45, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>> 
>>> On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
>>>>> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>>>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
>>>>> 
>>>>> Indeed. The discussion with Anshuman started from this thread:
>>>>> 
>>>>> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
>>>>> 
>>>>> We already trip over the existing checks even without Anshuman's patch,
>>>>> though only by chance. We are not setting the software PTE_DIRTY on the
>>>>> new pte (we don't bother with this bit for kernel mappings).
>>>>> 
>>>>> Given that the vmemmap ptes are still live when such change happens and
>>>>> no-one came with a solution to the break-before-make problem, I propose
>>>>> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
>>>>> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
>>>>> 
>>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>>> index 27b2592698b0..5263454a5794 100644
>>>>> --- a/arch/arm64/Kconfig
>>>>> +++ b/arch/arm64/Kconfig
>>>>> @@ -100,7 +100,6 @@ config ARM64
>>>>> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
>>>>> select ARCH_WANT_FRAME_POINTERS
>>>>> select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>>>>> - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>>> 
>>>> Maybe it is a little overkill for HVO as it can significantly minimize the
>>>> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
>>>> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
>>>> but the waring does not affect anything since the tail vmemmap pages are
>>>> supposed to be read-only. So, I suggest skipping warnings if it is the
>>>> vmemmap address in set_pte_at(). What do you think of?
>>> 
>>> IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
>>> changes the output address. Architecturally, this needs a BBM sequence.
>>> We can avoid going through an invalid pte if we first make the pte
>>> read-only, TLBI but keeping the same pfn, followed by a change of the
>>> pfn while keeping the pte readonly. This also assumes that the content
>>> of the page pointed at by the pte is the same at both old and new pfn.
>> 
>> Right. I think using BBM is to avoid possibly creating multiple TLB entries
>> for the same address for a extremely short period. But accessing either the
>> old page or the new page is fine in this case. Is it acceptable for this
>> special case without using BBM?
> 
> Sadly, the architecture allows the CPU to conjure up a mapping based on a
> combination of the old and the new descriptor (a process known as
> "amalgamation") so we _really_ need the BBM sequence.

I am not familiar with ARM64; what's the user-visible effect if this
"amalgamation" occurs?

Muchun,
Thanks.

> 
> I'm in favour of disabling the optimisation now and bringing it back once
> we've got this fixed.
> 
> Will
  
Will Deacon Feb. 7, 2023, 2:31 p.m. UTC | #16
On Mon, Feb 06, 2023 at 11:28:12AM +0800, Muchun Song wrote:
> 
> 
> > On Feb 3, 2023, at 18:10, Will Deacon <will@kernel.org> wrote:
> > 
> > On Fri, Feb 03, 2023 at 10:40:18AM +0800, Muchun Song wrote:
> >> 
> >> 
> >>> On Feb 2, 2023, at 18:45, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>> 
> >>> On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
> >>>>> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
> >>>>>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
> >>>>> 
> >>>>> Indeed. The discussion with Anshuman started from this thread:
> >>>>> 
> >>>>> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
> >>>>> 
> >>>>> We already trip over the existing checks even without Anshuman's patch,
> >>>>> though only by chance. We are not setting the software PTE_DIRTY on the
> >>>>> new pte (we don't bother with this bit for kernel mappings).
> >>>>> 
> >>>>> Given that the vmemmap ptes are still live when such change happens and
> >>>>> no-one came with a solution to the break-before-make problem, I propose
> >>>>> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
> >>>>> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
> >>>>> 
> >>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> >>>>> index 27b2592698b0..5263454a5794 100644
> >>>>> --- a/arch/arm64/Kconfig
> >>>>> +++ b/arch/arm64/Kconfig
> >>>>> @@ -100,7 +100,6 @@ config ARM64
> >>>>> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
> >>>>> select ARCH_WANT_FRAME_POINTERS
> >>>>> select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
> >>>>> - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> >>>> 
> >>>> Maybe it is a little overkill for HVO as it can significantly minimize the
> >>>> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
> >>>> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
> >>>> but the waring does not affect anything since the tail vmemmap pages are
> >>>> supposed to be read-only. So, I suggest skipping warnings if it is the
> >>>> vmemmap address in set_pte_at(). What do you think of?
> >>> 
> >>> IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
> >>> changes the output address. Architecturally, this needs a BBM sequence.
> >>> We can avoid going through an invalid pte if we first make the pte
> >>> read-only, TLBI but keeping the same pfn, followed by a change of the
> >>> pfn while keeping the pte readonly. This also assumes that the content
> >>> of the page pointed at by the pte is the same at both old and new pfn.
> >> 
> >> Right. I think using BBM is to avoid possibly creating multiple TLB entries
> >> for the same address for a extremely short period. But accessing either the
> >> old page or the new page is fine in this case. Is it acceptable for this
> >> special case without using BBM?
> > 
> > Sadly, the architecture allows the CPU to conjure up a mapping based on a
> > combination of the old and the new descriptor (a process known as
> > "amalgamation") so we _really_ need the BBM sequence.
> 
> I am not familiar with ARM64, what's the user-visible effect if this
> "amalgamation" occurs?

The user-visible effects would probably be data corruption and instability,
since the amalgamated TLB entry could result in a bogus physical address and
bogus permissions.

Will
  
Muchun Song Feb. 8, 2023, 3:13 a.m. UTC | #17
> On Feb 7, 2023, at 22:31, Will Deacon <will@kernel.org> wrote:
> 
> On Mon, Feb 06, 2023 at 11:28:12AM +0800, Muchun Song wrote:
>> 
>> 
>>> On Feb 3, 2023, at 18:10, Will Deacon <will@kernel.org> wrote:
>>> 
>>> On Fri, Feb 03, 2023 at 10:40:18AM +0800, Muchun Song wrote:
>>>> 
>>>> 
>>>>> On Feb 2, 2023, at 18:45, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>>> 
>>>>> On Thu, Feb 02, 2023 at 05:51:39PM +0800, Muchun Song wrote:
>>>>>>> On Feb 1, 2023, at 20:20, Catalin Marinas <catalin.marinas@arm.com> wrote:
>>>>>>>> Bah, sorry! Catalin reckons it may have been him talking about the vmemmap.
>>>>>>> 
>>>>>>> Indeed. The discussion with Anshuman started from this thread:
>>>>>>> 
>>>>>>> https://lore.kernel.org/all/20221025014215.3466904-1-mawupeng1@huawei.com/
>>>>>>> 
>>>>>>> We already trip over the existing checks even without Anshuman's patch,
>>>>>>> though only by chance. We are not setting the software PTE_DIRTY on the
>>>>>>> new pte (we don't bother with this bit for kernel mappings).
>>>>>>> 
>>>>>>> Given that the vmemmap ptes are still live when such change happens and
>>>>>>> no-one came with a solution to the break-before-make problem, I propose
>>>>>>> we revert the arm64 part of commit 47010c040dec ("mm: hugetlb_vmemmap:
>>>>>>> cleanup CONFIG_HUGETLB_PAGE_FREE_VMEMMAP*"). We just need this hunk:
>>>>>>> 
>>>>>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>>>>>> index 27b2592698b0..5263454a5794 100644
>>>>>>> --- a/arch/arm64/Kconfig
>>>>>>> +++ b/arch/arm64/Kconfig
>>>>>>> @@ -100,7 +100,6 @@ config ARM64
>>>>>>> select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
>>>>>>> select ARCH_WANT_FRAME_POINTERS
>>>>>>> select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>>>>>>> - select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
>>>>>> 
>>>>>> Maybe it is a little overkill for HVO as it can significantly minimize the
>>>>>> overhead of vmemmap on ARM64 servers for some workloads (like qemu, DPDK).
>>>>>> So I don't think disabling it is a good approach. Indeed, HVO broke BBM,
>>>>>> but the waring does not affect anything since the tail vmemmap pages are
>>>>>> supposed to be read-only. So, I suggest skipping warnings if it is the
>>>>>> vmemmap address in set_pte_at(). What do you think of?
>>>>> 
>>>>> IIUC, vmemmap_remap_pte() not only makes the pte read-only but also
>>>>> changes the output address. Architecturally, this needs a BBM sequence.
>>>>> We can avoid going through an invalid pte if we first make the pte
>>>>> read-only, TLBI but keeping the same pfn, followed by a change of the
>>>>> pfn while keeping the pte readonly. This also assumes that the content
>>>>> of the page pointed at by the pte is the same at both old and new pfn.
>>>> 
>>>> Right. I think using BBM is to avoid possibly creating multiple TLB entries
>>>> for the same address for a extremely short period. But accessing either the
>>>> old page or the new page is fine in this case. Is it acceptable for this
>>>> special case without using BBM?
>>> 
>>> Sadly, the architecture allows the CPU to conjure up a mapping based on a
>>> combination of the old and the new descriptor (a process known as
>>> "amalgamation") so we _really_ need the BBM sequence.
>> 
>> I am not familiar with ARM64, what's the user-visible effect if this
>> "amalgamation" occurs?
> 
> The user-visible effects would probably be data corruption and instability,
> since the amalgamated TLB entry could result in a bogus physical address and
> bogus permissions.

You mean the output address of the amalgamated TLB entry is neither the old
address (before the update) nor the new address (after the update)? So it is
a bogus physical address? Are there any specifications describing the rules
for how an amalgamated TLB entry is created? Thanks.

Muchun

> 
> Will
  
Mark Rutland Feb. 8, 2023, 5:27 p.m. UTC | #18
On Wed, Feb 08, 2023 at 11:13:46AM +0800, Muchun Song wrote:
> > On Feb 7, 2023, at 22:31, Will Deacon <will@kernel.org> wrote:
> > On Mon, Feb 06, 2023 at 11:28:12AM +0800, Muchun Song wrote:
> >> I am not familiar with ARM64, what's the user-visible effect if this
> >> "amalgamation" occurs?
> > 
> > The user-visible effects would probably be data corruption and instability,
> > since the amalgamated TLB entry could result in a bogus physical address and
> > bogus permissions.
> 
> You mean the output address of amalgamated TLB entry is neither the old
> address (before updated) nor the new address (after updated)?

Yes, that is one possible result.

> So it is a bogus physical address?

Yes, that is one possible result.

> Is there any specifications to describe the rules of how to create a
> amalgamated TLB entry? Thanks.

Unfortunately, this is not clearly specified in the ARM ARM, and we have to
take a pessimistic reading here. We assume that amalgamation is some arbitrary
function of the TLB entries which are hit (e.g. they might be OR'd together).
This is something that I'd like to have clarified further by Arm's architects.

The important thing to note is that amalgamation applies to *TLB entries*, not
the translation table entries that they were derived from. Since the TLB format
is micro-architecture dependent, and since the manner in which they might be
combined is arbitrary, the results of combining could be arbitrary (and
consequently, this is difficult to specify).

The architecture *does* provide a few restrictions (e.g. Stage-1 entries within
a VM can't escape Stage-2, NS entries can't create a secure physical address),
but beyond that we cannot make any assumptions.

So e.g. if you have 2 read-only entries for addresses A and B, amalgamation
could result in read-write-execute for a distinct address C.

It's not clear to me whether that could also affect hits for unrelated VAs.

So the short answer is that we have to treat this as CONSTRAINED UNPREDICTABLE,
and must avoid potential amalgamation by using Break-Before-Make.

Thanks,
Mark.
  
Muchun Song Feb. 10, 2023, 6:50 a.m. UTC | #19
> On Feb 9, 2023, at 01:27, Mark Rutland <mark.rutland@arm.com> wrote:
> 
> On Wed, Feb 08, 2023 at 11:13:46AM +0800, Muchun Song wrote:
>>> On Feb 7, 2023, at 22:31, Will Deacon <will@kernel.org> wrote:
>>> On Mon, Feb 06, 2023 at 11:28:12AM +0800, Muchun Song wrote:
>>>> I am not familiar with ARM64, what's the user-visible effect if this
>>>> "amalgamation" occurs?
>>> 
>>> The user-visible effects would probably be data corruption and instability,
>>> since the amalgamated TLB entry could result in a bogus physical address and
>>> bogus permissions.
>> 
>> You mean the output address of amalgamated TLB entry is neither the old
>> address (before updated) nor the new address (after updated)?
> 
> Yes, that is one possible result.
> 
>> So it is a bogus physical address?
> 
> Yes, that is one possible result.
> 
>> Is there any specifications to describe the rules of how to create a
>> amalgamated TLB entry? Thanks.
> 
> Unfortunately, this is not clearly specified in the ARM ARM, and we have to
> take a pessimistic reading here. We assume that amalgamation is some arbitrary
> function of the TLB entries which are hit (e.g. they might be OR'd together).
> This is something that I'd like to have clarified further by Arm's architects.
> 
> The important thing to note is that amalgamation applies to *TLB entries*, not
> the translation table entries that they were derived from. Since the TLB format
> is micro-architecture dependent, and since the manner in which they might be
> combined is arbitrary, the results of combining could be arbitrary (and
> consequently, this is difficult to specify).
> 
> The architecture *does* provide a few restrictions (e.g. Stage-1 entries within
> a VM can't escape Stage-2, NS entries can't create a secure physical address),
> but beyond that we cannot make any assumptions.
> 
> So e.g. if you have 2 read-only entries for addresses A and B, amalgamation
> could result in read-write-execute for a distinct address C.
> 
> It's not clear to me whether that could also affect hits for unrelated VAs.
> 
> So the short answer is that we have to treat this as CONSTRAINED UNPREDICTABLE,
> and must avoid potential amalgamation by using Break-Before-Make.

Thanks for your clear description. It's really helpful.

> 
> Thanks,
> Mark.
  

Patch

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b4bbeed80fb6..832c9c8fb58f 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -275,6 +275,7 @@  static inline void set_pte(pte_t *ptep, pte_t pte)
 }
 
 extern void __sync_icache_dcache(pte_t pteval);
+bool pgattr_change_is_safe(u64 old, u64 new);
 
 /*
  * PTE bits configuration in the presence of hardware Dirty Bit Management
@@ -292,7 +293,7 @@  extern void __sync_icache_dcache(pte_t pteval);
  *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
  */
 
-static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
+static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
 					   pte_t pte)
 {
 	pte_t old_pte;
@@ -318,6 +319,9 @@  static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
 	VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
 		     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
 		     __func__, pte_val(old_pte), pte_val(pte));
+	VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
+		     "%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
+		     __func__, pte_val(old_pte), pte_val(pte));
 }
 
 static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -346,7 +350,7 @@  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 			mte_sync_tags(old_pte, pte);
 	}
 
-	__check_racy_pte_update(mm, ptep, pte);
+	__check_safe_pte_update(mm, ptep, pte);
 
 	set_pte(ptep, pte);
 }
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 14c87e8d69d8..a1d16b35c4f6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -133,7 +133,7 @@  static phys_addr_t __init early_pgtable_alloc(int shift)
 	return phys;
 }
 
-static bool pgattr_change_is_safe(u64 old, u64 new)
+bool pgattr_change_is_safe(u64 old, u64 new)
 {
 	/*
 	 * The following mapping attributes may be updated in live
@@ -145,6 +145,12 @@  static bool pgattr_change_is_safe(u64 old, u64 new)
 	if (old == 0 || new == 0)
 		return true;
 
+	/* If old and new ptes are valid, pfn should not change */
+	if (pte_valid(__pte(old)) && pte_valid(__pte(new))) {
+		if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
+			return false;
+	}
+
 	/* live contiguous mappings may not be manipulated at all */
 	if ((old | new) & PTE_CONT)
 		return false;