[v2,07/23] mips: update_mmu_cache() can replace __update_tlb()

Message ID 178970b0-1539-8aac-76fd-972c6c46ec17@google.com
State New
Headers
Series arch: allow pte_offset_map[_lock]() to fail |

Commit Message

Hugh Dickins June 8, 2023, 7:17 p.m. UTC
  Don't make update_mmu_cache() a wrapper around __update_tlb(): call it
directly, and use the ptep (or pmdp) provided by the caller, instead of
re-calling pte_offset_map() - which would raise a question of whether a
pte_unmap() is needed to balance it.

Check whether the "ptep" provided by the caller is actually the pmdp,
instead of testing pmd_huge(): or test pmd_huge() too and warn if it
disagrees?  This is "hazardous" territory: needs review and testing.

Signed-off-by: Hugh Dickins <hughd@google.com>
---
 arch/mips/include/asm/pgtable.h | 15 +++------------
 arch/mips/mm/tlb-r3k.c          |  5 +++--
 arch/mips/mm/tlb-r4k.c          |  9 +++------
 3 files changed, 9 insertions(+), 20 deletions(-)
  

Comments

Nathan Chancellor June 14, 2023, 11:17 p.m. UTC | #1
Hi Hugh,

On Thu, Jun 08, 2023 at 12:17:24PM -0700, Hugh Dickins wrote:
> Don't make update_mmu_cache() a wrapper around __update_tlb(): call it
> directly, and use the ptep (or pmdp) provided by the caller, instead of
> re-calling pte_offset_map() - which would raise a question of whether a
> pte_unmap() is needed to balance it.
> 
> Check whether the "ptep" provided by the caller is actually the pmdp,
> instead of testing pmd_huge(): or test pmd_huge() too and warn if it
> disagrees?  This is "hazardous" territory: needs review and testing.
> 
> Signed-off-by: Hugh Dickins <hughd@google.com>
> ---
>  arch/mips/include/asm/pgtable.h | 15 +++------------
>  arch/mips/mm/tlb-r3k.c          |  5 +++--
>  arch/mips/mm/tlb-r4k.c          |  9 +++------
>  3 files changed, 9 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 574fa14ac8b2..9175dfab08d5 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -565,15 +565,8 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
>  }
>  #endif
>  
> -extern void __update_tlb(struct vm_area_struct *vma, unsigned long address,
> -	pte_t pte);
> -
> -static inline void update_mmu_cache(struct vm_area_struct *vma,
> -	unsigned long address, pte_t *ptep)
> -{
> -	pte_t pte = *ptep;
> -	__update_tlb(vma, address, pte);
> -}
> +extern void update_mmu_cache(struct vm_area_struct *vma,
> +	unsigned long address, pte_t *ptep);
>  
>  #define	__HAVE_ARCH_UPDATE_MMU_TLB
>  #define update_mmu_tlb	update_mmu_cache
> @@ -581,9 +574,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>  	unsigned long address, pmd_t *pmdp)
>  {
> -	pte_t pte = *(pte_t *)pmdp;
> -
> -	__update_tlb(vma, address, pte);
> +	update_mmu_cache(vma, address, (pte_t *)pmdp);
>  }
>  
>  /*
> diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c
> index 53dfa2b9316b..e5722cd8dd6d 100644
> --- a/arch/mips/mm/tlb-r3k.c
> +++ b/arch/mips/mm/tlb-r3k.c
> @@ -176,7 +176,8 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
>  	}
>  }
>  
> -void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
> +void update_mmu_cache(struct vm_area_struct *vma,
> +		      unsigned long address, pte_t *ptep)
>  {
>  	unsigned long asid_mask = cpu_asid_mask(&current_cpu_data);
>  	unsigned long flags;
> @@ -203,7 +204,7 @@ void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
>  	BARRIER;
>  	tlb_probe();
>  	idx = read_c0_index();
> -	write_c0_entrylo0(pte_val(pte));
> +	write_c0_entrylo0(pte_val(*ptep));
>  	write_c0_entryhi(address | pid);
>  	if (idx < 0) {					/* BARRIER */
>  		tlb_write_random();
> diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
> index 1b939abbe4ca..c96725d17cab 100644
> --- a/arch/mips/mm/tlb-r4k.c
> +++ b/arch/mips/mm/tlb-r4k.c
> @@ -290,14 +290,14 @@ void local_flush_tlb_one(unsigned long page)
>   * updates the TLB with the new pte(s), and another which also checks
>   * for the R4k "end of page" hardware bug and does the needy.
>   */
> -void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
> +void update_mmu_cache(struct vm_area_struct *vma,
> +		      unsigned long address, pte_t *ptep)
>  {
>  	unsigned long flags;
>  	pgd_t *pgdp;
>  	p4d_t *p4dp;
>  	pud_t *pudp;
>  	pmd_t *pmdp;
> -	pte_t *ptep;
>  	int idx, pid;
>  
>  	/*
> @@ -326,10 +326,9 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
>  	idx = read_c0_index();
>  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
>  	/* this could be a huge page  */
> -	if (pmd_huge(*pmdp)) {
> +	if (ptep == (pte_t *)pmdp) {
>  		unsigned long lo;
>  		write_c0_pagemask(PM_HUGE_MASK);
> -		ptep = (pte_t *)pmdp;
>  		lo = pte_to_entrylo(pte_val(*ptep));
>  		write_c0_entrylo0(lo);
>  		write_c0_entrylo1(lo + (HPAGE_SIZE >> 7));
> @@ -344,8 +343,6 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
>  	} else
>  #endif
>  	{
> -		ptep = pte_offset_map(pmdp, address);
> -
>  #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
>  #ifdef CONFIG_XPA
>  		write_c0_entrylo0(pte_to_entrylo(ptep->pte_high));
> -- 
> 2.35.3
> 

I just bisected a crash while powering down a MIPS machine in QEMU to
this change as commit 8044511d3893 ("mips: update_mmu_cache() can
replace __update_tlb()") in linux-next. Unfortunately, I can still
reproduce it with the existing fix you have for this change on the
mailing list, which is present in next-20230614.

I can reproduce it with the GCC 13.1.0 on kernel.org [1].

  $ make -skj"$(nproc)" ARCH=mips CROSS_COMPILE=mips-linux- mrproper malta_defconfig vmlinux

  $ qemu-system-mipsel \
      -display none \
      -nodefaults \
      -cpu 24Kf \
      -machine malta \
      -kernel vmlinux \
      -initrd rootfs.cpio \
      -m 512m \
      -serial mon:stdio
  ...
  Linux version 6.4.0-rc6-next-20230614 (nathan@dev-arch.thelio-3990X) (mips-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jun 14 16:13:02 MST 2023
  ...
  Run /init as init process
  process '/bin/busybox' started with executable stack
  do_page_fault(): sending SIGSEGV to init for invalid read access from 0000003c
  epc = 77b893dc in ld-uClibc-1.0.39.so[77b84000+8000]
  ra  = 77b8930c in ld-uClibc-1.0.39.so[77b84000+8000]
  Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
  ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---

The rootfs is available at [2] if it is needed. I am more than happy to
provide additional information or test patches if necessary.

[1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/
[2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230609-194440/mipsel-rootfs.cpio.zst

Cheers,
Nathan
  
Hugh Dickins June 15, 2023, 12:26 a.m. UTC | #2
On Wed, 14 Jun 2023, Nathan Chancellor wrote:

> Hi Hugh,
> 
> On Thu, Jun 08, 2023 at 12:17:24PM -0700, Hugh Dickins wrote:
> > Don't make update_mmu_cache() a wrapper around __update_tlb(): call it
> > directly, and use the ptep (or pmdp) provided by the caller, instead of
> > re-calling pte_offset_map() - which would raise a question of whether a
> > pte_unmap() is needed to balance it.
> > 
> > Check whether the "ptep" provided by the caller is actually the pmdp,
> > instead of testing pmd_huge(): or test pmd_huge() too and warn if it
> > disagrees?  This is "hazardous" territory: needs review and testing.
> > 
> > Signed-off-by: Hugh Dickins <hughd@google.com>
> > ---
> >  arch/mips/include/asm/pgtable.h | 15 +++------------
> >  arch/mips/mm/tlb-r3k.c          |  5 +++--
> >  arch/mips/mm/tlb-r4k.c          |  9 +++------
> >  3 files changed, 9 insertions(+), 20 deletions(-)
> > 
> > diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> > index 574fa14ac8b2..9175dfab08d5 100644
> > --- a/arch/mips/include/asm/pgtable.h
> > +++ b/arch/mips/include/asm/pgtable.h
> > @@ -565,15 +565,8 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
> >  }
> >  #endif
> >  
> > -extern void __update_tlb(struct vm_area_struct *vma, unsigned long address,
> > -	pte_t pte);
> > -
> > -static inline void update_mmu_cache(struct vm_area_struct *vma,
> > -	unsigned long address, pte_t *ptep)
> > -{
> > -	pte_t pte = *ptep;
> > -	__update_tlb(vma, address, pte);
> > -}
> > +extern void update_mmu_cache(struct vm_area_struct *vma,
> > +	unsigned long address, pte_t *ptep);
> >  
> >  #define	__HAVE_ARCH_UPDATE_MMU_TLB
> >  #define update_mmu_tlb	update_mmu_cache
> > @@ -581,9 +574,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
> >  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
> >  	unsigned long address, pmd_t *pmdp)
> >  {
> > -	pte_t pte = *(pte_t *)pmdp;
> > -
> > -	__update_tlb(vma, address, pte);
> > +	update_mmu_cache(vma, address, (pte_t *)pmdp);
> >  }
> >  
> >  /*
> > diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c
> > index 53dfa2b9316b..e5722cd8dd6d 100644
> > --- a/arch/mips/mm/tlb-r3k.c
> > +++ b/arch/mips/mm/tlb-r3k.c
> > @@ -176,7 +176,8 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
> >  	}
> >  }
> >  
> > -void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
> > +void update_mmu_cache(struct vm_area_struct *vma,
> > +		      unsigned long address, pte_t *ptep)
> >  {
> >  	unsigned long asid_mask = cpu_asid_mask(&current_cpu_data);
> >  	unsigned long flags;
> > @@ -203,7 +204,7 @@ void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
> >  	BARRIER;
> >  	tlb_probe();
> >  	idx = read_c0_index();
> > -	write_c0_entrylo0(pte_val(pte));
> > +	write_c0_entrylo0(pte_val(*ptep));
> >  	write_c0_entryhi(address | pid);
> >  	if (idx < 0) {					/* BARRIER */
> >  		tlb_write_random();
> > diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
> > index 1b939abbe4ca..c96725d17cab 100644
> > --- a/arch/mips/mm/tlb-r4k.c
> > +++ b/arch/mips/mm/tlb-r4k.c
> > @@ -290,14 +290,14 @@ void local_flush_tlb_one(unsigned long page)
> >   * updates the TLB with the new pte(s), and another which also checks
> >   * for the R4k "end of page" hardware bug and does the needy.
> >   */
> > -void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
> > +void update_mmu_cache(struct vm_area_struct *vma,
> > +		      unsigned long address, pte_t *ptep)
> >  {
> >  	unsigned long flags;
> >  	pgd_t *pgdp;
> >  	p4d_t *p4dp;
> >  	pud_t *pudp;
> >  	pmd_t *pmdp;
> > -	pte_t *ptep;
> >  	int idx, pid;
> >  
> >  	/*
> > @@ -326,10 +326,9 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
> >  	idx = read_c0_index();
> >  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
> >  	/* this could be a huge page  */
> > -	if (pmd_huge(*pmdp)) {
> > +	if (ptep == (pte_t *)pmdp) {
> >  		unsigned long lo;
> >  		write_c0_pagemask(PM_HUGE_MASK);
> > -		ptep = (pte_t *)pmdp;
> >  		lo = pte_to_entrylo(pte_val(*ptep));
> >  		write_c0_entrylo0(lo);
> >  		write_c0_entrylo1(lo + (HPAGE_SIZE >> 7));
> > @@ -344,8 +343,6 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
> >  	} else
> >  #endif
> >  	{
> > -		ptep = pte_offset_map(pmdp, address);
> > -
> >  #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
> >  #ifdef CONFIG_XPA
> >  		write_c0_entrylo0(pte_to_entrylo(ptep->pte_high));
> > -- 
> > 2.35.3
> > 
> 
> I just bisected a crash while powering down a MIPS machine in QEMU to
> this change as commit 8044511d3893 ("mips: update_mmu_cache() can
> replace __update_tlb()") in linux-next.

Thank you, Nathan, that's very helpful indeed.  This patch certainly knew
that it wanted testing, and I'm glad to hear that it is now seeing some.

While powering down?  The messages below look like it was just coming up,
but no doubt that's because you were bisecting (or because I'm unfamiliar
with what messages to expect there).  It's probably irrelevant information,
but I wonder whether the (V)machine worked well enough for a while before
you first powered down and spotted the problem, or whether it's never got
much further than trying to run init (busybox)?  I'm trying to get a feel
for whether the problem occurs under common or uncommon conditions.

> Unfortunately, I can still
> reproduce it with the existing fix you have for this change on the
> mailing list, which is present in next-20230614.

Right, that later fix was only for a build warning, nothing functional
(or at least I hoped that it wasn't making any functional difference).

Thanks a lot for the detailed instructions below: unfortunately, those
would draw me into a realm of testing I've never needed to enter before,
so a lot of time spent on setup and learning.  Usually, I just stare at
the source.

What this probably says is that I should revert most my cleanup there,
and keep as close to the existing code as possible.  But some change is
needed, and I may need to understand (or have a good guess at) what was
going wrong, to decide what kind of retreat will be successful.

Back to the source for a while: I hope I'll find examples in nearby MIPS
kernel source (and git history), which will hint at the right way forward.
Then send you a patch against next-20230614 to try, when I'm reasonably
confident that it's enough to satisfy my purpose, but likely not to waste
your time.

Thanks, until later,
Hugh

> 
> I can reproduce it with the GCC 13.1.0 on kernel.org [1].
> 
>   $ make -skj"$(nproc)" ARCH=mips CROSS_COMPILE=mips-linux- mrproper malta_defconfig vmlinux
> 
>   $ qemu-system-mipsel \
>       -display none \
>       -nodefaults \
>       -cpu 24Kf \
>       -machine malta \
>       -kernel vmlinux \
>       -initrd rootfs.cpio \
>       -m 512m \
>       -serial mon:stdio
>   ...
>   Linux version 6.4.0-rc6-next-20230614 (nathan@dev-arch.thelio-3990X) (mips-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jun 14 16:13:02 MST 2023
>   ...
>   Run /init as init process
>   process '/bin/busybox' started with executable stack
>   do_page_fault(): sending SIGSEGV to init for invalid read access from 0000003c
>   epc = 77b893dc in ld-uClibc-1.0.39.so[77b84000+8000]
>   ra  = 77b8930c in ld-uClibc-1.0.39.so[77b84000+8000]
>   Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
>   ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---
> 
> The rootfs is available at [2] if it is needed. I am more than happy to
> provide additional information or test patches if necessary.
> 
> [1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/
> [2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230609-194440/mipsel-rootfs.cpio.zst
> 
> Cheers,
> Nathan
  
Hugh Dickins June 15, 2023, 5:43 a.m. UTC | #3
On Wed, 14 Jun 2023, Hugh Dickins wrote:
> On Wed, 14 Jun 2023, Nathan Chancellor wrote:
> > 
> > I just bisected a crash while powering down a MIPS machine in QEMU to
> > this change as commit 8044511d3893 ("mips: update_mmu_cache() can
> > replace __update_tlb()") in linux-next.
> 
> Thank you, Nathan, that's very helpful indeed.  This patch certainly knew
> that it wanted testing, and I'm glad to hear that it is now seeing some.
> 
> While powering down?  The messages below look like it was just coming up,
> but no doubt that's because you were bisecting (or because I'm unfamiliar
> with what messages to expect there).  It's probably irrelevant information,
> but I wonder whether the (V)machine worked well enough for a while before
> you first powered down and spotted the problem, or whether it's never got
> much further than trying to run init (busybox)?  I'm trying to get a feel
> for whether the problem occurs under common or uncommon conditions.
> 
> > Unfortunately, I can still
> > reproduce it with the existing fix you have for this change on the
> > mailing list, which is present in next-20230614.
> 
> Right, that later fix was only for a build warning, nothing functional
> (or at least I hoped that it wasn't making any functional difference).
> 
> Thanks a lot for the detailed instructions below: unfortunately, those
> would draw me into a realm of testing I've never needed to enter before,
> so a lot of time spent on setup and learning.  Usually, I just stare at
> the source.
> 
> What this probably says is that I should revert most my cleanup there,
> and keep as close to the existing code as possible.  But some change is
> needed, and I may need to understand (or have a good guess at) what was
> going wrong, to decide what kind of retreat will be successful.
> 
> Back to the source for a while: I hope I'll find examples in nearby MIPS
> kernel source (and git history), which will hint at the right way forward.
> Then send you a patch against next-20230614 to try, when I'm reasonably
> confident that it's enough to satisfy my purpose, but likely not to waste
> your time.

I'm going to take advantage of your good nature by attaching
two alternative patches, either to go on top of next-20230614.

mips1.patch,
 arch/mips/mm/tlb-r4k.c |   12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

is by far my favourite.  I couldn't see anything wrong with what's
already there for mips, but it seems possible that (though I didn't
find it) somewhere calls update_mmu_cache_pmd() on a page table.  So
mips1.patch restores the pmd_huge() check, and cleans up further by
removing the silly pgdp, p4dp, pudp, pmdp stuff: the pointer has now
been passed in by the caller, why walk the tree again?  I should have
done it this way before.

But if that doesn't work, then I'm afraid it will have to be
mips2.patch,
 arch/mips/include/asm/pgtable.h |   15 ++++++++++++---
 arch/mips/mm/tlb-r3k.c          |    5 ++---
 arch/mips/mm/tlb-r4k.c          |   27 ++++++++++++++++++---------
 3 files changed, 32 insertions(+), 15 deletions(-)

which reverts all of the original patch and its build warning fix,
and does a pte_unmap() to balance the silly pte_offset_map() there;
with an apologetic comment for this being about the only place in
the tree where I have no idea what to do if ptep were NULL.

I do hope that you find the first fixes the breakage; but if not, then
I even more fervently hope that the second will, despite my hating it.
Touch wood for the first, fingers crossed for the second, thanks,

Hugh
  
Nathan Chancellor June 15, 2023, 3:50 p.m. UTC | #4
On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote:
> On Wed, 14 Jun 2023, Hugh Dickins wrote:
> > On Wed, 14 Jun 2023, Nathan Chancellor wrote:
> > > 
> > > I just bisected a crash while powering down a MIPS machine in QEMU to
> > > this change as commit 8044511d3893 ("mips: update_mmu_cache() can
> > > replace __update_tlb()") in linux-next.
> > 
> > Thank you, Nathan, that's very helpful indeed.  This patch certainly knew
> > that it wanted testing, and I'm glad to hear that it is now seeing some.
> > 
> > While powering down?  The messages below look like it was just coming up,
> > but no doubt that's because you were bisecting (or because I'm unfamiliar
> > with what messages to expect there).  It's probably irrelevant information,
> > but I wonder whether the (V)machine worked well enough for a while before
> > you first powered down and spotted the problem, or whether it's never got
> > much further than trying to run init (busybox)?  I'm trying to get a feel
> > for whether the problem occurs under common or uncommon conditions.

Ugh sorry, I have been looking into too many bugs lately and got my
wires crossed :) this is indeed a problem when running init (which is
busybox, this is a simple Buildroot file system).

> > > Unfortunately, I can still
> > > reproduce it with the existing fix you have for this change on the
> > > mailing list, which is present in next-20230614.
> > 
> > Right, that later fix was only for a build warning, nothing functional
> > (or at least I hoped that it wasn't making any functional difference).
> > 
> > Thanks a lot for the detailed instructions below: unfortunately, those
> > would draw me into a realm of testing I've never needed to enter before,
> > so a lot of time spent on setup and learning.  Usually, I just stare at
> > the source.
> > 
> > What this probably says is that I should revert most my cleanup there,
> > and keep as close to the existing code as possible.  But some change is
> > needed, and I may need to understand (or have a good guess at) what was
> > going wrong, to decide what kind of retreat will be successful.
> > 
> > Back to the source for a while: I hope I'll find examples in nearby MIPS
> > kernel source (and git history), which will hint at the right way forward.
> > Then send you a patch against next-20230614 to try, when I'm reasonably
> > confident that it's enough to satisfy my purpose, but likely not to waste
> > your time.
> 
> I'm going to take advantage of your good nature by attaching
> two alternative patches, either to go on top of next-20230614.
> 
> mips1.patch,
>  arch/mips/mm/tlb-r4k.c |   12 +-----------
>  1 file changed, 1 insertion(+), 11 deletions(-)
> 
> is by far my favourite.  I couldn't see anything wrong with what's
> already there for mips, but it seems possible that (though I didn't
> find it) somewhere calls update_mmu_cache_pmd() on a page table.  So
> mips1.patch restores the pmd_huge() check, and cleans up further by
> removing the silly pgdp, p4dp, pudp, pmdp stuff: the pointer has now
> been passed in by the caller, why walk the tree again?  I should have
> done it this way before.
> 
> But if that doesn't work, then I'm afraid it will have to be
> mips2.patch,
>  arch/mips/include/asm/pgtable.h |   15 ++++++++++++---
>  arch/mips/mm/tlb-r3k.c          |    5 ++---
>  arch/mips/mm/tlb-r4k.c          |   27 ++++++++++++++++++---------
>  3 files changed, 32 insertions(+), 15 deletions(-)
> 
> which reverts all of the original patch and its build warning fix,
> and does a pte_unmap() to balance the silly pte_offset_map() there;
> with an apologetic comment for this being about the only place in
> the tree where I have no idea what to do if ptep were NULL.
> 
> I do hope that you find the first fixes the breakage; but if not, then

I hate to be the bearer of bad news but the first patch did not fix the
breakage, I see the same issue.

> I even more fervently hope that the second will, despite my hating it.
> Touch wood for the first, fingers crossed for the second, thanks,

Thankfully, the second one does. Thanks for the quick and thoughtful
responses!

Cheers,
Nathan
  
Hugh Dickins June 15, 2023, 9:22 p.m. UTC | #5
On Thu, 15 Jun 2023, Nathan Chancellor wrote:
> On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote:
> > 
> > I do hope that you find the first fixes the breakage; but if not, then
> 
> I hate to be the bearer of bad news but the first patch did not fix the
> breakage, I see the same issue.

Boo!

> 
> > I even more fervently hope that the second will, despite my hating it.
> > Touch wood for the first, fingers crossed for the second, thanks,
> 
> Thankfully, the second one does. Thanks for the quick and thoughtful
> responses!

Hurrah!

Thanks a lot, Nathan.  I'll set aside my disappointment and curiosity,
clearly I'm not going to have much of a future as a MIPS programmer.

I must take a break, then rush Andrew the second patch, well, not
exactly that second patch, since most of that is revert: I'll just
send the few lines of replacement patch (with a new Subject line, as
update_mmu_cache() goes back to being separate from __update_tlb()).

Unless you object, I'll include a Tested-by: you.  I realize that
your testing is limited to seeing it running; but that's true of
most of the testing at this stage - it gets to be more interesting
when the patch that adds the rcu_read_lock() and rcu_read_unlock()
is added on top later.

Thanks again,
Hugh
  
Yu Zhao June 15, 2023, 10:07 p.m. UTC | #6
On Wed, Jun 14, 2023 at 04:17:58PM -0700, Nathan Chancellor wrote:
> Hi Hugh,
> 
> On Thu, Jun 08, 2023 at 12:17:24PM -0700, Hugh Dickins wrote:
> > Don't make update_mmu_cache() a wrapper around __update_tlb(): call it
> > directly, and use the ptep (or pmdp) provided by the caller, instead of
> > re-calling pte_offset_map() - which would raise a question of whether a
> > pte_unmap() is needed to balance it.
> > 
> > Check whether the "ptep" provided by the caller is actually the pmdp,
> > instead of testing pmd_huge(): or test pmd_huge() too and warn if it
> > disagrees?  This is "hazardous" territory: needs review and testing.
> > 
> > Signed-off-by: Hugh Dickins <hughd@google.com>
> > ---
> >  arch/mips/include/asm/pgtable.h | 15 +++------------
> >  arch/mips/mm/tlb-r3k.c          |  5 +++--
> >  arch/mips/mm/tlb-r4k.c          |  9 +++------
> >  3 files changed, 9 insertions(+), 20 deletions(-)
> > 
> 
> I just bisected a crash while powering down a MIPS machine in QEMU to
> this change as commit 8044511d3893 ("mips: update_mmu_cache() can
> replace __update_tlb()") in linux-next. Unfortunately, I can still
> reproduce it with the existing fix you have for this change on the
> mailing list, which is present in next-20230614.
> 
> I can reproduce it with the GCC 13.1.0 on kernel.org [1].
> 
>   $ make -skj"$(nproc)" ARCH=mips CROSS_COMPILE=mips-linux- mrproper malta_defconfig vmlinux
> 
>   $ qemu-system-mipsel \
>       -display none \
>       -nodefaults \
>       -cpu 24Kf \
>       -machine malta \
>       -kernel vmlinux \
>       -initrd rootfs.cpio \
>       -m 512m \
>       -serial mon:stdio
>   ...
>   Linux version 6.4.0-rc6-next-20230614 (nathan@dev-arch.thelio-3990X) (mips-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jun 14 16:13:02 MST 2023
>   ...
>   Run /init as init process
>   process '/bin/busybox' started with executable stack
>   do_page_fault(): sending SIGSEGV to init for invalid read access from 0000003c
>   epc = 77b893dc in ld-uClibc-1.0.39.so[77b84000+8000]
>   ra  = 77b8930c in ld-uClibc-1.0.39.so[77b84000+8000]
>   Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
>   ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---
> 
> The rootfs is available at [2] if it is needed. I am more than happy to
> provide additional information or test patches if necessary.
> 
> [1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/
> [2]: https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230609-194440/mipsel-rootfs.cpio.zst

Seeing this on real h/w as well (just to confirm).

Linux version 6.4.0-rc4-00437-g4bab5c42a698 (root@yuzhao.bld.corp.google.com) (mips64el-linux-gnuabi64-gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40) #3 SMP PREEMPT Thu Jun 15 01:05:20 MDT 2023
Skipping L2 locking due to reduced L2 cache size
CVMSEG size: 2 cache lines (256 bytes)
printk: bootconsole [early0] enabled
CPU0 revision is: 000d9602 (Cavium Octeon III)
FPU revision is: 00739600
Kernel sections are not in the memory maps
Wasting 243712 bytes for tracking 4352 unused pages
Initrd not found or empty - disabling initrd
Using passed Device Tree.
software IO TLB: SWIOTLB bounce buffer size adjusted to 0MB
software IO TLB: area num 1.
software IO TLB: mapped [mem 0x000000000370d000-0x000000000374d000] (0MB)
Primary instruction cache 78kB, virtually tagged, 39 way, 16 sets, linesize 128 bytes.
Primary data cache 32kB, 32-way, 8 sets, linesize 128 bytes.
Zone ranges:
  DMA32    [mem 0x0000000001100000-0x00000000efffffff]
  Normal   empty
Movable zone start for each node
Early memory node ranges
  node   0: [mem 0x0000000001100000-0x0000000003646fff]
  node   0: [mem 0x0000000003700000-0x000000000fafffff]
  node   0: [mem 0x0000000020000000-0x000000004ebfffff]
Initmem setup node 0 [mem 0x0000000001100000-0x000000004ebfffff]
On node 0, zone DMA32: 4352 pages in unavailable ranges
On node 0, zone DMA32: 185 pages in unavailable ranges
On node 0, zone DMA32: 1280 pages in unavailable ranges
On node 0, zone DMA32: 5120 pages in unavailable ranges
percpu: Embedded 15 pages/cpu s24368 r8192 d28880 u61440
pcpu-alloc: s24368 r8192 d28880 u61440 alloc=15*4096
pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 
Kernel command line:  loglevel=8 console=ttyS0,115200
printk: log_buf_len individual max cpu contribution: 4096 bytes
printk: log_buf_len total cpu_extra contributions: 12288 bytes
printk: log_buf_len min size: 16384 bytes
printk: log_buf_len: 32768 bytes
printk: early log buf free: 14184(86%)
Dentry cache hash table entries: 131072 (order: 8, 1048576 bytes, linear)
Inode-cache hash table entries: 65536 (order: 7, 524288 bytes, linear)
Built 1 zonelists, mobility grouping on.  Total pages: 247772
mem auto-init: stack:all(zero), heap alloc:off, heap free:off
Memory: 950032K/1004828K available (8058K kernel code, 575K rwdata, 1880K rodata, 27488K init, 158K bss, 54796K reserved, 0K cma-reserved)
rcu: Preemptible hierarchical RCU implementation.
rcu: 	RCU event tracing is enabled.
rcu: 	RCU restricting CPUs from NR_CPUS=32 to nr_cpu_ids=4.
rcu: RCU calculated value of scheduler-enlistment delay is 10 jiffies.
rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=4
NR_IRQS: 512
CIB interrupt controller probed: 800107000000e000 23
CIB interrupt controller probed: 800107000000e200 12
CIB interrupt controller probed: 800107000000e400 6
CIB interrupt controller probed: 800107000000ec00 15
CIB interrupt controller probed: 800107000000e600 4
CIB interrupt controller probed: 800107000000e800 11
CIB interrupt controller probed: 800107000000e900 11
rcu: srcu_init: Setting srcu_struct sizes based on contention.
clocksource: OCTEON_CVMCOUNT: mask: 0xffffffffffffffff max_cycles: 0x1cd42e4dffb, max_idle_ns: 881590591483 ns
Calibrating delay loop (skipped) preset value.. 2000.00 BogoMIPS (lpj=10000000)
pid_max: default: 32768 minimum: 301
LSM: initializing lsm=capability,integrity
Mount-cache hash table entries: 2048 (order: 2, 16384 bytes, linear)
Mountpoint-cache hash table entries: 2048 (order: 2, 16384 bytes, linear)
rcu: Hierarchical SRCU implementation.
rcu: 	Max phase no-delay instances is 1000.
smp: Bringing up secondary CPUs ...
SMP: Booting CPU01 (CoreId  1)...
CPU1 revision is: 000d9602 (Cavium Octeon III)
FPU revision is: 00739600
SMP: Booting CPU02 (CoreId  2)...
CPU2 revision is: 000d9602 (Cavium Octeon III)
FPU revision is: 00739600
SMP: Booting CPU03 (CoreId  3)...
CPU3 revision is: 000d9602 (Cavium Octeon III)
FPU revision is: 00739600
smp: Brought up 1 node, 4 CPUs
devtmpfs: initialized
clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 19112604462750000 ns
futex hash table entries: 1024 (order: 5, 131072 bytes, linear)
NET: Registered PF_NETLINK/PF_ROUTE protocol family
PCIe: Initializing port 0
PCIe: BIST2 FAILED for port 0 (0x0000000000000003)
PCIe: Link timeout on port 0, probably the slot is empty
PCIe: Initializing port 1
PCIe: BIST FAILED for port 1 (0xffffffffffffffff)
PCIe: Link timeout on port 1, probably the slot is empty
HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
SCSI subsystem initialized
libata version 3.00 loaded.
usbcore: registered new interface driver usbfs
usbcore: registered new interface driver hub
usbcore: registered new device driver usb
pps_core: LinuxPPS API ver. 1 registered
pps_core: Software ver. 5.3.6 - Copyright 2005-2007 Rodolfo Giometti <giometti@linux.it>
PTP clock support registered
EDAC MC: Ver: 3.0.0
PCI host bridge to bus 0000:00
pci_bus 0000:00: root bus resource [mem 0x1000000000000]
pci_bus 0000:00: root bus resource [io  0x0000]
pci_bus 0000:00: No busn resource found for root bus, will use [bus 00-ff]
pci_bus 0000:00: busn_res: [bus 00-ff] end is updated to 00
vgaarb: loaded
clocksource: Switched to clocksource OCTEON_CVMCOUNT
NET: Registered PF_INET protocol family
IP idents hash table entries: 16384 (order: 5, 131072 bytes, linear)
tcp_listen_portaddr_hash hash table entries: 512 (order: 1, 8192 bytes, linear)
Table-perturb hash table entries: 65536 (order: 6, 262144 bytes, linear)
TCP established hash table entries: 8192 (order: 4, 65536 bytes, linear)
TCP bind hash table entries: 8192 (order: 6, 262144 bytes, linear)
TCP: Hash tables configured (established 8192 bind 8192)
UDP hash table entries: 512 (order: 2, 16384 bytes, linear)
UDP-Lite hash table entries: 512 (order: 2, 16384 bytes, linear)
NET: Registered PF_UNIX/PF_LOCAL protocol family
RPC: Registered named UNIX socket transport module.
RPC: Registered udp transport module.
RPC: Registered tcp transport module.
RPC: Registered tcp NFSv4.1 backchannel transport module.
PCI: CLS 0 bytes, default 128
platform 1180068000000.uctl: clocks initialized.
platform 1180069000000.uctl: clocks initialized.
Starting KVM with MIPS VZ extensions
workingset: timestamp_bits=62 max_order=18 bucket_order=0
NFS: Registering the id_resolver key type
Key type id_resolver registered
Key type id_legacy registered
nfs4filelayout_init: NFSv4 File Layout Driver Registering...
nfs4flexfilelayout_init: NFSv4 Flexfile Layout Driver Registering...
io scheduler mq-deadline registered
io scheduler kyber registered
io scheduler bfq registered
gpio gpiochip0: Static allocation of GPIO base is deprecated, use dynamic allocation.
octeon_gpio 1070000000800.gpio-controller: OCTEON GPIO driver probed.
Serial: 8250/16550 driver, 2 ports, IRQ sharing disabled
printk: console [ttyS0] disabled
1180000000800.serial: ttyS0 at MMIO 0x1180000000800 (irq = 34, base_baud = 25000000) is a OCTEON
printk: console [ttyS0] enabled
printk: console [ttyS0] enabled
printk: bootconsole [early0] disabled
printk: bootconsole [early0] disabled
1180000000c00.serial: ttyS1 at MMIO 0x1180000000c00 (irq = 35, base_baud = 25000000) is a OCTEON
loop: module loaded
Driver 'pata_octeon_cf' needs updating - please use bus_type methods
slram: not enough parameters.
spi-octeon 1070000001000.spi: OCTEON SPI bus driver
process '/bin/kmod' started with executable stack
do_page_fault(): sending SIGSEGV to modprobe for invalid read access from 0000000000000298
epc = 000000fff3346470 in ld.so.1[fff3328000+2e000]
ra  = 000000fff33456d0 in ld.so.1[fff3328000+2e000]
do_page_fault(): sending SIGSEGV to modprobe for invalid read access from 0000000000000298
epc = 000000fff3c78470 in ld.so.1[fff3c5a000+2e000]
ra  = 000000fff3c776d0 in ld.so.1[fff3c5a000+2e000]
do_page_fault(): sending SIGSEGV to modprobe for invalid read access from 0000000000021da8
epc = 000000fff35aa2c0 in ld.so.1[fff358d000+2e000]
ra  = 000000fff35aa688 in ld.so.1[fff358d000+2e000]
do_page_fault(): sending SIGSEGV to modprobe for invalid read access from 0000000000000298
epc = 000000fff34cc470 in ld.so.1[fff34ae000+2e000]
ra  = 000000fff34cb6d0 in ld.so.1[fff34ae000+2e000]
mdio_octeon 1180000001800.mdio: Probed
mdio_octeon 1180000001900.mdio: Probed
dwc3 1680000000000.xhci: Configuration mismatch. dr_mode forced to host
dwc3 1690000000000.xhci: Configuration mismatch. dr_mode forced to host
xhci-hcd xhci-hcd.0.auto: xHCI Host Controller
xhci-hcd xhci-hcd.0.auto: new USB bus registered, assigned bus number 1
xhci-hcd xhci-hcd.0.auto: hcc params 0x0220f06d hci version 0x100 quirks 0x0000000002010010
xhci-hcd xhci-hcd.0.auto: irq 25, io mem 0x1680000000000
dwc3 1680000000000.xhci: xhci_plat_probe get usb3phy fail (ret=-6)
xhci-hcd xhci-hcd.0.auto: xHCI Host Controller
xhci-hcd xhci-hcd.0.auto: new USB bus registered, assigned bus number 2
xhci-hcd xhci-hcd.0.auto: Host supports USB 3.0 SuperSpeed
hub 1-0:1.0: USB hub found
hub 1-0:1.0: 1 port detected
usb usb2: We don't know the algorithms for LPM for this host, disabling LPM.
hub 2-0:1.0: USB hub found
hub 2-0:1.0: 1 port detected
xhci-hcd xhci-hcd.1.auto: xHCI Host Controller
xhci-hcd xhci-hcd.1.auto: new USB bus registered, assigned bus number 3
xhci-hcd xhci-hcd.1.auto: hcc params 0x0220f06d hci version 0x100 quirks 0x0000000002010010
xhci-hcd xhci-hcd.1.auto: irq 26, io mem 0x1690000000000
dwc3 1690000000000.xhci: xhci_plat_probe get usb3phy fail (ret=-6)
xhci-hcd xhci-hcd.1.auto: xHCI Host Controller
xhci-hcd xhci-hcd.1.auto: new USB bus registered, assigned bus number 4
xhci-hcd xhci-hcd.1.auto: Host supports USB 3.0 SuperSpeed
hub 3-0:1.0: USB hub found
hub 3-0:1.0: 1 port detected
usb usb4: We don't know the algorithms for LPM for this host, disabling LPM.
hub 4-0:1.0: USB hub found
hub 4-0:1.0: 1 port detected
usbcore: registered new interface driver usb-storage
i2c-octeon 1180000001000.i2c: probed
i2c-octeon 1180000001200.i2c: probed
octeon_wdt: Initial granularity 5 Sec
EDAC DEVICE0: Giving out device to module octeon-cpu controller cache: DEV octeon_pc_edac (INTERRUPT)
EDAC DEVICE1: Giving out device to module octeon-l2c controller octeon_l2c_err: DEV octeon_l2c_edac (POLLED)
octeon_lmc_edac octeon_lmc_edac.0: Disabled (ECC not enabled)
Interface 0 has 4 ports (SGMII)
Interface 1 has 4 ports (SGMII)
Interface 3 has 4 ports (LOOP)
NET: Registered PF_INET6 protocol family
Segment Routing with IPv6
In-situ OAM (IOAM) with IPv6
sit: IPv6, IPv4 and MPLS over IPv4 tunneling driver
NET: Registered PF_PACKET protocol family
Key type dns_resolver registered
OF: fdt: not creating '/sys/firmware/fdt': CRC check failed
Freeing unused kernel image (initmem) memory: 27488K
This architecture does not have kernel memory protection.
Run /init as init process
  with arguments:
    /init
  with environment:
    HOME=/
    TERM=linux
do_page_fault(): sending SIGSEGV to init for invalid read access from 0000000000021da8
epc = 000000fff3a542c0 in ld.so.1[fff3a37000+2e000]
ra  = 000000fff3a54688 in ld.so.1[fff3a37000+2e000]
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---
  

Patch

diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 574fa14ac8b2..9175dfab08d5 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -565,15 +565,8 @@  static inline pte_t pte_swp_clear_exclusive(pte_t pte)
 }
 #endif
 
-extern void __update_tlb(struct vm_area_struct *vma, unsigned long address,
-	pte_t pte);
-
-static inline void update_mmu_cache(struct vm_area_struct *vma,
-	unsigned long address, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	__update_tlb(vma, address, pte);
-}
+extern void update_mmu_cache(struct vm_area_struct *vma,
+	unsigned long address, pte_t *ptep);
 
 #define	__HAVE_ARCH_UPDATE_MMU_TLB
 #define update_mmu_tlb	update_mmu_cache
@@ -581,9 +574,7 @@  static inline void update_mmu_cache(struct vm_area_struct *vma,
 static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
 	unsigned long address, pmd_t *pmdp)
 {
-	pte_t pte = *(pte_t *)pmdp;
-
-	__update_tlb(vma, address, pte);
+	update_mmu_cache(vma, address, (pte_t *)pmdp);
 }
 
 /*
diff --git a/arch/mips/mm/tlb-r3k.c b/arch/mips/mm/tlb-r3k.c
index 53dfa2b9316b..e5722cd8dd6d 100644
--- a/arch/mips/mm/tlb-r3k.c
+++ b/arch/mips/mm/tlb-r3k.c
@@ -176,7 +176,8 @@  void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 	}
 }
 
-void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+void update_mmu_cache(struct vm_area_struct *vma,
+		      unsigned long address, pte_t *ptep)
 {
 	unsigned long asid_mask = cpu_asid_mask(&current_cpu_data);
 	unsigned long flags;
@@ -203,7 +204,7 @@  void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 	BARRIER;
 	tlb_probe();
 	idx = read_c0_index();
-	write_c0_entrylo0(pte_val(pte));
+	write_c0_entrylo0(pte_val(*ptep));
 	write_c0_entryhi(address | pid);
 	if (idx < 0) {					/* BARRIER */
 		tlb_write_random();
diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c
index 1b939abbe4ca..c96725d17cab 100644
--- a/arch/mips/mm/tlb-r4k.c
+++ b/arch/mips/mm/tlb-r4k.c
@@ -290,14 +290,14 @@  void local_flush_tlb_one(unsigned long page)
  * updates the TLB with the new pte(s), and another which also checks
  * for the R4k "end of page" hardware bug and does the needy.
  */
-void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
+void update_mmu_cache(struct vm_area_struct *vma,
+		      unsigned long address, pte_t *ptep)
 {
 	unsigned long flags;
 	pgd_t *pgdp;
 	p4d_t *p4dp;
 	pud_t *pudp;
 	pmd_t *pmdp;
-	pte_t *ptep;
 	int idx, pid;
 
 	/*
@@ -326,10 +326,9 @@  void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
 	idx = read_c0_index();
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
 	/* this could be a huge page  */
-	if (pmd_huge(*pmdp)) {
+	if (ptep == (pte_t *)pmdp) {
 		unsigned long lo;
 		write_c0_pagemask(PM_HUGE_MASK);
-		ptep = (pte_t *)pmdp;
 		lo = pte_to_entrylo(pte_val(*ptep));
 		write_c0_entrylo0(lo);
 		write_c0_entrylo1(lo + (HPAGE_SIZE >> 7));
@@ -344,8 +343,6 @@  void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte)
 	} else
 #endif
 	{
-		ptep = pte_offset_map(pmdp, address);
-
 #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
 #ifdef CONFIG_XPA
 		write_c0_entrylo0(pte_to_entrylo(ptep->pte_high));