[v2] mm: introduce arch_has_hw_nonleaf_pmd_young()

Message ID 20221123064510.16225-1-jgross@suse.com
State New
Headers
Series [v2] mm: introduce arch_has_hw_nonleaf_pmd_young() |

Commit Message

Juergen Gross Nov. 23, 2022, 6:45 a.m. UTC
  When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
in pmdp_test_and_clear_young():

 BUG: unable to handle page fault for address: ffff8880083374d0
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0003) - permissions violation
 PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
 Oops: 0003 [#1] PREEMPT SMP NOPTI
 CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
 RIP: e030:pmdp_test_and_clear_young+0x25/0x40

This happens because the Xen hypervisor can't emulate direct writes to
page table entries other than PTEs.

This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young(),
similar to arch_has_hw_pte_young(), and testing that instead of
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.

Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
---
V2:
- correct function name in commit message to match patch
---
 arch/x86/include/asm/pgtable.h |  8 ++++++++
 include/linux/pgtable.h        | 11 +++++++++++
 mm/vmscan.c                    | 10 +++++-----
 3 files changed, 24 insertions(+), 5 deletions(-)
  

Comments

David Hildenbrand Nov. 23, 2022, 9:31 a.m. UTC | #1
On 23.11.22 07:45, Juergen Gross wrote:
> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
> in pmdp_test_and_clear_young():
> 
>   BUG: unable to handle page fault for address: ffff8880083374d0
>   #PF: supervisor write access in kernel mode
>   #PF: error_code(0x0003) - permissions violation
>   PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
>   Oops: 0003 [#1] PREEMPT SMP NOPTI
>   CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
>   RIP: e030:pmdp_test_and_clear_young+0x25/0x40
> 
> This happens because the Xen hypervisor can't emulate direct writes to
> page table entries other than PTEs.
> 
> This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
> similar to arch_has_hw_pte_young() and test that instead of
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
> 
> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
> Signed-off-by: Juergen Gross <jgross@suse.com>
> Acked-by: Yu Zhao <yuzhao@google.com>
> Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
> ---
> V2:
> - correct function name in commit message to match patch
> ---
>   arch/x86/include/asm/pgtable.h |  8 ++++++++
>   include/linux/pgtable.h        | 11 +++++++++++
>   mm/vmscan.c                    | 10 +++++-----
>   3 files changed, 24 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 5059799bebe3..c567a6ed17ce 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1438,6 +1438,14 @@ static inline bool arch_has_hw_pte_young(void)
>   	return true;
>   }
>   
> +#ifdef CONFIG_XEN_PV
> +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
> +{
> +	return !cpu_feature_enabled(X86_FEATURE_XENPV);
> +}
> +#endif
> +
>   #ifdef CONFIG_PAGE_TABLE_CHECK
>   static inline bool pte_user_accessible_page(pte_t pte)
>   {
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index a108b60a6962..58fc7e2d9575 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -260,6 +260,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
>   #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>   #endif
>   
> +#ifndef arch_has_hw_nonleaf_pmd_young
> +/*
> + * Return whether the accessed bit in non-leaf PMD entries is supported on the
> + * local CPU.
> + */
> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
> +{
> +	return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
> +}
> +#endif
> +
>   #ifndef arch_has_hw_pte_young
>   /*
>    * Return whether the accessed bit is supported on the local CPU.
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 04d8b88e5216..a04ac3b18326 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3975,7 +3975,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
>   			goto next;
>   
>   		if (!pmd_trans_huge(pmd[i])) {
> -			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
> +			if (arch_has_hw_nonleaf_pmd_young() &&
>   			    get_cap(LRU_GEN_NONLEAF_YOUNG))
>   				pmdp_test_and_clear_young(vma, addr, pmd + i);
>   			goto next;
> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
>   #endif
>   		walk->mm_stats[MM_NONLEAF_TOTAL]++;
>   
> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> -		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> +		if (arch_has_hw_nonleaf_pmd_young() &&
> +		    get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>   			if (!pmd_young(val))
>   				continue;
>   
>   			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
>   		}
> -#endif
> +
>   		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
>   			continue;
>   
> @@ -5354,7 +5354,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
>   	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
>   		caps |= BIT(LRU_GEN_MM_WALK);
>   
> -	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
> +	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
>   		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
>   
>   	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);

Acked-by: David Hildenbrand <david@redhat.com> # core changes
  
Geert Uytterhoeven Nov. 24, 2022, 2:08 p.m. UTC | #2
Hi Jürgen,

On Wed, Nov 23, 2022 at 7:53 AM Juergen Gross <jgross@suse.com> wrote:
> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
> in pmdp_test_and_clear_young():
>
>  BUG: unable to handle page fault for address: ffff8880083374d0
>  #PF: supervisor write access in kernel mode
>  #PF: error_code(0x0003) - permissions violation
>  PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
>  Oops: 0003 [#1] PREEMPT SMP NOPTI
>  CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
>  RIP: e030:pmdp_test_and_clear_young+0x25/0x40
>
> This happens because the Xen hypervisor can't emulate direct writes to
> page table entries other than PTEs.
>
> This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
> similar to arch_has_hw_pte_young() and test that instead of
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
>
> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
> Signed-off-by: Juergen Gross <jgross@suse.com>
> Acked-by: Yu Zhao <yuzhao@google.com>
> Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
> ---
> V2:
> - correct function name in commit message to match patch

Thanks for your patch, which is now commit 3f85e711d5af4fb4 ("mm:
introduce arch_has_hw_nonleaf_pmd_young()") in next-20221124.

noreply@ellerman.id.au reported a build failure for m68k/allmodconfig,
which I have bisected to this commit.

> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c

> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
>  #endif
>                 walk->mm_stats[MM_NONLEAF_TOTAL]++;
>
> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> -               if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> +               if (arch_has_hw_nonleaf_pmd_young() &&
> +                   get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>                         if (!pmd_young(val))

mm/vmscan.c:4102:30: error: implicit declaration of function
'pmd_young'; did you mean 'pte_young'?
[-Werror=implicit-function-declaration]

pmd_young() seems to be defined only on a handful of architectures.

>                                 continue;
>
>                         walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
>                 }
> -#endif
> +
>                 if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
>                         continue;
>

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
  
Juergen Gross Nov. 24, 2022, 2:30 p.m. UTC | #3
Hi,

On 24.11.22 15:08, Geert Uytterhoeven wrote:
> Hi Jürgen,
> 
> On Wed, Nov 23, 2022 at 7:53 AM Juergen Gross <jgross@suse.com> wrote:
>> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
>> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
>> in pmdp_test_and_clear_young():
>>
>>   BUG: unable to handle page fault for address: ffff8880083374d0
>>   #PF: supervisor write access in kernel mode
>>   #PF: error_code(0x0003) - permissions violation
>>   PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
>>   Oops: 0003 [#1] PREEMPT SMP NOPTI
>>   CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
>>   RIP: e030:pmdp_test_and_clear_young+0x25/0x40
>>
>> This happens because the Xen hypervisor can't emulate direct writes to
>> page table entries other than PTEs.
>>
>> This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
>> similar to arch_has_hw_pte_young() and test that instead of
>> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
>>
>> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
>> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> Acked-by: Yu Zhao <yuzhao@google.com>
>> Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
>> ---
>> V2:
>> - correct function name in commit message to match patch
> 
> Thanks for your patch, which is now commit 3f85e711d5af4fb4 ("mm:
> introduce arch_has_hw_nonleaf_pmd_young()") in next-20221124.
> 
> noreply@ellerman.id.au reported a build failure for m68k/allmodconfig,
> which I have bisected to this commit.
> 
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
> 
>> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
>>   #endif
>>                  walk->mm_stats[MM_NONLEAF_TOTAL]++;
>>
>> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
>> -               if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>> +               if (arch_has_hw_nonleaf_pmd_young() &&
>> +                   get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>>                          if (!pmd_young(val))
> 
> mm/vmscan.c:4102:30: error: implicit declaration of function
> 'pmd_young'; did you mean 'pte_young'?
> [-Werror=implicit-function-declaration]
> 
> pmd_young() seems to be defined only on a handful of architectures.

What would be the preferred fix for that?

I could offer:

- use V1 of the patch
- add the #ifdefs again to this patch (which would be kind of weird)
- use the attached patch


Juergen
  
Yu Zhao Nov. 24, 2022, 9:26 p.m. UTC | #4
On Thu, Nov 24, 2022 at 7:30 AM Juergen Gross <jgross@suse.com> wrote:
>
> Hi,
>
> On 24.11.22 15:08, Geert Uytterhoeven wrote:
> > Hi Jürgen,
> >
> > On Wed, Nov 23, 2022 at 7:53 AM Juergen Gross <jgross@suse.com> wrote:
> >> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
> >> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
> >> in pmdp_test_and_clear_young():
> >>
> >>   BUG: unable to handle page fault for address: ffff8880083374d0
> >>   #PF: supervisor write access in kernel mode
> >>   #PF: error_code(0x0003) - permissions violation
> >>   PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
> >>   Oops: 0003 [#1] PREEMPT SMP NOPTI
> >>   CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
> >>   RIP: e030:pmdp_test_and_clear_young+0x25/0x40
> >>
> >> This happens because the Xen hypervisor can't emulate direct writes to
> >> page table entries other than PTEs.
> >>
> >> This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
> >> similar to arch_has_hw_pte_young() and test that instead of
> >> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
> >>
> >> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
> >> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
> >> Signed-off-by: Juergen Gross <jgross@suse.com>
> >> Acked-by: Yu Zhao <yuzhao@google.com>
> >> Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
> >> ---
> >> V2:
> >> - correct function name in commit message to match patch
> >
> > Thanks for your patch, which is now commit 3f85e711d5af4fb4 ("mm:
> > introduce arch_has_hw_nonleaf_pmd_young()") in next-20221124.
> >
> > noreply@ellerman.id.au reported a build failure for m68k/allmodconfig,
> > which I have bisected to this commit.
> >
> >> --- a/mm/vmscan.c
> >> +++ b/mm/vmscan.c
> >
> >> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
> >>   #endif
> >>                  walk->mm_stats[MM_NONLEAF_TOTAL]++;
> >>
> >> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> >> -               if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> >> +               if (arch_has_hw_nonleaf_pmd_young() &&
> >> +                   get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> >>                          if (!pmd_young(val))
> >
> > mm/vmscan.c:4102:30: error: implicit declaration of function
> > 'pmd_young'; did you mean 'pte_young'?
> > [-Werror=implicit-function-declaration]
> >
> > pmd_young() seems to be defined only on a handful of architectures.
>
> What would be the preferred fix for that?
>
> I could offer:
>
> - use V1 of the patch
> - add the #ifdefs again to this patch (which would be kind of weird)
> - use the attached patch

Your patch looks good to me:

Acked-by: Yu Zhao <yuzhao@google.com>

Thanks.
  

Patch

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5059799bebe3..c567a6ed17ce 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1438,6 +1438,14 @@  static inline bool arch_has_hw_pte_young(void)
 	return true;
 }
 
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+	return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
 #ifdef CONFIG_PAGE_TABLE_CHECK
 static inline bool pte_user_accessible_page(pte_t pte)
 {
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a108b60a6962..58fc7e2d9575 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -260,6 +260,17 @@  static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 
+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+	return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
 #ifndef arch_has_hw_pte_young
 /*
  * Return whether the accessed bit is supported on the local CPU.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 04d8b88e5216..a04ac3b18326 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3975,7 +3975,7 @@  static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+			if (arch_has_hw_nonleaf_pmd_young() &&
 			    get_cap(LRU_GEN_NONLEAF_YOUNG))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
@@ -4073,14 +4073,14 @@  static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 #endif
 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
 
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+		if (arch_has_hw_nonleaf_pmd_young() &&
+		    get_cap(LRU_GEN_NONLEAF_YOUNG)) {
 			if (!pmd_young(val))
 				continue;
 
 			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
 		}
-#endif
+
 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
 			continue;
 
@@ -5354,7 +5354,7 @@  static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
 		caps |= BIT(LRU_GEN_MM_WALK);
 
-	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+	if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
 		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
 	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);