mm: introduce arch_has_hw_pmd_young()
Commit Message
When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
in pmdp_test_and_clear_young():
BUG: unable to handle page fault for address: ffff8880083374d0
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
Oops: 0003 [#1] PREEMPT SMP NOPTI
CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
RIP: e030:pmdp_test_and_clear_young+0x25/0x40
This happens because the Xen hypervisor can't emulate direct writes to
page table entries other than PTEs.
This can easily be fixed by introducing arch_has_hw_pmd_young(),
similar to arch_has_hw_pte_young(), and testing that instead of
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Juergen Gross <jgross@suse.com>
---
arch/x86/include/asm/pgtable.h | 8 ++++++++
include/linux/pgtable.h | 11 +++++++++++
mm/vmscan.c | 10 +++++-----
3 files changed, 24 insertions(+), 5 deletions(-)
Comments
On 21.11.22 10:32, Juergen Gross wrote:
> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
> in pmdp_test_and_clear_young():
>
> BUG: unable to handle page fault for address: ffff8880083374d0
> #PF: supervisor write access in kernel mode
> #PF: error_code(0x0003) - permissions violation
> PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
> Oops: 0003 [#1] PREEMPT SMP NOPTI
> CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
> RIP: e030:pmdp_test_and_clear_young+0x25/0x40
>
> This happens because the Xen hypervisor can't emulate direct writes to
> page table entries other than PTEs.
>
> This can easily be fixed by introducing arch_has_hw_pmd_young()
> similar to arch_has_hw_pte_young() and test that instead of
> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
I just noticed that I forgot to update my commit message to match the
patch:
s/arch_has_hw_pmd_young/arch_has_hw_nonleaf_pmd_young/
I'll hold off on resending until I have gotten some feedback.
Juergen
>
> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
> Signed-off-by: Juergen Gross <jgross@suse.com>
> ---
> arch/x86/include/asm/pgtable.h | 8 ++++++++
> include/linux/pgtable.h | 11 +++++++++++
> mm/vmscan.c | 10 +++++-----
> 3 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 5059799bebe3..c567a6ed17ce 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1438,6 +1438,14 @@ static inline bool arch_has_hw_pte_young(void)
> return true;
> }
>
> +#ifdef CONFIG_XEN_PV
> +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
> +{
> + return !cpu_feature_enabled(X86_FEATURE_XENPV);
> +}
> +#endif
> +
> #ifdef CONFIG_PAGE_TABLE_CHECK
> static inline bool pte_user_accessible_page(pte_t pte)
> {
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index a108b60a6962..58fc7e2d9575 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -260,6 +260,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> #endif
>
> +#ifndef arch_has_hw_nonleaf_pmd_young
> +/*
> + * Return whether the accessed bit in non-leaf PMD entries is supported on the
> + * local CPU.
> + */
> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
> +{
> + return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
> +}
> +#endif
> +
> #ifndef arch_has_hw_pte_young
> /*
> * Return whether the accessed bit is supported on the local CPU.
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 04d8b88e5216..a04ac3b18326 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3975,7 +3975,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
> goto next;
>
> if (!pmd_trans_huge(pmd[i])) {
> - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
> + if (arch_has_hw_nonleaf_pmd_young() &&
> get_cap(LRU_GEN_NONLEAF_YOUNG))
> pmdp_test_and_clear_young(vma, addr, pmd + i);
> goto next;
> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
> #endif
> walk->mm_stats[MM_NONLEAF_TOTAL]++;
>
> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
> - if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> + if (arch_has_hw_nonleaf_pmd_young() &&
> + get_cap(LRU_GEN_NONLEAF_YOUNG)) {
> if (!pmd_young(val))
> continue;
>
> walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
> }
> -#endif
> +
> if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
> continue;
>
> @@ -5354,7 +5354,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
> if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
> caps |= BIT(LRU_GEN_MM_WALK);
>
> - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
> + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
> caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
>
> return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
On Mon, Nov 21, 2022 at 2:50 AM Juergen Gross <jgross@suse.com> wrote:
>
> On 21.11.22 10:32, Juergen Gross wrote:
> > When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
> > CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
> > in pmdp_test_and_clear_young():
> >
> > BUG: unable to handle page fault for address: ffff8880083374d0
> > #PF: supervisor write access in kernel mode
> > #PF: error_code(0x0003) - permissions violation
> > PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
> > Oops: 0003 [#1] PREEMPT SMP NOPTI
> > CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
> > RIP: e030:pmdp_test_and_clear_young+0x25/0x40
> >
> > This happens because the Xen hypervisor can't emulate direct writes to
> > page table entries other than PTEs.
> >
> > This can easily be fixed by introducing arch_has_hw_pmd_young()
> > similar to arch_has_hw_pte_young() and test that instead of
> > CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
>
> I just spotted that I missed to update my commit message to match the
> patch:
>
> s/arch_has_hw_pmd_young/arch_has_hw_nonleaf_pmd_young/
>
> I'll wait with a resend in order to get some feedback first.
Thanks. For the next spin:
Acked-by: Yu Zhao <yuzhao@google.com>
On 21/11/2022 10:50, Juergen Gross wrote:
> On 21.11.22 10:32, Juergen Gross wrote:
>> When running as a Xen PV guests commit eed9a328aa1a ("mm: x86: add
>> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation
>> in pmdp_test_and_clear_young():
>>
>> BUG: unable to handle page fault for address: ffff8880083374d0
>> #PF: supervisor write access in kernel mode
>> #PF: error_code(0x0003) - permissions violation
>> PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
>> Oops: 0003 [#1] PREEMPT SMP NOPTI
>> CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
>> RIP: e030:pmdp_test_and_clear_young+0x25/0x40
>>
>> This happens because the Xen hypervisor can't emulate direct writes to
>> page table entries other than PTEs.
>>
>> This can easily be fixed by introducing arch_has_hw_pmd_young()
>> similar to arch_has_hw_pte_young() and test that instead of
>> CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.
>
> I just spotted that I missed to update my commit message to match the
> patch:
>
> s/arch_has_hw_pmd_young/arch_has_hw_nonleaf_pmd_young/
>
> I'll wait with a resend in order to get some feedback first.
>
>
> Juergen
Hi Juergen,
Thanks for the patch. I had it running overnight and did some kernel
compiles. No crashes with this patch so far, so it seems to work for me.
--
Sander
>>
>> Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
>> Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
>> Signed-off-by: Juergen Gross <jgross@suse.com>
>> ---
>> arch/x86/include/asm/pgtable.h | 8 ++++++++
>> include/linux/pgtable.h | 11 +++++++++++
>> mm/vmscan.c | 10 +++++-----
>> 3 files changed, 24 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
>> index 5059799bebe3..c567a6ed17ce 100644
>> --- a/arch/x86/include/asm/pgtable.h
>> +++ b/arch/x86/include/asm/pgtable.h
>> @@ -1438,6 +1438,14 @@ static inline bool arch_has_hw_pte_young(void)
>> return true;
>> }
>>
>> +#ifdef CONFIG_XEN_PV
>> +#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
>> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
>> +{
>> + return !cpu_feature_enabled(X86_FEATURE_XENPV);
>> +}
>> +#endif
>> +
>> #ifdef CONFIG_PAGE_TABLE_CHECK
>> static inline bool pte_user_accessible_page(pte_t pte)
>> {
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index a108b60a6962..58fc7e2d9575 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -260,6 +260,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
>> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>> #endif
>>
>> +#ifndef arch_has_hw_nonleaf_pmd_young
>> +/*
>> + * Return whether the accessed bit in non-leaf PMD entries is supported on the
>> + * local CPU.
>> + */
>> +static inline bool arch_has_hw_nonleaf_pmd_young(void)
>> +{
>> + return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
>> +}
>> +#endif
>> +
>> #ifndef arch_has_hw_pte_young
>> /*
>> * Return whether the accessed bit is supported on the local CPU.
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 04d8b88e5216..a04ac3b18326 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -3975,7 +3975,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
>> goto next;
>>
>> if (!pmd_trans_huge(pmd[i])) {
>> - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
>> + if (arch_has_hw_nonleaf_pmd_young() &&
>> get_cap(LRU_GEN_NONLEAF_YOUNG))
>> pmdp_test_and_clear_young(vma, addr, pmd + i);
>> goto next;
>> @@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
>> #endif
>> walk->mm_stats[MM_NONLEAF_TOTAL]++;
>>
>> -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
>> - if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>> + if (arch_has_hw_nonleaf_pmd_young() &&
>> + get_cap(LRU_GEN_NONLEAF_YOUNG)) {
>> if (!pmd_young(val))
>> continue;
>>
>> walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
>> }
>> -#endif
>> +
>> if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
>> continue;
>>
>> @@ -5354,7 +5354,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
>> if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
>> caps |= BIT(LRU_GEN_MM_WALK);
>>
>> - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
>> + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
>> caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
>>
>> return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
>
@@ -1438,6 +1438,14 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
#ifdef CONFIG_PAGE_TABLE_CHECK
static inline bool pte_user_accessible_page(pte_t pte)
{
@@ -260,6 +260,17 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
#ifndef arch_has_hw_pte_young
/*
* Return whether the accessed bit is supported on the local CPU.
@@ -3975,7 +3975,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ if (arch_has_hw_nonleaf_pmd_young() &&
get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
@@ -4073,14 +4073,14 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
}
-#endif
+
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -5354,7 +5354,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
caps |= BIT(LRU_GEN_MM_WALK);
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);