[RFC,v1,1/4] mm: Introduce ptep_get_lockless_norecency()
Commit Message
With the introduction of contpte mapping support for arm64, that
architecture's implementation of ptep_get_lockless() has become very
complex due to the need to gather access and dirty bits from across all
of the ptes in the contpte block. This requires careful implementation
to ensure the returned value is consistent (because it's not possible to
read all ptes atomically), but even in the common case when there is no
racing modification, we have to read all ptes, which gives an ~O(n^2)
cost if the core-mm is iterating over a range, and performing a
ptep_get_lockless() on each pte.
Solve this by introducing ptep_get_lockless_norecency(), which does not
make any guarantees about access and dirty bits. Therefore it can simply
read the single target pte.
At the same time, convert all call sites that previously used
ptep_get_lockless() but don't care about access and dirty state.
We may want to do something similar for ptep_get() (i.e.
ptep_get_norecency()) in future; it doesn't suffer from the consistency
problem because the PTL serializes it with any modifications, but does
suffer the same O(n^2) cost.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
include/linux/pgtable.h | 37 ++++++++++++++++++++++++++++++++++---
kernel/events/core.c | 2 +-
mm/hugetlb.c | 2 +-
mm/khugepaged.c | 2 +-
mm/memory.c | 2 +-
mm/swap_state.c | 2 +-
mm/swapfile.c | 2 +-
7 files changed, 40 insertions(+), 9 deletions(-)
--
2.25.1
@@ -528,16 +528,47 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
#ifndef ptep_get_lockless
+/**
+ * ptep_get_lockless - Get a pte without holding the page table lock.
+ * @ptep: Page table pointer for pte to get.
+ *
+ * The young and dirty bits are guaranteed to accurately reflect the state
+ * of the pte at the time of the call. If young and dirty information is
+ * not required, use ptep_get_lockless_norecency() which can be faster on
+ * some architectures.
+ *
+ * May be overridden by the architecture; otherwise, implemented using
+ * ptep_get(), on the assumption that it is atomic.
+ *
+ * Context: Any.
+ */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
	return ptep_get(ptep);
}
#endif
+#ifndef ptep_get_lockless_norecency
+/**
+ * ptep_get_lockless_norecency - Get a pte without holding the page table lock.
+ * @ptep: Page table pointer for pte to get.
+ *
+ * The returned young and dirty bits may not be accurate. Prefer this over
+ * ptep_get_lockless() when young and dirty information is not required
+ * since it can be faster on some architectures.
+ *
+ * May be overridden by the architecture; otherwise, implemented using the
+ * more precise ptep_get_lockless().
+ *
+ * Context: Any.
+ */
+static inline pte_t ptep_get_lockless_norecency(pte_t *ptep)
+{
+	return ptep_get_lockless(ptep);
+}
+#endif
+
#ifndef pmdp_get_lockless
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
@@ -7583,7 +7583,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
if (!ptep)
goto again;
- pte = ptep_get_lockless(ptep);
+ pte = ptep_get_lockless_norecency(ptep);
if (pte_present(pte))
size = pte_leaf_size(pte);
pte_unmap(ptep);
@@ -7517,7 +7517,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (pte) {
- pte_t pteval = ptep_get_lockless(pte);
+ pte_t pteval = ptep_get_lockless_norecency(pte);
BUG_ON(pte_present(pteval) && !pte_huge(pteval));
}
@@ -1019,7 +1019,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
}
}
- vmf.orig_pte = ptep_get_lockless(pte);
+ vmf.orig_pte = ptep_get_lockless_norecency(pte);
if (!is_swap_pte(vmf.orig_pte))
continue;
@@ -4353,7 +4353,7 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
int i;
for (i = 0; i < nr_pages; i++) {
- if (!pte_none(ptep_get_lockless(pte + i)))
+ if (!pte_none(ptep_get_lockless_norecency(pte + i)))
return false;
}
@@ -837,7 +837,7 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
if (!pte)
break;
}
- pentry = ptep_get_lockless(pte);
+ pentry = ptep_get_lockless_norecency(pte);
if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
@@ -1857,7 +1857,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
break;
}
- ptent = ptep_get_lockless(pte);
+ ptent = ptep_get_lockless_norecency(pte);
if (!is_swap_pte(ptent))
continue;