[v2,04/10] mm/hugetlb: Move swap entry handling into vma lock when faulted
Commit Message
In hugetlb_fault(), there used to be a special path at the entrance to
handle swap entries using huge_pte_offset(). That is unsafe, because for
a pmd-sharable range huge_pte_offset() can access freed pgtables when
there is no lock to protect the pgtable from being freed after a pmd
unshare.

Here the simplest solution to make it safe is to move the swap handling
to after the vma lock is held. We may now need to take the fault mutex
on either migration or hwpoison entries (and also the vma lock, but that
one is really needed); however, neither of these is a hot path.

Note that the vma lock cannot be released in hugetlb_fault() when a
migration entry is detected, because in migration_entry_wait_huge() the
pgtable page will be used again (by taking the pgtable lock), so that
access also needs to be protected by the vma lock. Modify
migration_entry_wait_huge() so that it must be called with the vma read
lock held, and release the lock properly in
__migration_entry_wait_huge().
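
For reference, here is a condensed sketch of the resulting fault path,
assembled from the hunks in this patch (mapping/idx setup, the
present-pte path and error handling are elided, so this is not the
complete function):

    hash = hugetlb_fault_mutex_hash(mapping, idx);
    mutex_lock(&hugetlb_fault_mutex_table[hash]);

    /*
     * Take the vma read lock before the pte is ever inspected, so the
     * swap-entry checks below can no longer race with pmd unshare.
     */
    hugetlb_vma_lock_read(vma);
    ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
    ...
    entry = huge_ptep_get(ptep);
    if (!pte_present(entry)) {
        if (unlikely(is_hugetlb_entry_migration(entry))) {
            /*
             * Drop only the fault mutex; the vma read lock is
             * handed over to migration_entry_wait_huge(), which
             * releases it.
             */
            mutex_unlock(&hugetlb_fault_mutex_table[hash]);
            migration_entry_wait_huge(vma, ptep);
            return 0;
        } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
            ret = VM_FAULT_HWPOISON_LARGE |
                  VM_FAULT_SET_HINDEX(hstate_index(h));
        goto out_mutex;
    }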
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/linux/swapops.h | 6 ++++--
mm/hugetlb.c | 36 +++++++++++++++---------------------
mm/migrate.c | 25 +++++++++++++++++++++----
3 files changed, 40 insertions(+), 27 deletions(-)
Comments
On 12/7/22 12:30, Peter Xu wrote:
> In hugetlb_fault(), there used to be a special path at the entrance to
> handle swap entries using huge_pte_offset(). That is unsafe, because for
> a pmd-sharable range huge_pte_offset() can access freed pgtables when
> there is no lock to protect the pgtable from being freed after a pmd
> unshare.
>
> Here the simplest solution to make it safe is to move the swap handling
> to after the vma lock is held. We may now need to take the fault mutex
> on either migration or hwpoison entries (and also the vma lock, but that
> one is really needed); however, neither of these is a hot path.
>
> Note that the vma lock cannot be released in hugetlb_fault() when a
> migration entry is detected, because in migration_entry_wait_huge() the
> pgtable page will be used again (by taking the pgtable lock), so that
> access also needs to be protected by the vma lock. Modify
> migration_entry_wait_huge() so that it must be called with the vma read
> lock held, and release the lock properly in
> __migration_entry_wait_huge().
>
> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
> include/linux/swapops.h | 6 ++++--
> mm/hugetlb.c | 36 +++++++++++++++---------------------
> mm/migrate.c | 25 +++++++++++++++++++++----
> 3 files changed, 40 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index a70b5c3a68d7..b134c5eb75cb 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
> extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
> unsigned long address);
> #ifdef CONFIG_HUGETLB_PAGE
> -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
> +extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
> + pte_t *ptep, spinlock_t *ptl);
> extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
> #endif /* CONFIG_HUGETLB_PAGE */
> #else /* CONFIG_MIGRATION */
> @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
> static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
> unsigned long address) { }
> #ifdef CONFIG_HUGETLB_PAGE
> -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
> +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
> + pte_t *ptep, spinlock_t *ptl) { }
> static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
> #endif /* CONFIG_HUGETLB_PAGE */
> static inline int is_writable_migration_entry(swp_entry_t entry)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index c8a6673fe5b4..49f73677a418 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5824,22 +5824,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> int need_wait_lock = 0;
> unsigned long haddr = address & huge_page_mask(h);
>
> - ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> - if (ptep) {
> - /*
> - * Since we hold no locks, ptep could be stale. That is
> - * OK as we are only making decisions based on content and
> - * not actually modifying content here.
> - */
> - entry = huge_ptep_get(ptep);
> - if (unlikely(is_hugetlb_entry_migration(entry))) {
> - migration_entry_wait_huge(vma, ptep);
> - return 0;
> - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
> - return VM_FAULT_HWPOISON_LARGE |
> - VM_FAULT_SET_HINDEX(hstate_index(h));
> - }
> -
> /*
> * Serialize hugepage allocation and instantiation, so that we don't
> * get spurious allocation failures if two CPUs race to instantiate
> @@ -5854,10 +5838,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> * Acquire vma lock before calling huge_pte_alloc and hold
> * until finished with ptep. This prevents huge_pmd_unshare from
> * being called elsewhere and making the ptep no longer valid.
> - *
> - * ptep could have already be assigned via huge_pte_offset. That
> - * is OK, as huge_pte_alloc will return the same value unless
> - * something has changed.
> */
> hugetlb_vma_lock_read(vma);
> ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
> @@ -5886,8 +5866,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
> * properly handle it.
> */
> - if (!pte_present(entry))
> + if (!pte_present(entry)) {
> + if (unlikely(is_hugetlb_entry_migration(entry))) {
> + /*
> + * Release fault lock first because the vma lock is
> + * needed to guard the huge_pte_lockptr() later in
> + * migration_entry_wait_huge(). The vma lock will
> + * be released there.
> + */
> + mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> + migration_entry_wait_huge(vma, ptep);
> + return 0;
Oh, but now (and also one other, pre-existing case, above)
hugetlb_fault() is returning with the vma lock held. This is in contrast
with most of the rest of the function, which takes great care to release
locks before returning.
Which makes this new case really quite irregular and makes the overall
locking harder to follow. It would be ideal to avoid doing this! But at
the very least, there should be a little note above hugetlb_fault(),
explaining this deviation and how it fits in with the locking rules.
Do we really have to structure it this way, though?
thanks,
Hi, John,
Firstly, thanks for taking a look at the whole set.
On Wed, Dec 07, 2022 at 02:36:21PM -0800, John Hubbard wrote:
> > @@ -5886,8 +5866,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
> > * properly handle it.
> > */
> > - if (!pte_present(entry))
> > + if (!pte_present(entry)) {
> > + if (unlikely(is_hugetlb_entry_migration(entry))) {
> > + /*
> > + * Release fault lock first because the vma lock is
> > + * needed to guard the huge_pte_lockptr() later in
> > + * migration_entry_wait_huge(). The vma lock will
> > + * be released there.
> > + */
> > + mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> > + migration_entry_wait_huge(vma, ptep);
> > + return 0;
>
> Oh, but now (and also one other, pre-existing case, above)
> hugetlb_fault() is returning with the vma lock held.
Note that here migration_entry_wait_huge() will release it.
Sorry, it's definitely not as straightforward, but it's also something I
couldn't find a better solution for, because we need the vma lock to
protect the spinlock, which is used deep in the migration code path.

That's also why I added a detailed comment above; the sentence "The vma
lock will be released there" is there exactly for that reason.
> This is in contrast
> with most of the rest of the function, which takes great care to release
> locks before returning.
>
> Which makes this new case really quite irregular and makes the overall
> locking harder to follow. It would be ideal to avoid doing this! But at
> the very least, there should be a little note above hugetlb_fault(),
> explaining this deviation and how it fits in with the locking rules.
>
> Do we really have to structure it this way, though?
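
One thing that may help when reading the locking here: after this patch,
__migration_entry_wait_huge() is the function that consumes the vma read
lock, and it releases that lock on every path. Condensed from the
mm/migrate.c hunk at the end of this thread:

    void __migration_entry_wait_huge(struct vm_area_struct *vma,
                                     pte_t *ptep, spinlock_t *ptl)
    {
        pte_t pte;

        /* The caller (hugetlb_fault()) passes in the vma read lock. */
        hugetlb_vma_assert_locked(vma);

        spin_lock(ptl);
        pte = huge_ptep_get(ptep);

        if (unlikely(!is_hugetlb_entry_migration(pte))) {
            /* The migration entry is already gone; nothing to wait for. */
            spin_unlock(ptl);
            hugetlb_vma_unlock_read(vma);
        } else {
            /*
             * Safe to drop the vma lock before waiting: the pgtable
             * page cannot be freed while the pgtable (ptl) lock is
             * still held.
             */
            hugetlb_vma_unlock_read(vma);
            migration_entry_wait_on_locked(pte_to_swp_entry(pte),
                                           NULL, ptl);
        }
    }

Both branches release the vma lock, which is why hugetlb_fault() can
return 0 right after calling migration_entry_wait_huge() without having
to drop anything else itself.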
On 12/7/22 14:43, Peter Xu wrote:
> Note that here migration_entry_wait_huge() will release it.
>
> Sorry, it's definitely not as straightforward, but it's also something I
> couldn't find a better solution for, because we need the vma lock to
> protect the spinlock, which is used deep in the migration code path.
>
> That's also why I added a detailed comment above; the sentence "The vma
> lock will be released there" is there exactly for that reason.
>
Yes, OK,
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
...and here is some fancy documentation polishing (incremental on top of this
specific patch) if you would like to fold it in, optional but it makes me
happier:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 49f73677a418..e3bbd4869f68 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5809,6 +5809,10 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
}
#endif
+/*
+ * There are a few special cases in which this function returns while still
+ * holding locks. Those are noted inline.
+ */
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
@@ -5851,8 +5855,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* PTE markers should be handled the same way as none pte */
if (huge_pte_none_mostly(entry))
/*
- * hugetlb_no_page will drop vma lock and hugetlb fault
- * mutex internally, which make us return immediately.
+ * hugetlb_no_page() will release both the vma lock and the
+ * hugetlb fault mutex, so just return directly from that:
*/
return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
@@ -5869,10 +5873,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_present(entry)) {
if (unlikely(is_hugetlb_entry_migration(entry))) {
/*
- * Release fault lock first because the vma lock is
- * needed to guard the huge_pte_lockptr() later in
- * migration_entry_wait_huge(). The vma lock will
- * be released there.
+ * Release the hugetlb fault lock now, but retain the
+ * vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will be
+ * released there.
*/
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, ptep);
diff --git a/mm/migrate.c b/mm/migrate.c
index d14f1f3ab073..a31df628b938 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -333,16 +333,18 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
+
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
void __migration_entry_wait_huge(struct vm_area_struct *vma,
pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
- /*
- * The vma read lock must be taken, which will be released before
- * the function returns. It makes sure the pgtable page (along
- * with its spin lock) not be freed in parallel.
- */
hugetlb_vma_assert_locked(vma);
spin_lock(ptl);
thanks,
On Wed, Dec 07, 2022 at 03:05:42PM -0800, John Hubbard wrote:
> On 12/7/22 14:43, Peter Xu wrote:
> > Note that here migration_entry_wait_huge() will release it.
> >
> > Sorry, it's definitely not as straightforward, but it's also something I
> > couldn't find a better solution for, because we need the vma lock to
> > protect the spinlock, which is used deep in the migration code path.
> >
> > That's also why I added a detailed comment above; the sentence "The vma
> > lock will be released there" is there exactly for that reason.
> >
>
> Yes, OK,
>
> Reviewed-by: John Hubbard <jhubbard@nvidia.com>
>
> ...and here is some fancy documentation polishing (incremental on top of this
> specific patch) if you would like to fold it in, optional but it makes me
> happier:
>
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 49f73677a418..e3bbd4869f68 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5809,6 +5809,10 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
> }
> #endif
> +/*
> + * There are a few special cases in which this function returns while still
> + * holding locks. Those are noted inline.
> + */
This is not true, I think? It always releases all the locks.
> vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long address, unsigned int flags)
> {
> @@ -5851,8 +5855,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> /* PTE markers should be handled the same way as none pte */
> if (huge_pte_none_mostly(entry))
> /*
> - * hugetlb_no_page will drop vma lock and hugetlb fault
> - * mutex internally, which make us return immediately.
> + * hugetlb_no_page() will release both the vma lock and the
> + * hugetlb fault mutex, so just return directly from that:
I'm probably not going to touch this part, because it's not part of the patch.
For the rest, I can do that.
I'll also apply the comment changes elsewhere when I don't speak up; in most
cases they all look good to me.
Thanks,
> */
> return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
> entry, flags);
> @@ -5869,10 +5873,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!pte_present(entry)) {
> if (unlikely(is_hugetlb_entry_migration(entry))) {
> /*
> - * Release fault lock first because the vma lock is
> - * needed to guard the huge_pte_lockptr() later in
> - * migration_entry_wait_huge(). The vma lock will
> - * be released there.
> + * Release the hugetlb fault lock now, but retain the
> + * vma lock, because it is needed to guard the
> + * huge_pte_lockptr() later in
> + * migration_entry_wait_huge(). The vma lock will be
> + * released there.
> */
> mutex_unlock(&hugetlb_fault_mutex_table[hash]);
> migration_entry_wait_huge(vma, ptep);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index d14f1f3ab073..a31df628b938 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -333,16 +333,18 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
> }
> #ifdef CONFIG_HUGETLB_PAGE
> +
> +/*
> + * The vma read lock must be held upon entry. Holding that lock prevents either
> + * the pte or the ptl from being freed.
> + *
> + * This function will release the vma lock before returning.
> + */
> void __migration_entry_wait_huge(struct vm_area_struct *vma,
> pte_t *ptep, spinlock_t *ptl)
> {
> pte_t pte;
> - /*
> - * The vma read lock must be taken, which will be released before
> - * the function returns. It makes sure the pgtable page (along
> - * with its spin lock) not be freed in parallel.
> - */
> hugetlb_vma_assert_locked(vma);
> spin_lock(ptl);
>
>
> thanks,
> --
> John Hubbard
> NVIDIA
>
On 12/8/22 12:28, Peter Xu wrote:
>> +/*
>> + * There are a few special cases in which this function returns while still
>> + * holding locks. Those are noted inline.
>> + */
>
> This is not true, I think? It always releases all the locks.
Agreed. It just looks that way at first, and I guess it was getting
late when I wrote that part. :)
thanks,
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address);
#ifdef CONFIG_HUGETLB_PAGE
-extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl);
+extern void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte);
#endif /* CONFIG_HUGETLB_PAGE */
#else /* CONFIG_MIGRATION */
@@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { }
#ifdef CONFIG_HUGETLB_PAGE
-static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { }
+static inline void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { }
#endif /* CONFIG_HUGETLB_PAGE */
static inline int is_writable_migration_entry(swp_entry_t entry)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5824,22 +5824,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -5854,10 +5838,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
@@ -5886,8 +5866,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release fault lock first because the vma lock is
+ * needed to guard the huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -333,24 +333,41 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
}
#ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+void __migration_entry_wait_huge(struct vm_area_struct *vma,
+ pte_t *ptep, spinlock_t *ptl)
{
pte_t pte;
+ /*
+ * The vma read lock must be taken, which will be released before
+ * the function returns. It makes sure the pgtable page (along
+ * with its spin lock) not be freed in parallel.
+ */
+ hugetlb_vma_assert_locked(vma);
+
spin_lock(ptl);
pte = huge_ptep_get(ptep);
- if (unlikely(!is_hugetlb_entry_migration(pte)))
+ if (unlikely(!is_hugetlb_entry_migration(pte))) {
spin_unlock(ptl);
- else
+ hugetlb_vma_unlock_read(vma);
+ } else {
+ /*
+ * If migration entry existed, safe to release vma lock
+ * here because the pgtable page won't be freed without the
+ * pgtable lock released. See comment right above pgtable
+ * lock release in migration_entry_wait_on_locked().
+ */
+ hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+ }
}
void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
- __migration_entry_wait_huge(pte, ptl);
+ __migration_entry_wait_huge(vma, pte, ptl);
}
#endif