[2/3] mm: drop VMA lock before waiting for migration
Commit Message
migration_entry_wait does not need VMA lock, therefore it can be dropped
before waiting. Introduce VM_FAULT_VMA_UNLOCKED to indicate that VMA
lock was dropped while in handle_mm_fault().
Note that once VMA lock is dropped, the VMA reference can't be used as
there are no guarantees it was not freed.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
arch/arm64/mm/fault.c | 3 ++-
arch/powerpc/mm/fault.c | 3 ++-
arch/s390/mm/fault.c | 3 ++-
arch/x86/mm/fault.c | 3 ++-
include/linux/mm_types.h | 6 +++++-
mm/memory.c | 12 ++++++++++--
6 files changed, 23 insertions(+), 7 deletions(-)
Comments
Suren Baghdasaryan <surenb@google.com> writes:
[...]
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 306a3d1a0fa6..b3b57c6da0e1 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t;
> * fsync() to complete (for synchronous page faults
> * in DAX)
> * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
> + * @VM_FAULT_VMA_UNLOCKED: VMA lock was released
A note here saying vmf->vma should no longer be accessed would be nice.
> * @VM_FAULT_HINDEX_MASK: mask HINDEX value
> *
> */
> @@ -1047,6 +1048,7 @@ enum vm_fault_reason {
> VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
> VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
> VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
> + VM_FAULT_VMA_UNLOCKED = (__force vm_fault_t)0x008000,
> VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
> };
>
> @@ -1070,7 +1072,9 @@ enum vm_fault_reason {
> { VM_FAULT_RETRY, "RETRY" }, \
> { VM_FAULT_FALLBACK, "FALLBACK" }, \
> { VM_FAULT_DONE_COW, "DONE_COW" }, \
> - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
> + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
> + { VM_FAULT_COMPLETED, "COMPLETED" }, \
VM_FAULT_COMPLETED isn't used in this patch, guessing that's snuck in
from one of the other patches in the series?
> + { VM_FAULT_VMA_UNLOCKED, "VMA_UNLOCKED" }
>
> struct vm_special_mapping {
> const char *name; /* The name, e.g. "[vdso]". */
> diff --git a/mm/memory.c b/mm/memory.c
> index 41f45819a923..8222acf74fd3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3714,8 +3714,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> entry = pte_to_swp_entry(vmf->orig_pte);
> if (unlikely(non_swap_entry(entry))) {
> if (is_migration_entry(entry)) {
> - migration_entry_wait(vma->vm_mm, vmf->pmd,
> - vmf->address);
> + /* Save mm in case VMA lock is dropped */
> + struct mm_struct *mm = vma->vm_mm;
> +
> + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> + /* No need to hold VMA lock for migration */
> + vma_end_read(vma);
> + /* CAUTION! VMA can't be used after this */
> + ret |= VM_FAULT_VMA_UNLOCKED;
> + }
> + migration_entry_wait(mm, vmf->pmd, vmf->address);
> } else if (is_device_exclusive_entry(entry)) {
> vmf->page = pfn_swap_entry_to_page(entry);
> ret = remove_device_exclusive_entry(vmf);
On Mon, May 01, 2023 at 10:50:24AM -0700, Suren Baghdasaryan wrote:
> migration_entry_wait does not need VMA lock, therefore it can be dropped
> before waiting. Introduce VM_FAULT_VMA_UNLOCKED to indicate that VMA
> lock was dropped while in handle_mm_fault().
> Note that once VMA lock is dropped, the VMA reference can't be used as
> there are no guarantees it was not freed.
How about we introduce:
void vmf_end_read(struct vm_fault *vmf)
{
if (!vmf->vma)
return;
vma_end_read(vmf->vma);
vmf->vma = NULL;
}
Now we don't need a new flag, and calling vmf_end_read() is idempotent.
Oh, argh, we create the vmf too late. We really need to hoist the
creation of vm_fault to the callers of handle_mm_fault().
On Tue, May 2, 2023 at 6:26 AM 'Alistair Popple' via kernel-team
<kernel-team@android.com> wrote:
>
>
> Suren Baghdasaryan <surenb@google.com> writes:
>
> [...]
>
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 306a3d1a0fa6..b3b57c6da0e1 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t;
> > * fsync() to complete (for synchronous page faults
> > * in DAX)
> > * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
> > + * @VM_FAULT_VMA_UNLOCKED: VMA lock was released
>
> A note here saying vmf->vma should no longer be accessed would be nice.
Good idea. Will add in the next version. Thanks!
>
> > * @VM_FAULT_HINDEX_MASK: mask HINDEX value
> > *
> > */
> > @@ -1047,6 +1048,7 @@ enum vm_fault_reason {
> > VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
> > VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
> > VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
> > + VM_FAULT_VMA_UNLOCKED = (__force vm_fault_t)0x008000,
> > VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
> > };
> >
> > @@ -1070,7 +1072,9 @@ enum vm_fault_reason {
> > { VM_FAULT_RETRY, "RETRY" }, \
> > { VM_FAULT_FALLBACK, "FALLBACK" }, \
> > { VM_FAULT_DONE_COW, "DONE_COW" }, \
> > - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
> > + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
> > + { VM_FAULT_COMPLETED, "COMPLETED" }, \
>
> VM_FAULT_COMPLETED isn't used in this patch, guessing that's snuck in
> from one of the other patches in the series?
I noticed that an entry for VM_FAULT_COMPLETED was missing and wanted
to fix that... Should I drop that?
>
> > + { VM_FAULT_VMA_UNLOCKED, "VMA_UNLOCKED" }
> >
> > struct vm_special_mapping {
> > const char *name; /* The name, e.g. "[vdso]". */
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 41f45819a923..8222acf74fd3 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3714,8 +3714,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > entry = pte_to_swp_entry(vmf->orig_pte);
> > if (unlikely(non_swap_entry(entry))) {
> > if (is_migration_entry(entry)) {
> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
> > - vmf->address);
> > + /* Save mm in case VMA lock is dropped */
> > + struct mm_struct *mm = vma->vm_mm;
> > +
> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > + /* No need to hold VMA lock for migration */
> > + vma_end_read(vma);
> > + /* CAUTION! VMA can't be used after this */
> > + ret |= VM_FAULT_VMA_UNLOCKED;
> > + }
> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
> > } else if (is_device_exclusive_entry(entry)) {
> > vmf->page = pfn_swap_entry_to_page(entry);
> > ret = remove_device_exclusive_entry(vmf);
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
On Tue, May 2, 2023 at 7:28 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, May 01, 2023 at 10:50:24AM -0700, Suren Baghdasaryan wrote:
> > migration_entry_wait does not need VMA lock, therefore it can be dropped
> > before waiting. Introduce VM_FAULT_VMA_UNLOCKED to indicate that VMA
> > lock was dropped while in handle_mm_fault().
> > Note that once VMA lock is dropped, the VMA reference can't be used as
> > there are no guarantees it was not freed.
>
> How about we introduce:
>
> void vmf_end_read(struct vm_fault *vmf)
> {
> if (!vmf->vma)
> return;
> vma_end_read(vmf->vma);
> vmf->vma = NULL;
> }
>
> Now we don't need a new flag, and calling vmf_end_read() is idempotent.
>
> Oh, argh, we create the vmf too late. We really need to hoist the
> creation of vm_fault to the callers of handle_mm_fault().
Yeah, unfortunately vmf does not propagate all the way up to
do_user_addr_fault which needs to know that we dropped the lock.
>
Suren Baghdasaryan <surenb@google.com> writes:
> On Tue, May 2, 2023 at 6:26 AM 'Alistair Popple' via kernel-team
> <kernel-team@android.com> wrote:
>>
>>
>> Suren Baghdasaryan <surenb@google.com> writes:
>>
>> [...]
>>
>> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> > index 306a3d1a0fa6..b3b57c6da0e1 100644
>> > --- a/include/linux/mm_types.h
>> > +++ b/include/linux/mm_types.h
>> > @@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t;
>> > * fsync() to complete (for synchronous page faults
>> > * in DAX)
>> > * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
>> > + * @VM_FAULT_VMA_UNLOCKED: VMA lock was released
>>
>> A note here saying vmf->vma should no longer be accessed would be nice.
>
> Good idea. Will add in the next version. Thanks!
>
>>
>> > * @VM_FAULT_HINDEX_MASK: mask HINDEX value
>> > *
>> > */
>> > @@ -1047,6 +1048,7 @@ enum vm_fault_reason {
>> > VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
>> > VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
>> > VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
>> > + VM_FAULT_VMA_UNLOCKED = (__force vm_fault_t)0x008000,
>> > VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
>> > };
>> >
>> > @@ -1070,7 +1072,9 @@ enum vm_fault_reason {
>> > { VM_FAULT_RETRY, "RETRY" }, \
>> > { VM_FAULT_FALLBACK, "FALLBACK" }, \
>> > { VM_FAULT_DONE_COW, "DONE_COW" }, \
>> > - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
>> > + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
>> > + { VM_FAULT_COMPLETED, "COMPLETED" }, \
>>
>> VM_FAULT_COMPLETED isn't used in this patch, guessing that's snuck in
>> from one of the other patches in the series?
>
> I noticed that an entry for VM_FAULT_COMPLETED was missing and wanted
> to fix that... Should I drop that?
Oh ok. It would certainly be good to add but really it should be its
own patch.
>>
>> > + { VM_FAULT_VMA_UNLOCKED, "VMA_UNLOCKED" }
>> >
>> > struct vm_special_mapping {
>> > const char *name; /* The name, e.g. "[vdso]". */
>> > diff --git a/mm/memory.c b/mm/memory.c
>> > index 41f45819a923..8222acf74fd3 100644
>> > --- a/mm/memory.c
>> > +++ b/mm/memory.c
>> > @@ -3714,8 +3714,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>> > entry = pte_to_swp_entry(vmf->orig_pte);
>> > if (unlikely(non_swap_entry(entry))) {
>> > if (is_migration_entry(entry)) {
>> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
>> > - vmf->address);
>> > + /* Save mm in case VMA lock is dropped */
>> > + struct mm_struct *mm = vma->vm_mm;
>> > +
>> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
>> > + /* No need to hold VMA lock for migration */
>> > + vma_end_read(vma);
>> > + /* CAUTION! VMA can't be used after this */
>> > + ret |= VM_FAULT_VMA_UNLOCKED;
>> > + }
>> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
>> > } else if (is_device_exclusive_entry(entry)) {
>> > vmf->page = pfn_swap_entry_to_page(entry);
>> > ret = remove_device_exclusive_entry(vmf);
>>
>> --
>> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>>
On Wed, May 3, 2023 at 6:05 AM Alistair Popple <apopple@nvidia.com> wrote:
>
>
> Suren Baghdasaryan <surenb@google.com> writes:
>
> > On Tue, May 2, 2023 at 6:26 AM 'Alistair Popple' via kernel-team
> > <kernel-team@android.com> wrote:
> >>
> >>
> >> Suren Baghdasaryan <surenb@google.com> writes:
> >>
> >> [...]
> >>
> >> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> >> > index 306a3d1a0fa6..b3b57c6da0e1 100644
> >> > --- a/include/linux/mm_types.h
> >> > +++ b/include/linux/mm_types.h
> >> > @@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t;
> >> > * fsync() to complete (for synchronous page faults
> >> > * in DAX)
> >> > * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
> >> > + * @VM_FAULT_VMA_UNLOCKED: VMA lock was released
> >>
> >> A note here saying vmf->vma should no longer be accessed would be nice.
> >
> > Good idea. Will add in the next version. Thanks!
> >
> >>
> >> > * @VM_FAULT_HINDEX_MASK: mask HINDEX value
> >> > *
> >> > */
> >> > @@ -1047,6 +1048,7 @@ enum vm_fault_reason {
> >> > VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
> >> > VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
> >> > VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
> >> > + VM_FAULT_VMA_UNLOCKED = (__force vm_fault_t)0x008000,
> >> > VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
> >> > };
> >> >
> >> > @@ -1070,7 +1072,9 @@ enum vm_fault_reason {
> >> > { VM_FAULT_RETRY, "RETRY" }, \
> >> > { VM_FAULT_FALLBACK, "FALLBACK" }, \
> >> > { VM_FAULT_DONE_COW, "DONE_COW" }, \
> >> > - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
> >> > + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
> >> > + { VM_FAULT_COMPLETED, "COMPLETED" }, \
> >>
> >> VM_FAULT_COMPLETED isn't used in this patch, guessing that's snuck in
> >> from one of the other patches in the series?
> >
> > I noticed that an entry for VM_FAULT_COMPLETED was missing and wanted
> > to fix that... Should I drop that?
>
> Oh ok. It would certainly be good to add but really it should be its
> own patch.
Ack. Will split in the next version. Thanks!
>
> >>
> >> > + { VM_FAULT_VMA_UNLOCKED, "VMA_UNLOCKED" }
> >> >
> >> > struct vm_special_mapping {
> >> > const char *name; /* The name, e.g. "[vdso]". */
> >> > diff --git a/mm/memory.c b/mm/memory.c
> >> > index 41f45819a923..8222acf74fd3 100644
> >> > --- a/mm/memory.c
> >> > +++ b/mm/memory.c
> >> > @@ -3714,8 +3714,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >> > entry = pte_to_swp_entry(vmf->orig_pte);
> >> > if (unlikely(non_swap_entry(entry))) {
> >> > if (is_migration_entry(entry)) {
> >> > - migration_entry_wait(vma->vm_mm, vmf->pmd,
> >> > - vmf->address);
> >> > + /* Save mm in case VMA lock is dropped */
> >> > + struct mm_struct *mm = vma->vm_mm;
> >> > +
> >> > + if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> >> > + /* No need to hold VMA lock for migration */
> >> > + vma_end_read(vma);
> >> > + /* CAUTION! VMA can't be used after this */
> >> > + ret |= VM_FAULT_VMA_UNLOCKED;
> >> > + }
> >> > + migration_entry_wait(mm, vmf->pmd, vmf->address);
> >> > } else if (is_device_exclusive_entry(entry)) {
> >> > vmf->page = pfn_swap_entry_to_page(entry);
> >> > ret = remove_device_exclusive_entry(vmf);
> >>
> >> --
> >> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >>
>
@@ -602,7 +602,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
}
fault = handle_mm_fault(vma, addr & PAGE_MASK,
mm_flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & VM_FAULT_VMA_UNLOCKED))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
@@ -494,7 +494,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & VM_FAULT_VMA_UNLOCKED))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
@@ -418,7 +418,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & VM_FAULT_VMA_UNLOCKED))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
goto out;
@@ -1347,7 +1347,8 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & VM_FAULT_VMA_UNLOCKED))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
@@ -1030,6 +1030,7 @@ typedef __bitwise unsigned int vm_fault_t;
* fsync() to complete (for synchronous page faults
* in DAX)
* @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_VMA_UNLOCKED: VMA lock was released
* @VM_FAULT_HINDEX_MASK: mask HINDEX value
*
*/
@@ -1047,6 +1048,7 @@ enum vm_fault_reason {
VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000,
VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000,
VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000,
+ VM_FAULT_VMA_UNLOCKED = (__force vm_fault_t)0x008000,
VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000,
};
@@ -1070,7 +1072,9 @@ enum vm_fault_reason {
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
- { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
+ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \
+ { VM_FAULT_COMPLETED, "COMPLETED" }, \
+ { VM_FAULT_VMA_UNLOCKED, "VMA_UNLOCKED" }
struct vm_special_mapping {
const char *name; /* The name, e.g. "[vdso]". */
@@ -3714,8 +3714,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, vmf->pmd,
- vmf->address);
+ /* Save mm in case VMA lock is dropped */
+ struct mm_struct *mm = vma->vm_mm;
+
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ /* No need to hold VMA lock for migration */
+ vma_end_read(vma);
+ /* CAUTION! VMA can't be used after this */
+ ret |= VM_FAULT_VMA_UNLOCKED;
+ }
+ migration_entry_wait(mm, vmf->pmd, vmf->address);
} else if (is_device_exclusive_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf);