[v4,4/4] mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs
Commit Message
UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new
PTE to resolve a missing fault, one can install a write-protected one.
This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in
combination.
This was motivated by testing HugeTLB HGM [1], and in particular its
interaction with userfaultfd features. Existing userfaultfd code
supports using WP and MINOR modes together (i.e. you can register an
area with both enabled), but without this CONTINUE flag the combination
is in practice unusable.
So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing
as UFFDIO_COPY_MODE_WP, but for *minor* faults.
Update the selftest to do some very basic exercising of the new flag.
[1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/
Acked-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
---
fs/userfaultfd.c | 8 ++++++--
include/linux/userfaultfd_k.h | 3 ++-
include/uapi/linux/userfaultfd.h | 7 +++++++
mm/userfaultfd.c | 5 +++--
tools/testing/selftests/mm/userfaultfd.c | 4 ++++
5 files changed, 22 insertions(+), 5 deletions(-)
Comments
On Wed, Mar 08, 2023 at 02:19:32PM -0800, Axel Rasmussen wrote:
> UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new
> PTE to resolve a missing fault, one can install a write-protected one.
> This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in
> combination.
>
> This was motivated by testing HugeTLB HGM [1], and in particular its
> interaction with userfaultfd features. Existing userfaultfd code
> supports using WP and MINOR modes together (i.e. you can register an
> area with both enabled), but without this CONTINUE flag the combination
> is in practice unusable.
>
> So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing
> as UFFDIO_COPY_MODE_WP, but for *minor* faults.
>
> Update the selftest to do some very basic exercising of the new flag.
>
> [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/
>
> Acked-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
> ---
> fs/userfaultfd.c | 8 ++++++--
> include/linux/userfaultfd_k.h | 3 ++-
> include/uapi/linux/userfaultfd.h | 7 +++++++
> mm/userfaultfd.c | 5 +++--
> tools/testing/selftests/mm/userfaultfd.c | 4 ++++
> 5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 005e5e306266..14059a0861bf 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -297,6 +297,13 @@ struct uffdio_writeprotect {
> struct uffdio_continue {
> struct uffdio_range range;
> #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
> + /*
> + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
> + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
> + * write protected ioctl is implemented for the range
> + * according to the uffdio_register.ioctls.
> + */
> +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
Please add the description of the new flag to Documentation/ and to the
userfaultfd man pages.
> __u64 mode;
>
> /*
On Thu, Mar 9, 2023 at 1:11 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Wed, Mar 08, 2023 at 02:19:32PM -0800, Axel Rasmussen wrote:
> > UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new
> > PTE to resolve a missing fault, one can install a write-protected one.
> > This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in
> > combination.
> >
> > This was motivated by testing HugeTLB HGM [1], and in particular its
> > interaction with userfaultfd features. Existing userfaultfd code
> > supports using WP and MINOR modes together (i.e. you can register an
> > area with both enabled), but without this CONTINUE flag the combination
> > is in practice unusable.
> >
> > So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing
> > as UFFDIO_COPY_MODE_WP, but for *minor* faults.
> >
> > Update the selftest to do some very basic exercising of the new flag.
> >
> > [1]: https://patchwork.kernel.org/project/linux-mm/cover/20230218002819.1486479-1-jthoughton@google.com/
> >
> > Acked-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
>
> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
>
> > ---
> > fs/userfaultfd.c | 8 ++++++--
> > include/linux/userfaultfd_k.h | 3 ++-
> > include/uapi/linux/userfaultfd.h | 7 +++++++
> > mm/userfaultfd.c | 5 +++--
> > tools/testing/selftests/mm/userfaultfd.c | 4 ++++
> > 5 files changed, 22 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 005e5e306266..14059a0861bf 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -297,6 +297,13 @@ struct uffdio_writeprotect {
> > struct uffdio_continue {
> > struct uffdio_range range;
> > #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
> > + /*
> > + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
> > + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
> > + * write protected ioctl is implemented for the range
> > + * according to the uffdio_register.ioctls.
> > + */
> > +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
>
> Please add the description of the new flag to Documentation/ and to the
> userfaultfd man pages.
Funny enough, neither flag is mentioned in Documentation/ today - I'll
add a short passage about both.
Happy to update the man pages as well, I'll send that patch separately.
Thanks for reviewing!
>
> > __u64 mode;
> >
> > /*
>
> --
> Sincerely yours,
> Mike.
@@ -1878,6 +1878,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;
+ uffd_flags_t flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1902,13 +1903,16 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
uffdio_continue.range.start) {
goto out;
}
- if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len,
- &ctx->mmap_changing);
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -82,7 +82,8 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
unsigned long len,
atomic_t *mmap_changing);
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing);
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
@@ -297,6 +297,13 @@ struct uffdio_writeprotect {
struct uffdio_continue {
struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
+ /*
+ * UFFDIO_CONTINUE_MODE_WP will map the page write-protected on
+ * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
+ * write-protect ioctl (UFFDIO_WRITEPROTECT) is listed for the
+ * range in uffdio_register.ioctls.
+ */
+#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
__u64 mode;
/*
@@ -693,10 +693,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
}
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ unsigned long len, atomic_t *mmap_changing,
+ uffd_flags_t flags)
{
return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
- uffd_flags_set_mode(0, MFILL_ATOMIC_CONTINUE));
+ uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
long uffd_wp_range(struct vm_area_struct *dst_vma,
@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len)
req.range.start = start;
req.range.len = len;
req.mode = 0;
+ if (test_uffdio_wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;
if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.start = (unsigned long)area_dst_alias;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");