[v2,12/31] selftests/mm: Create uffd-common.[ch]
Commit Message
Move common utility functions into uffd-common.[ch] files from the original
userfaultfd.c. This prepares for a split of userfaultfd.c into two tests:
one to only cover the old but powerful stress test, the other one covers
all the functional tests.
This movement is kind of a brute-force effort for now, with light touch-ups
but nothing should really change. There are chances to optimize more, but
let's leave that for later.
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
---
tools/testing/selftests/mm/Makefile | 2 +
tools/testing/selftests/mm/uffd-common.c | 611 ++++++++++++++++++++
tools/testing/selftests/mm/uffd-common.h | 117 ++++
tools/testing/selftests/mm/userfaultfd.c | 694 +----------------------
4 files changed, 731 insertions(+), 693 deletions(-)
create mode 100644 tools/testing/selftests/mm/uffd-common.c
create mode 100644 tools/testing/selftests/mm/uffd-common.h
Comments
On Wed, Apr 12, 2023 at 9:42 AM Peter Xu <peterx@redhat.com> wrote:
>
> Move common utility functions into uffd-common.[ch] files from the original
> userfaultfd.c. This prepares for a split of userfaultfd.c into two tests:
> one to only cover the old but powerful stress test, the other one covers
> all the functional tests.
>
> This movement is kind of a brute-force effort for now, with light touch-ups
> but nothing should really change. There's chances to optimize more, but
> let's leave that for later.
>
> Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
> Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Axel Rasmussen <axelrasmussen@google.com>
> ---
> tools/testing/selftests/mm/Makefile | 2 +
> tools/testing/selftests/mm/uffd-common.c | 611 ++++++++++++++++++++
> tools/testing/selftests/mm/uffd-common.h | 117 ++++
> tools/testing/selftests/mm/userfaultfd.c | 694 +----------------------
> 4 files changed, 731 insertions(+), 693 deletions(-)
> create mode 100644 tools/testing/selftests/mm/uffd-common.c
> create mode 100644 tools/testing/selftests/mm/uffd-common.h
>
> diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
> index 5f7626550e5f..36467c15ca00 100644
> --- a/tools/testing/selftests/mm/Makefile
> +++ b/tools/testing/selftests/mm/Makefile
> @@ -108,6 +108,8 @@ include ../lib.mk
>
> $(TEST_GEN_PROGS): vm_util.c
>
> +$(OUTPUT)/userfaultfd: uffd-common.c
> +
> ifeq ($(MACHINE),x86_64)
> BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
> BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
> diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
> new file mode 100644
> index 000000000000..c57757c2a36f
> --- /dev/null
> +++ b/tools/testing/selftests/mm/uffd-common.c
> @@ -0,0 +1,611 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Userfaultfd tests util functions
> + *
> + * Copyright (C) 2015-2023 Red Hat, Inc.
> + */
> +
> +#include "uffd-common.h"
> +
> +#define BASE_PMD_ADDR ((void *)(1UL << 30))
> +
> +volatile bool test_uffdio_copy_eexist = true;
> +unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
> +char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
> +int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type;
> +bool map_shared, test_collapse, test_dev_userfaultfd;
> +bool test_uffdio_wp = true, test_uffdio_minor = false;
> +unsigned long long *count_verify;
> +uffd_test_ops_t *uffd_test_ops;
> +
> +static void anon_release_pages(char *rel_area)
> +{
> + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
> + err("madvise(MADV_DONTNEED) failed");
> +}
> +
> +static void anon_allocate_area(void **alloc_area, bool is_src)
> +{
> + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
> + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +}
> +
> +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> +{
> +}
> +
> +static void hugetlb_release_pages(char *rel_area)
> +{
> + if (!map_shared) {
> + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
> + err("madvise(MADV_DONTNEED) failed");
> + } else {
> + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
> + err("madvise(MADV_REMOVE) failed");
> + }
> +}
> +
> +static void hugetlb_allocate_area(void **alloc_area, bool is_src)
> +{
> + off_t size = nr_pages * page_size;
> + off_t offset = is_src ? 0 : size;
> + void *area_alias = NULL;
> + char **alloc_area_alias;
> +
> + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
> + (map_shared ? MAP_SHARED : MAP_PRIVATE) |
> + (is_src ? 0 : MAP_NORESERVE),
> + mem_fd, offset);
> + if (*alloc_area == MAP_FAILED)
> + err("mmap of hugetlbfs file failed");
> +
> + if (map_shared) {
> + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
> + MAP_SHARED, mem_fd, offset);
> + if (area_alias == MAP_FAILED)
> + err("mmap of hugetlb file alias failed");
> + }
> +
> + if (is_src) {
> + alloc_area_alias = &area_src_alias;
> + } else {
> + alloc_area_alias = &area_dst_alias;
> + }
> + if (area_alias)
> + *alloc_area_alias = area_alias;
> +}
> +
> +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> +{
> + if (!map_shared)
> + return;
> +
> + *start = (unsigned long) area_dst_alias + offset;
> +}
> +
> +static void shmem_release_pages(char *rel_area)
> +{
> + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
> + err("madvise(MADV_REMOVE) failed");
> +}
> +
> +static void shmem_allocate_area(void **alloc_area, bool is_src)
> +{
> + void *area_alias = NULL;
> + size_t bytes = nr_pages * page_size;
> + unsigned long offset = is_src ? 0 : bytes;
> + char *p = NULL, *p_alias = NULL;
> +
> + if (test_collapse) {
> + p = BASE_PMD_ADDR;
> + if (!is_src)
> + /* src map + alias + interleaved hpages */
> + p += 2 * (bytes + hpage_size);
> + p_alias = p;
> + p_alias += bytes;
> + p_alias += hpage_size; /* Prevent src/dst VMA merge */
> + }
> +
> + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
> + mem_fd, offset);
> + if (*alloc_area == MAP_FAILED)
> + err("mmap of memfd failed");
> + if (test_collapse && *alloc_area != p)
> + err("mmap of memfd failed at %p", p);
> +
> + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
> + mem_fd, offset);
> + if (area_alias == MAP_FAILED)
> + err("mmap of memfd alias failed");
> + if (test_collapse && area_alias != p_alias)
> + err("mmap of anonymous memory failed at %p", p_alias);
> +
> + if (is_src)
> + area_src_alias = area_alias;
> + else
> + area_dst_alias = area_alias;
> +}
> +
> +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> +{
> + *start = (unsigned long)area_dst_alias + offset;
> +}
> +
> +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
> +{
> + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
> + err("Did not find expected %d number of hugepages",
> + expect_nr_hpages);
> +}
> +
> +struct uffd_test_ops anon_uffd_test_ops = {
> + .allocate_area = anon_allocate_area,
> + .release_pages = anon_release_pages,
> + .alias_mapping = noop_alias_mapping,
> + .check_pmd_mapping = NULL,
> +};
> +
> +struct uffd_test_ops shmem_uffd_test_ops = {
> + .allocate_area = shmem_allocate_area,
> + .release_pages = shmem_release_pages,
> + .alias_mapping = shmem_alias_mapping,
> + .check_pmd_mapping = shmem_check_pmd_mapping,
> +};
> +
> +struct uffd_test_ops hugetlb_uffd_test_ops = {
> + .allocate_area = hugetlb_allocate_area,
> + .release_pages = hugetlb_release_pages,
> + .alias_mapping = hugetlb_alias_mapping,
> + .check_pmd_mapping = NULL,
> +};
> +
> +void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
> +{
> + int i;
> + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
> +
> + for (i = 0; i < n_cpus; i++) {
> + miss_total += stats[i].missing_faults;
> + wp_total += stats[i].wp_faults;
> + minor_total += stats[i].minor_faults;
> + }
> +
> + printf("userfaults: ");
> + if (miss_total) {
> + printf("%llu missing (", miss_total);
> + for (i = 0; i < n_cpus; i++)
> + printf("%lu+", stats[i].missing_faults);
> + printf("\b) ");
> + }
> + if (wp_total) {
> + printf("%llu wp (", wp_total);
> + for (i = 0; i < n_cpus; i++)
> + printf("%lu+", stats[i].wp_faults);
> + printf("\b) ");
> + }
> + if (minor_total) {
> + printf("%llu minor (", minor_total);
> + for (i = 0; i < n_cpus; i++)
> + printf("%lu+", stats[i].minor_faults);
> + printf("\b)");
> + }
> + printf("\n");
> +}
> +
> +static int __userfaultfd_open_dev(void)
> +{
> + int fd, _uffd;
> +
> + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> + if (fd < 0)
> + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
> +
> + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
> + if (_uffd < 0)
> + errexit(errno == ENOTTY ? KSFT_SKIP : 1,
> + "creating userfaultfd failed");
> + close(fd);
> + return _uffd;
> +}
> +
> +void userfaultfd_open(uint64_t *features)
> +{
> + struct uffdio_api uffdio_api;
> +
> + if (test_dev_userfaultfd)
> + uffd = __userfaultfd_open_dev();
> + else {
> + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
> + if (uffd < 0)
> + errexit(errno == ENOSYS ? KSFT_SKIP : 1,
> + "creating userfaultfd failed");
> + }
> + uffd_flags = fcntl(uffd, F_GETFD, NULL);
> +
> + uffdio_api.api = UFFD_API;
> + uffdio_api.features = *features;
> + if (ioctl(uffd, UFFDIO_API, &uffdio_api))
> + err("UFFDIO_API failed.\nPlease make sure to "
> + "run with either root or ptrace capability.");
> + if (uffdio_api.api != UFFD_API)
> + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
> +
> + *features = uffdio_api.features;
> +}
> +
> +static inline void munmap_area(void **area)
> +{
> + if (*area)
> + if (munmap(*area, nr_pages * page_size))
> + err("munmap");
> +
> + *area = NULL;
> +}
> +
> +static void uffd_test_ctx_clear(void)
> +{
> + size_t i;
> +
> + if (pipefd) {
> + for (i = 0; i < nr_cpus * 2; ++i) {
> + if (close(pipefd[i]))
> + err("close pipefd");
> + }
> + free(pipefd);
> + pipefd = NULL;
> + }
> +
> + if (count_verify) {
> + free(count_verify);
> + count_verify = NULL;
> + }
> +
> + if (uffd != -1) {
> + if (close(uffd))
> + err("close uffd");
> + uffd = -1;
> + }
> +
> + munmap_area((void **)&area_src);
> + munmap_area((void **)&area_src_alias);
> + munmap_area((void **)&area_dst);
> + munmap_area((void **)&area_dst_alias);
> + munmap_area((void **)&area_remap);
> +}
> +
> +void uffd_test_ctx_init(uint64_t features)
> +{
> + unsigned long nr, cpu;
> +
> + uffd_test_ctx_clear();
> +
> + uffd_test_ops->allocate_area((void **)&area_src, true);
> + uffd_test_ops->allocate_area((void **)&area_dst, false);
> +
> + userfaultfd_open(&features);
> +
> + count_verify = malloc(nr_pages * sizeof(unsigned long long));
> + if (!count_verify)
> + err("count_verify");
> +
> + for (nr = 0; nr < nr_pages; nr++) {
> + *area_mutex(area_src, nr) =
> + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
> + count_verify[nr] = *area_count(area_src, nr) = 1;
> + /*
> + * In the transition between 255 to 256, powerpc will
> + * read out of order in my_bcmp and see both bytes as
> + * zero, so leave a placeholder below always non-zero
> + * after the count, to avoid my_bcmp to trigger false
> + * positives.
> + */
> + *(area_count(area_src, nr) + 1) = 1;
> + }
> +
> + /*
> + * After initialization of area_src, we must explicitly release pages
> + * for area_dst to make sure it's fully empty. Otherwise we could have
> + * some area_dst pages be erroneously initialized with zero pages,
> + * hence we could hit memory corruption later in the test.
> + *
> + * One example is when THP is globally enabled, above allocate_area()
> + * calls could have the two areas merged into a single VMA (as they
> + * will have the same VMA flags so they're mergeable). When we
> + * initialize the area_src above, it's possible that some part of
> + * area_dst could have been faulted in via one huge THP that will be
> + * shared between area_src and area_dst. It could cause some of the
> + * area_dst won't be trapped by missing userfaults.
> + *
> + * This release_pages() will guarantee even if that happened, we'll
> + * proactively split the thp and drop any accidentally initialized
> + * pages within area_dst.
> + */
> + uffd_test_ops->release_pages(area_dst);
> +
> + pipefd = malloc(sizeof(int) * nr_cpus * 2);
> + if (!pipefd)
> + err("pipefd");
> + for (cpu = 0; cpu < nr_cpus; cpu++)
> + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
> + err("pipe");
> +}
> +
> +uint64_t get_expected_ioctls(uint64_t mode)
> +{
> + uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
> +
> + if (test_type == TEST_HUGETLB)
> + ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
> +
> + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
> + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
> +
> + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
> + ioctls &= ~(1 << _UFFDIO_CONTINUE);
> +
> + return ioctls;
> +}
> +
> +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
> +{
> + uint64_t expected = get_expected_ioctls(mode);
> + uint64_t actual = ioctls & expected;
> +
> + if (actual != expected) {
> + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
> + expected, actual);
> + }
> +}
> +
> +void wp_range(int ufd, __u64 start, __u64 len, bool wp)
> +{
> + struct uffdio_writeprotect prms;
> +
> + /* Write protection page faults */
> + prms.range.start = start;
> + prms.range.len = len;
> + /* Undo write-protect, do wakeup after that */
> + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
> +
> + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
> + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
> +}
> +
> +static void continue_range(int ufd, __u64 start, __u64 len)
> +{
> + struct uffdio_continue req;
> + int ret;
> +
> + req.range.start = start;
> + req.range.len = len;
> + req.mode = 0;
> + if (test_uffdio_wp)
> + req.mode |= UFFDIO_CONTINUE_MODE_WP;
> +
> + if (ioctl(ufd, UFFDIO_CONTINUE, &req))
> + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
> + (uint64_t)start);
> +
> + /*
> + * Error handling within the kernel for continue is subtly different
> + * from copy or zeropage, so it may be a source of bugs. Trigger an
> + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
> + */
> + req.mapped = 0;
> + ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
> + if (ret >= 0 || req.mapped != -EEXIST)
> + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
> + ret, (int64_t) req.mapped);
> +}
> +
> +int uffd_read_msg(int ufd, struct uffd_msg *msg)
> +{
> +	int ret = read(ufd, msg, sizeof(*msg));
> +
> +	if (ret != sizeof(*msg)) {
> +		if (ret < 0) {
> +			if (errno == EAGAIN || errno == EINTR)
> +				return 1;
> +			err("blocking read error");
> +		} else {
> +			err("short read");
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats)
> +{
> + unsigned long offset;
> +
> + if (msg->event != UFFD_EVENT_PAGEFAULT)
> + err("unexpected msg event %u", msg->event);
> +
> + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
> + /* Write protect page faults */
> + wp_range(uffd, msg->arg.pagefault.address, page_size, false);
> + stats->wp_faults++;
> + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
> + uint8_t *area;
> + int b;
> +
> + /*
> + * Minor page faults
> + *
> + * To prove we can modify the original range for testing
> + * purposes, we're going to bit flip this range before
> + * continuing.
> + *
> + * Note that this requires all minor page fault tests operate on
> + * area_dst (non-UFFD-registered) and area_dst_alias
> + * (UFFD-registered).
> + */
> +
> + area = (uint8_t *)(area_dst +
> + ((char *)msg->arg.pagefault.address -
> + area_dst_alias));
> + for (b = 0; b < page_size; ++b)
> + area[b] = ~area[b];
> + continue_range(uffd, msg->arg.pagefault.address, page_size);
> + stats->minor_faults++;
> + } else {
> + /*
> + * Missing page faults.
> + *
> + * Here we force a write check for each of the missing mode
> + * faults. It's guaranteed because the only threads that
> + * will trigger uffd faults are the locking threads, and
> + * their first instruction to touch the missing page will
> + * always be pthread_mutex_lock().
> + *
> + * Note that here we relied on an NPTL glibc impl detail to
> + * always read the lock type at the entry of the lock op
> + * (pthread_mutex_t.__data.__type, offset 0x10) before
> + * doing any locking operations to guarantee that. It's
> + * actually not good to rely on this impl detail because
> + * logically a pthread-compatible lib can implement the
> + * locks without types and we can fail when linking with
> + * them. However since we used to find bugs with this
> + * strict check we still keep it around. Hopefully this
> + * could be a good hint when it fails again. If one day
> + * it'll break on some other impl of glibc we'll revisit.
> + */
> + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
> + err("unexpected write fault");
> +
> + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
> + offset &= ~(page_size-1);
> +
> + if (copy_page(uffd, offset))
> + stats->missing_faults++;
> + }
> +}
> +
> +void *uffd_poll_thread(void *arg)
> +{
> + struct uffd_stats *stats = (struct uffd_stats *)arg;
> + unsigned long cpu = stats->cpu;
> + struct pollfd pollfd[2];
> + struct uffd_msg msg;
> + struct uffdio_register uffd_reg;
> + int ret;
> + char tmp_chr;
> +
> + pollfd[0].fd = uffd;
> + pollfd[0].events = POLLIN;
> + pollfd[1].fd = pipefd[cpu*2];
> + pollfd[1].events = POLLIN;
> +
> + for (;;) {
> + ret = poll(pollfd, 2, -1);
> + if (ret <= 0) {
> + if (errno == EINTR || errno == EAGAIN)
> + continue;
> + err("poll error: %d", ret);
> + }
> + if (pollfd[1].revents) {
> + if (!(pollfd[1].revents & POLLIN))
> + err("pollfd[1].revents %d", pollfd[1].revents);
> + if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
> + err("read pipefd error");
> + break;
> + }
> + if (!(pollfd[0].revents & POLLIN))
> + err("pollfd[0].revents %d", pollfd[0].revents);
> + if (uffd_read_msg(uffd, &msg))
> + continue;
> + switch (msg.event) {
> + default:
> + err("unexpected msg event %u\n", msg.event);
> + break;
> + case UFFD_EVENT_PAGEFAULT:
> + uffd_handle_page_fault(&msg, stats);
> + break;
> + case UFFD_EVENT_FORK:
> + close(uffd);
> + uffd = msg.arg.fork.ufd;
> + pollfd[0].fd = uffd;
> + break;
> + case UFFD_EVENT_REMOVE:
> + uffd_reg.range.start = msg.arg.remove.start;
> + uffd_reg.range.len = msg.arg.remove.end -
> + msg.arg.remove.start;
> + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
> + err("remove failure");
> + break;
> + case UFFD_EVENT_REMAP:
> + area_remap = area_dst; /* save for later unmap */
> + area_dst = (char *)(unsigned long)msg.arg.remap.to;
> + break;
> + }
> + }
> +
> + return NULL;
> +}
> +
> +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
> + unsigned long offset)
> +{
> + uffd_test_ops->alias_mapping(&uffdio_copy->dst,
> + uffdio_copy->len,
> + offset);
> + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
> + /* real retval in ufdio_copy.copy */
> + if (uffdio_copy->copy != -EEXIST)
> + err("UFFDIO_COPY retry error: %"PRId64,
> + (int64_t)uffdio_copy->copy);
> + } else {
> + err("UFFDIO_COPY retry unexpected: %"PRId64,
> + (int64_t)uffdio_copy->copy);
> + }
> +}
> +
> +static void wake_range(int ufd, unsigned long addr, unsigned long len)
> +{
> +	struct uffdio_range uffdio_wake;
> +
> +	uffdio_wake.start = addr;
> +	uffdio_wake.len = len;
> +
> +	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
> +		err("error waking %lu",
> +		    addr);
> +}
> +
> +int __copy_page(int ufd, unsigned long offset, bool retry)
> +{
> + struct uffdio_copy uffdio_copy;
> +
> + if (offset >= nr_pages * page_size)
> + err("unexpected offset %lu\n", offset);
> + uffdio_copy.dst = (unsigned long) area_dst + offset;
> + uffdio_copy.src = (unsigned long) area_src + offset;
> + uffdio_copy.len = page_size;
> + if (test_uffdio_wp)
> + uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
> + else
> + uffdio_copy.mode = 0;
> + uffdio_copy.copy = 0;
> + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
> + /* real retval in ufdio_copy.copy */
> + if (uffdio_copy.copy != -EEXIST)
> + err("UFFDIO_COPY error: %"PRId64,
> + (int64_t)uffdio_copy.copy);
> + wake_range(ufd, uffdio_copy.dst, page_size);
> + } else if (uffdio_copy.copy != page_size) {
> + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
> + } else {
> + if (test_uffdio_copy_eexist && retry) {
> + test_uffdio_copy_eexist = false;
> + retry_copy_page(ufd, &uffdio_copy, offset);
> + }
> + return 1;
> + }
> + return 0;
> +}
> +
> +int copy_page(int ufd, unsigned long offset)
> +{
> + return __copy_page(ufd, offset, false);
> +}
> diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h
> new file mode 100644
> index 000000000000..d9430cfdcb19
> --- /dev/null
> +++ b/tools/testing/selftests/mm/uffd-common.h
> @@ -0,0 +1,117 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Userfaultfd tests common header
> + *
> + * Copyright (C) 2015-2023 Red Hat, Inc.
> + */
> +#ifndef __UFFD_COMMON_H__
> +#define __UFFD_COMMON_H__
> +
> +#define _GNU_SOURCE
> +#include <stdio.h>
> +#include <errno.h>
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <time.h>
> +#include <signal.h>
> +#include <poll.h>
> +#include <string.h>
> +#include <linux/mman.h>
> +#include <sys/mman.h>
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
> +#include <sys/wait.h>
> +#include <pthread.h>
> +#include <linux/userfaultfd.h>
> +#include <setjmp.h>
> +#include <stdbool.h>
> +#include <assert.h>
> +#include <inttypes.h>
> +#include <stdint.h>
> +#include <sys/random.h>
> +
> +#include "../kselftest.h"
> +#include "vm_util.h"
> +
> +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
> +
> +#define _err(fmt, ...) \
> + do { \
> + int ret = errno; \
> + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
> + fprintf(stderr, " (errno=%d, @%s:%d)\n", \
> + ret, __FILE__, __LINE__); \
> + } while (0)
> +
> +#define errexit(exitcode, fmt, ...) \
> + do { \
> + _err(fmt, ##__VA_ARGS__); \
> + exit(exitcode); \
> + } while (0)
> +
> +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
> +
> +/* pthread_mutex_t starts at page offset 0 */
> +#define area_mutex(___area, ___nr) \
> + ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
> +/*
> + * count is placed in the page after pthread_mutex_t naturally aligned
> + * to avoid non alignment faults on non-x86 archs.
> + */
> +#define area_count(___area, ___nr) \
> + ((volatile unsigned long long *) ((unsigned long) \
> + ((___area) + (___nr)*page_size + \
> + sizeof(pthread_mutex_t) + \
> + sizeof(unsigned long long) - 1) & \
> + ~(unsigned long)(sizeof(unsigned long long) \
> + - 1)))
> +
> +/* Userfaultfd test statistics */
> +struct uffd_stats {
> + int cpu;
> + unsigned long missing_faults;
> + unsigned long wp_faults;
> + unsigned long minor_faults;
> +};
> +
> +struct uffd_test_ops {
> + void (*allocate_area)(void **alloc_area, bool is_src);
> + void (*release_pages)(char *rel_area);
> + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
> + void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
> +};
> +typedef struct uffd_test_ops uffd_test_ops_t;
> +
> +extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
> +extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
> +extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type;
> +extern bool map_shared, test_collapse, test_dev_userfaultfd;
> +extern bool test_uffdio_wp, test_uffdio_minor;
> +extern unsigned long long *count_verify;
> +extern volatile bool test_uffdio_copy_eexist;
> +
> +extern uffd_test_ops_t anon_uffd_test_ops;
> +extern uffd_test_ops_t shmem_uffd_test_ops;
> +extern uffd_test_ops_t hugetlb_uffd_test_ops;
> +extern uffd_test_ops_t *uffd_test_ops;
> +
> +void uffd_stats_report(struct uffd_stats *stats, int n_cpus);
> +void uffd_test_ctx_init(uint64_t features);
> +void userfaultfd_open(uint64_t *features);
> +uint64_t get_expected_ioctls(uint64_t mode);
> +void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls);
> +int uffd_read_msg(int ufd, struct uffd_msg *msg);
> +void wp_range(int ufd, __u64 start, __u64 len, bool wp);
> +void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats);
> +int __copy_page(int ufd, unsigned long offset, bool retry);
> +int copy_page(int ufd, unsigned long offset);
> +void *uffd_poll_thread(void *arg);
> +
> +#define TEST_ANON 1
> +#define TEST_HUGETLB 2
> +#define TEST_SHMEM 3
> +
> +#endif
> diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
> index 3487ec0bfcc8..c68a9aeefc41 100644
> --- a/tools/testing/selftests/mm/userfaultfd.c
> +++ b/tools/testing/selftests/mm/userfaultfd.c
> @@ -34,96 +34,20 @@
> * transfer (UFFDIO_COPY).
> */
>
> -#define _GNU_SOURCE
> -#include <stdio.h>
> -#include <errno.h>
> -#include <unistd.h>
> -#include <stdlib.h>
> -#include <sys/types.h>
> -#include <sys/stat.h>
> -#include <fcntl.h>
> -#include <time.h>
> -#include <signal.h>
> -#include <poll.h>
> -#include <string.h>
> -#include <linux/mman.h>
> -#include <sys/mman.h>
> -#include <sys/syscall.h>
> -#include <sys/ioctl.h>
> -#include <sys/wait.h>
> -#include <pthread.h>
> -#include <linux/userfaultfd.h>
> -#include <setjmp.h>
> -#include <stdbool.h>
> -#include <assert.h>
> -#include <inttypes.h>
> -#include <stdint.h>
> -#include <sys/random.h>
> -
> -#include "../kselftest.h"
> -#include "vm_util.h"
> +#include "uffd-common.h"
>
> #ifdef __NR_userfaultfd
>
> -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
> -
> #define BOUNCE_RANDOM (1<<0)
> #define BOUNCE_RACINGFAULTS (1<<1)
> #define BOUNCE_VERIFY (1<<2)
> #define BOUNCE_POLL (1<<3)
> static int bounces;
>
> -#define TEST_ANON 1
> -#define TEST_HUGETLB 2
> -#define TEST_SHMEM 3
> -static int test_type;
> -
> -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
> -
> -#define BASE_PMD_ADDR ((void *)(1UL << 30))
> -
> -/* test using /dev/userfaultfd, instead of userfaultfd(2) */
> -static bool test_dev_userfaultfd;
> -
> /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
> #define ALARM_INTERVAL_SECS 10
> -static volatile bool test_uffdio_copy_eexist = true;
> -/* Whether to test uffd write-protection */
> -static bool test_uffdio_wp = true;
> -/* Whether to test uffd minor faults */
> -static bool test_uffdio_minor = false;
> -static bool map_shared;
> -static int mem_fd;
> -static unsigned long long *count_verify;
> -static int uffd = -1;
> -static int uffd_flags, finished, *pipefd;
> -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
> static char *zeropage;
> pthread_attr_t attr;
> -static bool test_collapse;
> -
> -/* Userfaultfd test statistics */
> -struct uffd_stats {
> - int cpu;
> - unsigned long missing_faults;
> - unsigned long wp_faults;
> - unsigned long minor_faults;
> -};
> -
> -/* pthread_mutex_t starts at page offset 0 */
> -#define area_mutex(___area, ___nr) \
> - ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
> -/*
> - * count is placed in the page after pthread_mutex_t naturally aligned
> - * to avoid non alignment faults on non-x86 archs.
> - */
> -#define area_count(___area, ___nr) \
> - ((volatile unsigned long long *) ((unsigned long) \
> - ((___area) + (___nr)*page_size + \
> - sizeof(pthread_mutex_t) + \
> - sizeof(unsigned long long) - 1) & \
> - ~(unsigned long)(sizeof(unsigned long long) \
> - - 1)))
>
> #define swap(a, b) \
> do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
> @@ -166,22 +90,6 @@ static void usage(void)
> exit(1);
> }
>
> -#define _err(fmt, ...) \
> - do { \
> - int ret = errno; \
> - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
> - fprintf(stderr, " (errno=%d, line=%d)\n", \
> - ret, __LINE__); \
> - } while (0)
> -
> -#define errexit(exitcode, fmt, ...) \
> - do { \
> - _err(fmt, ##__VA_ARGS__); \
> - exit(exitcode); \
> - } while (0)
> -
> -#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
> -
> static void uffd_stats_reset(struct uffd_stats *uffd_stats,
> unsigned long n_cpus)
> {
> @@ -195,189 +103,6 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats,
> }
> }
>
> -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
> -{
> - int i;
> - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
> -
> - for (i = 0; i < n_cpus; i++) {
> - miss_total += stats[i].missing_faults;
> - wp_total += stats[i].wp_faults;
> - minor_total += stats[i].minor_faults;
> - }
> -
> - printf("userfaults: ");
> - if (miss_total) {
> - printf("%llu missing (", miss_total);
> - for (i = 0; i < n_cpus; i++)
> - printf("%lu+", stats[i].missing_faults);
> - printf("\b) ");
> - }
> - if (wp_total) {
> - printf("%llu wp (", wp_total);
> - for (i = 0; i < n_cpus; i++)
> - printf("%lu+", stats[i].wp_faults);
> - printf("\b) ");
> - }
> - if (minor_total) {
> - printf("%llu minor (", minor_total);
> - for (i = 0; i < n_cpus; i++)
> - printf("%lu+", stats[i].minor_faults);
> - printf("\b)");
> - }
> - printf("\n");
> -}
> -
> -static void anon_release_pages(char *rel_area)
> -{
> - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
> - err("madvise(MADV_DONTNEED) failed");
> -}
> -
> -static void anon_allocate_area(void **alloc_area, bool is_src)
> -{
> - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
> - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> -}
> -
> -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> -{
> -}
> -
> -static void hugetlb_release_pages(char *rel_area)
> -{
> - if (!map_shared) {
> - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
> - err("madvise(MADV_DONTNEED) failed");
> - } else {
> - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
> - err("madvise(MADV_REMOVE) failed");
> - }
> -}
> -
> -static void hugetlb_allocate_area(void **alloc_area, bool is_src)
> -{
> - off_t size = nr_pages * page_size;
> - off_t offset = is_src ? 0 : size;
> - void *area_alias = NULL;
> - char **alloc_area_alias;
> -
> - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
> - (map_shared ? MAP_SHARED : MAP_PRIVATE) |
> - (is_src ? 0 : MAP_NORESERVE),
> - mem_fd, offset);
> - if (*alloc_area == MAP_FAILED)
> - err("mmap of hugetlbfs file failed");
> -
> - if (map_shared) {
> - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
> - MAP_SHARED, mem_fd, offset);
> - if (area_alias == MAP_FAILED)
> - err("mmap of hugetlb file alias failed");
> - }
> -
> - if (is_src) {
> - alloc_area_alias = &area_src_alias;
> - } else {
> - alloc_area_alias = &area_dst_alias;
> - }
> - if (area_alias)
> - *alloc_area_alias = area_alias;
> -}
> -
> -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> -{
> - if (!map_shared)
> - return;
> -
> - *start = (unsigned long) area_dst_alias + offset;
> -}
> -
> -static void shmem_release_pages(char *rel_area)
> -{
> - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
> - err("madvise(MADV_REMOVE) failed");
> -}
> -
> -static void shmem_allocate_area(void **alloc_area, bool is_src)
> -{
> - void *area_alias = NULL;
> - size_t bytes = nr_pages * page_size;
> - unsigned long offset = is_src ? 0 : bytes;
> - char *p = NULL, *p_alias = NULL;
> -
> - if (test_collapse) {
> - p = BASE_PMD_ADDR;
> - if (!is_src)
> - /* src map + alias + interleaved hpages */
> - p += 2 * (bytes + hpage_size);
> - p_alias = p;
> - p_alias += bytes;
> - p_alias += hpage_size; /* Prevent src/dst VMA merge */
> - }
> -
> - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
> - mem_fd, offset);
> - if (*alloc_area == MAP_FAILED)
> - err("mmap of memfd failed");
> - if (test_collapse && *alloc_area != p)
> - err("mmap of memfd failed at %p", p);
> -
> - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
> - mem_fd, offset);
> - if (area_alias == MAP_FAILED)
> - err("mmap of memfd alias failed");
> - if (test_collapse && area_alias != p_alias)
> - err("mmap of anonymous memory failed at %p", p_alias);
> -
> - if (is_src)
> - area_src_alias = area_alias;
> - else
> - area_dst_alias = area_alias;
> -}
> -
> -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
> -{
> - *start = (unsigned long)area_dst_alias + offset;
> -}
> -
> -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
> -{
> - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
> - err("Did not find expected %d number of hugepages",
> - expect_nr_hpages);
> -}
> -
> -struct uffd_test_ops {
> - void (*allocate_area)(void **alloc_area, bool is_src);
> - void (*release_pages)(char *rel_area);
> - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
> - void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
> -};
> -
> -static struct uffd_test_ops anon_uffd_test_ops = {
> - .allocate_area = anon_allocate_area,
> - .release_pages = anon_release_pages,
> - .alias_mapping = noop_alias_mapping,
> - .check_pmd_mapping = NULL,
> -};
> -
> -static struct uffd_test_ops shmem_uffd_test_ops = {
> - .allocate_area = shmem_allocate_area,
> - .release_pages = shmem_release_pages,
> - .alias_mapping = shmem_alias_mapping,
> - .check_pmd_mapping = shmem_check_pmd_mapping,
> -};
> -
> -static struct uffd_test_ops hugetlb_uffd_test_ops = {
> - .allocate_area = hugetlb_allocate_area,
> - .release_pages = hugetlb_release_pages,
> - .alias_mapping = hugetlb_alias_mapping,
> - .check_pmd_mapping = NULL,
> -};
> -
> -static struct uffd_test_ops *uffd_test_ops;
> -
> static inline uint64_t uffd_minor_feature(void)
> {
> if (test_type == TEST_HUGETLB && map_shared)
> @@ -388,171 +113,6 @@ static inline uint64_t uffd_minor_feature(void)
> return 0;
> }
>
> -static uint64_t get_expected_ioctls(uint64_t mode)
> -{
> - uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
> -
> - if (test_type == TEST_HUGETLB)
> - ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
> -
> - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
> - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
> -
> - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
> - ioctls &= ~(1 << _UFFDIO_CONTINUE);
> -
> - return ioctls;
> -}
> -
> -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
> -{
> - uint64_t expected = get_expected_ioctls(mode);
> - uint64_t actual = ioctls & expected;
> -
> - if (actual != expected) {
> - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
> - expected, actual);
> - }
> -}
> -
> -static int __userfaultfd_open_dev(void)
> -{
> - int fd, _uffd;
> -
> - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
> - if (fd < 0)
> - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
> -
> - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
> - if (_uffd < 0)
> - errexit(errno == ENOTTY ? KSFT_SKIP : 1,
> - "creating userfaultfd failed");
> - close(fd);
> - return _uffd;
> -}
> -
> -static void userfaultfd_open(uint64_t *features)
> -{
> - struct uffdio_api uffdio_api;
> -
> - if (test_dev_userfaultfd)
> - uffd = __userfaultfd_open_dev();
> - else {
> - uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
> - if (uffd < 0)
> - errexit(errno == ENOSYS ? KSFT_SKIP : 1,
> - "creating userfaultfd failed");
> - }
> - uffd_flags = fcntl(uffd, F_GETFD, NULL);
> -
> - uffdio_api.api = UFFD_API;
> - uffdio_api.features = *features;
> - if (ioctl(uffd, UFFDIO_API, &uffdio_api))
> - err("UFFDIO_API failed.\nPlease make sure to "
> - "run with either root or ptrace capability.");
> - if (uffdio_api.api != UFFD_API)
> - err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
> -
> - *features = uffdio_api.features;
> -}
> -
> -static inline void munmap_area(void **area)
> -{
> - if (*area)
> - if (munmap(*area, nr_pages * page_size))
> - err("munmap");
> -
> - *area = NULL;
> -}
> -
> -static void uffd_test_ctx_clear(void)
> -{
> - size_t i;
> -
> - if (pipefd) {
> - for (i = 0; i < nr_cpus * 2; ++i) {
> - if (close(pipefd[i]))
> - err("close pipefd");
> - }
> - free(pipefd);
> - pipefd = NULL;
> - }
> -
> - if (count_verify) {
> - free(count_verify);
> - count_verify = NULL;
> - }
> -
> - if (uffd != -1) {
> - if (close(uffd))
> - err("close uffd");
> - uffd = -1;
> - }
> -
> - munmap_area((void **)&area_src);
> - munmap_area((void **)&area_src_alias);
> - munmap_area((void **)&area_dst);
> - munmap_area((void **)&area_dst_alias);
> - munmap_area((void **)&area_remap);
> -}
> -
> -static void uffd_test_ctx_init(uint64_t features)
> -{
> - unsigned long nr, cpu;
> -
> - uffd_test_ctx_clear();
> -
> - uffd_test_ops->allocate_area((void **)&area_src, true);
> - uffd_test_ops->allocate_area((void **)&area_dst, false);
> -
> - userfaultfd_open(&features);
> -
> - count_verify = malloc(nr_pages * sizeof(unsigned long long));
> - if (!count_verify)
> - err("count_verify");
> -
> - for (nr = 0; nr < nr_pages; nr++) {
> - *area_mutex(area_src, nr) =
> - (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
> - count_verify[nr] = *area_count(area_src, nr) = 1;
> - /*
> - * In the transition between 255 to 256, powerpc will
> - * read out of order in my_bcmp and see both bytes as
> - * zero, so leave a placeholder below always non-zero
> - * after the count, to avoid my_bcmp to trigger false
> - * positives.
> - */
> - *(area_count(area_src, nr) + 1) = 1;
> - }
> -
> - /*
> - * After initialization of area_src, we must explicitly release pages
> - * for area_dst to make sure it's fully empty. Otherwise we could have
> - * some area_dst pages be errornously initialized with zero pages,
> - * hence we could hit memory corruption later in the test.
> - *
> - * One example is when THP is globally enabled, above allocate_area()
> - * calls could have the two areas merged into a single VMA (as they
> - * will have the same VMA flags so they're mergeable). When we
> - * initialize the area_src above, it's possible that some part of
> - * area_dst could have been faulted in via one huge THP that will be
> - * shared between area_src and area_dst. It could cause some of the
> - * area_dst won't be trapped by missing userfaults.
> - *
> - * This release_pages() will guarantee even if that happened, we'll
> - * proactively split the thp and drop any accidentally initialized
> - * pages within area_dst.
> - */
> - uffd_test_ops->release_pages(area_dst);
> -
> - pipefd = malloc(sizeof(int) * nr_cpus * 2);
> - if (!pipefd)
> - err("pipefd");
> - for (cpu = 0; cpu < nr_cpus; cpu++)
> - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
> - err("pipe");
> -}
> -
> static int my_bcmp(char *str1, char *str2, size_t n)
> {
> unsigned long i;
> @@ -562,47 +122,6 @@ static int my_bcmp(char *str1, char *str2, size_t n)
> return 0;
> }
>
> -static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
> -{
> - struct uffdio_writeprotect prms;
> -
> - /* Write protection page faults */
> - prms.range.start = start;
> - prms.range.len = len;
> - /* Undo write-protect, do wakeup after that */
> - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
> -
> - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
> - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
> -}
> -
> -static void continue_range(int ufd, __u64 start, __u64 len)
> -{
> - struct uffdio_continue req;
> - int ret;
> -
> - req.range.start = start;
> - req.range.len = len;
> - req.mode = 0;
> - if (test_uffdio_wp)
> - req.mode |= UFFDIO_CONTINUE_MODE_WP;
> -
> - if (ioctl(ufd, UFFDIO_CONTINUE, &req))
> - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
> - (uint64_t)start);
> -
> - /*
> - * Error handling within the kernel for continue is subtly different
> - * from copy or zeropage, so it may be a source of bugs. Trigger an
> - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
> - */
> - req.mapped = 0;
> - ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
> - if (ret >= 0 || req.mapped != -EEXIST)
> - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
> - ret, (int64_t) req.mapped);
> -}
> -
> static void *locking_thread(void *arg)
> {
> unsigned long cpu = (unsigned long) arg;
> @@ -635,222 +154,11 @@ static void *locking_thread(void *arg)
> return NULL;
> }
>
> -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
> - unsigned long offset)
> -{
> - uffd_test_ops->alias_mapping(&uffdio_copy->dst,
> - uffdio_copy->len,
> - offset);
> - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
> - /* real retval in ufdio_copy.copy */
> - if (uffdio_copy->copy != -EEXIST)
> - err("UFFDIO_COPY retry error: %"PRId64,
> - (int64_t)uffdio_copy->copy);
> - } else {
> - err("UFFDIO_COPY retry unexpected: %"PRId64,
> - (int64_t)uffdio_copy->copy);
> - }
> -}
> -
> -static void wake_range(int ufd, unsigned long addr, unsigned long len)
> -{
> - struct uffdio_range uffdio_wake;
> -
> - uffdio_wake.start = addr;
> - uffdio_wake.len = len;
> -
> - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
> - fprintf(stderr, "error waking %lu\n",
> - addr), exit(1);
> -}
> -
> -static int __copy_page(int ufd, unsigned long offset, bool retry)
> -{
> - struct uffdio_copy uffdio_copy;
> -
> - if (offset >= nr_pages * page_size)
> - err("unexpected offset %lu\n", offset);
> - uffdio_copy.dst = (unsigned long) area_dst + offset;
> - uffdio_copy.src = (unsigned long) area_src + offset;
> - uffdio_copy.len = page_size;
> - if (test_uffdio_wp)
> - uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
> - else
> - uffdio_copy.mode = 0;
> - uffdio_copy.copy = 0;
> - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
> - /* real retval in ufdio_copy.copy */
> - if (uffdio_copy.copy != -EEXIST)
> - err("UFFDIO_COPY error: %"PRId64,
> - (int64_t)uffdio_copy.copy);
> - wake_range(ufd, uffdio_copy.dst, page_size);
> - } else if (uffdio_copy.copy != page_size) {
> - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
> - } else {
> - if (test_uffdio_copy_eexist && retry) {
> - test_uffdio_copy_eexist = false;
> - retry_copy_page(ufd, &uffdio_copy, offset);
> - }
> - return 1;
> - }
> - return 0;
> -}
> -
> static int copy_page_retry(int ufd, unsigned long offset)
> {
> return __copy_page(ufd, offset, true);
> }
>
> -static int copy_page(int ufd, unsigned long offset)
> -{
> - return __copy_page(ufd, offset, false);
> -}
> -
> -static int uffd_read_msg(int ufd, struct uffd_msg *msg)
> -{
> - int ret = read(uffd, msg, sizeof(*msg));
> -
> - if (ret != sizeof(*msg)) {
> - if (ret < 0) {
> - if (errno == EAGAIN || errno == EINTR)
> - return 1;
> - err("blocking read error");
> - } else {
> - err("short read");
> - }
> - }
> -
> - return 0;
> -}
> -
> -static void uffd_handle_page_fault(struct uffd_msg *msg,
> - struct uffd_stats *stats)
> -{
> - unsigned long offset;
> -
> - if (msg->event != UFFD_EVENT_PAGEFAULT)
> - err("unexpected msg event %u", msg->event);
> -
> - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
> - /* Write protect page faults */
> - wp_range(uffd, msg->arg.pagefault.address, page_size, false);
> - stats->wp_faults++;
> - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
> - uint8_t *area;
> - int b;
> -
> - /*
> - * Minor page faults
> - *
> - * To prove we can modify the original range for testing
> - * purposes, we're going to bit flip this range before
> - * continuing.
> - *
> - * Note that this requires all minor page fault tests operate on
> - * area_dst (non-UFFD-registered) and area_dst_alias
> - * (UFFD-registered).
> - */
> -
> - area = (uint8_t *)(area_dst +
> - ((char *)msg->arg.pagefault.address -
> - area_dst_alias));
> - for (b = 0; b < page_size; ++b)
> - area[b] = ~area[b];
> - continue_range(uffd, msg->arg.pagefault.address, page_size);
> - stats->minor_faults++;
> - } else {
> - /*
> - * Missing page faults.
> - *
> - * Here we force a write check for each of the missing mode
> - * faults. It's guaranteed because the only threads that
> - * will trigger uffd faults are the locking threads, and
> - * their first instruction to touch the missing page will
> - * always be pthread_mutex_lock().
> - *
> - * Note that here we relied on an NPTL glibc impl detail to
> - * always read the lock type at the entry of the lock op
> - * (pthread_mutex_t.__data.__type, offset 0x10) before
> - * doing any locking operations to guarantee that. It's
> - * actually not good to rely on this impl detail because
> - * logically a pthread-compatible lib can implement the
> - * locks without types and we can fail when linking with
> - * them. However since we used to find bugs with this
> - * strict check we still keep it around. Hopefully this
> - * could be a good hint when it fails again. If one day
> - * it'll break on some other impl of glibc we'll revisit.
> - */
> - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
> - err("unexpected write fault");
> -
> - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
> - offset &= ~(page_size-1);
> -
> - if (copy_page(uffd, offset))
> - stats->missing_faults++;
> - }
> -}
> -
> -static void *uffd_poll_thread(void *arg)
> -{
> - struct uffd_stats *stats = (struct uffd_stats *)arg;
> - unsigned long cpu = stats->cpu;
> - struct pollfd pollfd[2];
> - struct uffd_msg msg;
> - struct uffdio_register uffd_reg;
> - int ret;
> - char tmp_chr;
> -
> - pollfd[0].fd = uffd;
> - pollfd[0].events = POLLIN;
> - pollfd[1].fd = pipefd[cpu*2];
> - pollfd[1].events = POLLIN;
> -
> - for (;;) {
> - ret = poll(pollfd, 2, -1);
> - if (ret <= 0) {
> - if (errno == EINTR || errno == EAGAIN)
> - continue;
> - err("poll error: %d", ret);
> - }
> - if (pollfd[1].revents & POLLIN) {
> - if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
> - err("read pipefd error");
> - break;
> - }
> - if (!(pollfd[0].revents & POLLIN))
> - err("pollfd[0].revents %d", pollfd[0].revents);
> - if (uffd_read_msg(uffd, &msg))
> - continue;
> - switch (msg.event) {
> - default:
> - err("unexpected msg event %u\n", msg.event);
> - break;
> - case UFFD_EVENT_PAGEFAULT:
> - uffd_handle_page_fault(&msg, stats);
> - break;
> - case UFFD_EVENT_FORK:
> - close(uffd);
> - uffd = msg.arg.fork.ufd;
> - pollfd[0].fd = uffd;
> - break;
> - case UFFD_EVENT_REMOVE:
> - uffd_reg.range.start = msg.arg.remove.start;
> - uffd_reg.range.len = msg.arg.remove.end -
> - msg.arg.remove.start;
> - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
> - err("remove failure");
> - break;
> - case UFFD_EVENT_REMAP:
> - area_remap = area_dst; /* save for later unmap */
> - area_dst = (char *)(unsigned long)msg.arg.remap.to;
> - break;
> - }
> - }
> -
> - return NULL;
> -}
> -
> pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
>
> static void *uffd_read_thread(void *arg)
> --
> 2.39.1
>
@@ -108,6 +108,8 @@ include ../lib.mk
$(TEST_GEN_PROGS): vm_util.c
+$(OUTPUT)/userfaultfd: uffd-common.c
+
ifeq ($(MACHINE),x86_64)
BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
new file mode 100644
@@ -0,0 +1,611 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Userfaultfd tests util functions
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+
+#include "uffd-common.h"
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+volatile bool test_uffdio_copy_eexist = true;
+unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
+char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+int mem_fd, uffd = -1, uffd_flags, finished, *pipefd, test_type;
+bool map_shared, test_collapse, test_dev_userfaultfd;
+bool test_uffdio_wp = true, test_uffdio_minor = false;
+unsigned long long *count_verify;
+uffd_test_ops_t *uffd_test_ops;
+
+static void anon_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+}
+
+static void anon_allocate_area(void **alloc_area, bool is_src)
+{
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+}
+
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+static void hugetlb_release_pages(char *rel_area)
+{
+ if (!map_shared) {
+ if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
+ err("madvise(MADV_DONTNEED) failed");
+ } else {
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+ }
+}
+
+static void hugetlb_allocate_area(void **alloc_area, bool is_src)
+{
+ off_t size = nr_pages * page_size;
+ off_t offset = is_src ? 0 : size;
+ void *area_alias = NULL;
+ char **alloc_area_alias;
+
+ *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ (is_src ? 0 : MAP_NORESERVE),
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of hugetlbfs file failed");
+
+ if (map_shared) {
+ area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, mem_fd, offset);
+ if (area_alias == MAP_FAILED)
+ err("mmap of hugetlb file alias failed");
+ }
+
+ if (is_src) {
+ alloc_area_alias = &area_src_alias;
+ } else {
+ alloc_area_alias = &area_dst_alias;
+ }
+ if (area_alias)
+ *alloc_area_alias = area_alias;
+}
+
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ if (!map_shared)
+ return;
+
+ *start = (unsigned long) area_dst_alias + offset;
+}
+
+static void shmem_release_pages(char *rel_area)
+{
+ if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
+ err("madvise(MADV_REMOVE) failed");
+}
+
+static void shmem_allocate_area(void **alloc_area, bool is_src)
+{
+ void *area_alias = NULL;
+ size_t bytes = nr_pages * page_size;
+ unsigned long offset = is_src ? 0 : bytes;
+ char *p = NULL, *p_alias = NULL;
+
+ if (test_collapse) {
+ p = BASE_PMD_ADDR;
+ if (!is_src)
+ /* src map + alias + interleaved hpages */
+ p += 2 * (bytes + hpage_size);
+ p_alias = p;
+ p_alias += bytes;
+ p_alias += hpage_size; /* Prevent src/dst VMA merge */
+ }
+
+ *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of memfd failed");
+ if (test_collapse && *alloc_area != p)
+ err("mmap of memfd failed at %p", p);
+
+ area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_fd, offset);
+ if (area_alias == MAP_FAILED)
+ err("mmap of memfd alias failed");
+ if (test_collapse && area_alias != p_alias)
+ err("mmap of anonymous memory failed at %p", p_alias);
+
+ if (is_src)
+ area_src_alias = area_alias;
+ else
+ area_dst_alias = area_alias;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ *start = (unsigned long)area_dst_alias + offset;
+}
+
+static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
+{
+ if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
+ err("Did not find expected %d number of hugepages",
+ expect_nr_hpages);
+}
+
+struct uffd_test_ops anon_uffd_test_ops = {
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+ .alias_mapping = noop_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+struct uffd_test_ops shmem_uffd_test_ops = {
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+ .alias_mapping = shmem_alias_mapping,
+ .check_pmd_mapping = shmem_check_pmd_mapping,
+};
+
+struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+ .alias_mapping = hugetlb_alias_mapping,
+ .check_pmd_mapping = NULL,
+};
+
+void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
+{
+ int i;
+ unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
+
+ for (i = 0; i < n_cpus; i++) {
+ miss_total += stats[i].missing_faults;
+ wp_total += stats[i].wp_faults;
+ minor_total += stats[i].minor_faults;
+ }
+
+ printf("userfaults: ");
+ if (miss_total) {
+ printf("%llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].missing_faults);
+ printf("\b) ");
+ }
+ if (wp_total) {
+ printf("%llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].wp_faults);
+ printf("\b) ");
+ }
+ if (minor_total) {
+ printf("%llu minor (", minor_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].minor_faults);
+ printf("\b)");
+ }
+ printf("\n");
+}
+
+static int __userfaultfd_open_dev(void)
+{
+ int fd, _uffd;
+
+ fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+ errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
+
+ _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
+ if (_uffd < 0)
+ errexit(errno == ENOTTY ? KSFT_SKIP : 1,
+ "creating userfaultfd failed");
+ close(fd);
+ return _uffd;
+}
+
+void userfaultfd_open(uint64_t *features)
+{
+ struct uffdio_api uffdio_api;
+
+ if (test_dev_userfaultfd)
+ uffd = __userfaultfd_open_dev();
+ else {
+ uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
+ if (uffd < 0)
+ errexit(errno == ENOSYS ? KSFT_SKIP : 1,
+ "creating userfaultfd failed");
+ }
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = *features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ err("UFFDIO_API failed.\nPlease make sure to "
+ "run with either root or ptrace capability.");
+ if (uffdio_api.api != UFFD_API)
+ err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
+
+ *features = uffdio_api.features;
+}
+
+static inline void munmap_area(void **area)
+{
+ if (*area)
+ if (munmap(*area, nr_pages * page_size))
+ err("munmap");
+
+ *area = NULL;
+}
+
+static void uffd_test_ctx_clear(void)
+{
+ size_t i;
+
+ if (pipefd) {
+ for (i = 0; i < nr_cpus * 2; ++i) {
+ if (close(pipefd[i]))
+ err("close pipefd");
+ }
+ free(pipefd);
+ pipefd = NULL;
+ }
+
+ if (count_verify) {
+ free(count_verify);
+ count_verify = NULL;
+ }
+
+ if (uffd != -1) {
+ if (close(uffd))
+ err("close uffd");
+ uffd = -1;
+ }
+
+ munmap_area((void **)&area_src);
+ munmap_area((void **)&area_src_alias);
+ munmap_area((void **)&area_dst);
+ munmap_area((void **)&area_dst_alias);
+ munmap_area((void **)&area_remap);
+}
+
+void uffd_test_ctx_init(uint64_t features)
+{
+ unsigned long nr, cpu;
+
+ uffd_test_ctx_clear();
+
+ uffd_test_ops->allocate_area((void **)&area_src, true);
+ uffd_test_ops->allocate_area((void **)&area_dst, false);
+
+ userfaultfd_open(&features);
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify)
+ err("count_verify");
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) =
+ (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ /*
+ * After initialization of area_src, we must explicitly release pages
+ * for area_dst to make sure it's fully empty. Otherwise we could have
+ * some area_dst pages be erroneously initialized with zero pages,
+ * hence we could hit memory corruption later in the test.
+ *
+ * One example is when THP is globally enabled, above allocate_area()
+ * calls could have the two areas merged into a single VMA (as they
+ * will have the same VMA flags so they're mergeable). When we
+ * initialize the area_src above, it's possible that some part of
+ * area_dst could have been faulted in via one huge THP that will be
+ * shared between area_src and area_dst. It could cause some of the
+ * area_dst won't be trapped by missing userfaults.
+ *
+ * This release_pages() will guarantee even if that happened, we'll
+ * proactively split the thp and drop any accidentally initialized
+ * pages within area_dst.
+ */
+ uffd_test_ops->release_pages(area_dst);
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd)
+ err("pipefd");
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
+ err("pipe");
+}
+
+uint64_t get_expected_ioctls(uint64_t mode)
+{
+ uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
+
+ if (test_type == TEST_HUGETLB)
+ ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
+
+ if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
+ ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
+
+ if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
+ ioctls &= ~(1 << _UFFDIO_CONTINUE);
+
+ return ioctls;
+}
+
+void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
+{
+ uint64_t expected = get_expected_ioctls(mode);
+ uint64_t actual = ioctls & expected;
+
+ if (actual != expected) {
+ err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
+ expected, actual);
+ }
+}
+
+void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+ struct uffdio_writeprotect prms;
+
+ /* Write protection page faults */
+ prms.range.start = start;
+ prms.range.len = len;
+ /* Undo write-protect, do wakeup after that */
+ prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+ if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
+ err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
+}
+
+static void continue_range(int ufd, __u64 start, __u64 len)
+{
+ struct uffdio_continue req;
+ int ret;
+
+ req.range.start = start;
+ req.range.len = len;
+ req.mode = 0;
+ if (test_uffdio_wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;
+
+ if (ioctl(ufd, UFFDIO_CONTINUE, &req))
+ err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
+ (uint64_t)start);
+
+ /*
+ * Error handling within the kernel for continue is subtly different
+ * from copy or zeropage, so it may be a source of bugs. Trigger an
+ * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
+ */
+ req.mapped = 0;
+ ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
+ if (ret >= 0 || req.mapped != -EEXIST)
+ err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
+ ret, (int64_t) req.mapped);
+}
+
+int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+ int ret = read(uffd, msg, sizeof(*msg));
+
+ if (ret != sizeof(*msg)) {
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ return 1;
+ err("blocking read error");
+ } else {
+ err("short read");
+ }
+ }
+
+ return 0;
+}
+
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats)
+{
+ unsigned long offset;
+
+ if (msg->event != UFFD_EVENT_PAGEFAULT)
+ err("unexpected msg event %u", msg->event);
+
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+ /* Write protect page faults */
+ wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+ stats->wp_faults++;
+ } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
+ uint8_t *area;
+ int b;
+
+ /*
+ * Minor page faults
+ *
+ * To prove we can modify the original range for testing
+ * purposes, we're going to bit flip this range before
+ * continuing.
+ *
+ * Note that this requires all minor page fault tests operate on
+ * area_dst (non-UFFD-registered) and area_dst_alias
+ * (UFFD-registered).
+ */
+
+ area = (uint8_t *)(area_dst +
+ ((char *)msg->arg.pagefault.address -
+ area_dst_alias));
+ for (b = 0; b < page_size; ++b)
+ area[b] = ~area[b];
+ continue_range(uffd, msg->arg.pagefault.address, page_size);
+ stats->minor_faults++;
+ } else {
+ /*
+ * Missing page faults.
+ *
+ * Here we force a write check for each of the missing mode
+ * faults. It's guaranteed because the only threads that
+ * will trigger uffd faults are the locking threads, and
+ * their first instruction to touch the missing page will
+ * always be pthread_mutex_lock().
+ *
+ * Note that here we relied on an NPTL glibc impl detail to
+ * always read the lock type at the entry of the lock op
+ * (pthread_mutex_t.__data.__type, offset 0x10) before
+ * doing any locking operations to guarantee that. It's
+ * actually not good to rely on this impl detail because
+ * logically a pthread-compatible lib can implement the
+ * locks without types and we can fail when linking with
+ * them. However since we used to find bugs with this
+ * strict check we still keep it around. Hopefully this
+ * could be a good hint when it fails again. If one day
+ * it'll break on some other impl of glibc we'll revisit.
+ */
+ if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
+ err("unexpected write fault");
+
+ offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+ offset &= ~(page_size-1);
+
+ if (copy_page(uffd, offset))
+ stats->missing_faults++;
+ }
+}
+
+void *uffd_poll_thread(void *arg)
+{
+ struct uffd_stats *stats = (struct uffd_stats *)arg;
+ unsigned long cpu = stats->cpu;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ struct uffdio_register uffd_reg;
+ int ret;
+ char tmp_chr;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ for (;;) {
+ ret = poll(pollfd, 2, -1);
+ if (ret <= 0) {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ err("poll error: %d", ret);
+ }
+ if (pollfd[1].revents) {
+ if (!(pollfd[1].revents & POLLIN))
+ err("pollfd[1].revents %d", pollfd[1].revents);
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
+ err("read pipefd error");
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN))
+ err("pollfd[0].revents %d", pollfd[0].revents);
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ switch (msg.event) {
+ default:
+ err("unexpected msg event %u\n", msg.event);
+ break;
+ case UFFD_EVENT_PAGEFAULT:
+ uffd_handle_page_fault(&msg, stats);
+ break;
+ case UFFD_EVENT_FORK:
+ close(uffd);
+ uffd = msg.arg.fork.ufd;
+ pollfd[0].fd = uffd;
+ break;
+ case UFFD_EVENT_REMOVE:
+ uffd_reg.range.start = msg.arg.remove.start;
+ uffd_reg.range.len = msg.arg.remove.end -
+ msg.arg.remove.start;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
+ err("remove failure");
+ break;
+ case UFFD_EVENT_REMAP:
+ area_remap = area_dst; /* save for later unmap */
+ area_dst = (char *)(unsigned long)msg.arg.remap.to;
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+ unsigned long offset)
+{
+ uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+ uffdio_copy->len,
+ offset);
+ if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy->copy != -EEXIST)
+ err("UFFDIO_COPY retry error: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ } else {
+ err("UFFDIO_COPY retry unexpected: %"PRId64,
+ (int64_t)uffdio_copy->copy);
+ }
+}
+
+static void wake_range(int ufd, unsigned long addr, unsigned long len)
+{
+ struct uffdio_range uffdio_wake;
+
+ uffdio_wake.start = addr;
+ uffdio_wake.len = len;
+
+ if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
+ fprintf(stderr, "error waking %lu\n",
+ addr), exit(1);
+}
+
+int __copy_page(int ufd, unsigned long offset, bool retry)
+{
+ struct uffdio_copy uffdio_copy;
+
+ if (offset >= nr_pages * page_size)
+ err("unexpected offset %lu\n", offset);
+ uffdio_copy.dst = (unsigned long) area_dst + offset;
+ uffdio_copy.src = (unsigned long) area_src + offset;
+ uffdio_copy.len = page_size;
+ if (test_uffdio_wp)
+ uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+ else
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+ /* real retval in ufdio_copy.copy */
+ if (uffdio_copy.copy != -EEXIST)
+ err("UFFDIO_COPY error: %"PRId64,
+ (int64_t)uffdio_copy.copy);
+ wake_range(ufd, uffdio_copy.dst, page_size);
+ } else if (uffdio_copy.copy != page_size) {
+ err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
+ } else {
+ if (test_uffdio_copy_eexist && retry) {
+ test_uffdio_copy_eexist = false;
+ retry_copy_page(ufd, &uffdio_copy, offset);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int copy_page(int ufd, unsigned long offset)
+{
+ return __copy_page(ufd, offset, false);
+}
new file mode 100644
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Userfaultfd tests common header
+ *
+ * Copyright (C) 2015-2023 Red Hat, Inc.
+ */
+#ifndef __UFFD_COMMON_H__
+#define __UFFD_COMMON_H__
+
+#define _GNU_SOURCE /* NOTE(review): unconditional define in a header; redefines if an includer set it first — consider an #ifndef guard */
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <sys/random.h>
+
+#include "../kselftest.h"
+#include "vm_util.h"
+
+#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
+
+#define _err(fmt, ...) /* print "ERROR: ..." with errno and location; does not exit */ \
+ do { \
+ int ret = errno; \
+ fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
+ fprintf(stderr, " (errno=%d, @%s:%d)\n", \
+ ret, __FILE__, __LINE__); \
+ } while (0)
+
+#define errexit(exitcode, fmt, ...) /* _err() then exit(exitcode) */ \
+ do { \
+ _err(fmt, ##__VA_ARGS__); \
+ exit(exitcode); \
+ } while (0)
+
+#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+/* Userfaultfd test statistics */
+struct uffd_stats {
+ int cpu; /* worker slot; also indexes pipefd[] in uffd_poll_thread() */
+ unsigned long missing_faults;
+ unsigned long wp_faults;
+ unsigned long minor_faults;
+};
+
+struct uffd_test_ops { /* per-memory-type (anon/shmem/hugetlb) test hooks */
+ void (*allocate_area)(void **alloc_area, bool is_src);
+ void (*release_pages)(char *rel_area);
+ void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); /* no-op for anon */
+ void (*check_pmd_mapping)(void *p, int expect_nr_hpages); /* NULL except shmem */
+};
+typedef struct uffd_test_ops uffd_test_ops_t;
+
+extern unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
+extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
+extern int mem_fd, uffd, uffd_flags, finished, *pipefd, test_type;
+extern bool map_shared, test_collapse, test_dev_userfaultfd;
+extern bool test_uffdio_wp, test_uffdio_minor;
+extern unsigned long long *count_verify;
+extern volatile bool test_uffdio_copy_eexist; /* cleared once per -EEXIST retry in __copy_page() */
+
+extern uffd_test_ops_t anon_uffd_test_ops;
+extern uffd_test_ops_t shmem_uffd_test_ops;
+extern uffd_test_ops_t hugetlb_uffd_test_ops;
+extern uffd_test_ops_t *uffd_test_ops; /* selected at runtime from test_type */
+
+void uffd_stats_report(struct uffd_stats *stats, int n_cpus);
+void uffd_test_ctx_init(uint64_t features);
+void userfaultfd_open(uint64_t *features);
+uint64_t get_expected_ioctls(uint64_t mode);
+void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls);
+int uffd_read_msg(int ufd, struct uffd_msg *msg);
+void wp_range(int ufd, __u64 start, __u64 len, bool wp);
+void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_stats *stats);
+int __copy_page(int ufd, unsigned long offset, bool retry);
+int copy_page(int ufd, unsigned long offset);
+void *uffd_poll_thread(void *arg);
+
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+
+#endif /* __UFFD_COMMON_H__ */
@@ -34,96 +34,20 @@
* transfer (UFFDIO_COPY).
*/
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <time.h>
-#include <signal.h>
-#include <poll.h>
-#include <string.h>
-#include <linux/mman.h>
-#include <sys/mman.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/wait.h>
-#include <pthread.h>
-#include <linux/userfaultfd.h>
-#include <setjmp.h>
-#include <stdbool.h>
-#include <assert.h>
-#include <inttypes.h>
-#include <stdint.h>
-#include <sys/random.h>
-
-#include "../kselftest.h"
-#include "vm_util.h"
+#include "uffd-common.h"
#ifdef __NR_userfaultfd
-static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size;
-
#define BOUNCE_RANDOM (1<<0)
#define BOUNCE_RACINGFAULTS (1<<1)
#define BOUNCE_VERIFY (1<<2)
#define BOUNCE_POLL (1<<3)
static int bounces;
-#define TEST_ANON 1
-#define TEST_HUGETLB 2
-#define TEST_SHMEM 3
-static int test_type;
-
-#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
-
-#define BASE_PMD_ADDR ((void *)(1UL << 30))
-
-/* test using /dev/userfaultfd, instead of userfaultfd(2) */
-static bool test_dev_userfaultfd;
-
/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
-static volatile bool test_uffdio_copy_eexist = true;
-/* Whether to test uffd write-protection */
-static bool test_uffdio_wp = true;
-/* Whether to test uffd minor faults */
-static bool test_uffdio_minor = false;
-static bool map_shared;
-static int mem_fd;
-static unsigned long long *count_verify;
-static int uffd = -1;
-static int uffd_flags, finished, *pipefd;
-static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
static char *zeropage;
pthread_attr_t attr;
-static bool test_collapse;
-
-/* Userfaultfd test statistics */
-struct uffd_stats {
- int cpu;
- unsigned long missing_faults;
- unsigned long wp_faults;
- unsigned long minor_faults;
-};
-
-/* pthread_mutex_t starts at page offset 0 */
-#define area_mutex(___area, ___nr) \
- ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
-/*
- * count is placed in the page after pthread_mutex_t naturally aligned
- * to avoid non alignment faults on non-x86 archs.
- */
-#define area_count(___area, ___nr) \
- ((volatile unsigned long long *) ((unsigned long) \
- ((___area) + (___nr)*page_size + \
- sizeof(pthread_mutex_t) + \
- sizeof(unsigned long long) - 1) & \
- ~(unsigned long)(sizeof(unsigned long long) \
- - 1)))
#define swap(a, b) \
do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
@@ -166,22 +90,6 @@ static void usage(void)
exit(1);
}
-#define _err(fmt, ...) \
- do { \
- int ret = errno; \
- fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \
- fprintf(stderr, " (errno=%d, line=%d)\n", \
- ret, __LINE__); \
- } while (0)
-
-#define errexit(exitcode, fmt, ...) \
- do { \
- _err(fmt, ##__VA_ARGS__); \
- exit(exitcode); \
- } while (0)
-
-#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
-
static void uffd_stats_reset(struct uffd_stats *uffd_stats,
unsigned long n_cpus)
{
@@ -195,189 +103,6 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats,
}
}
-static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
-{
- int i;
- unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
-
- for (i = 0; i < n_cpus; i++) {
- miss_total += stats[i].missing_faults;
- wp_total += stats[i].wp_faults;
- minor_total += stats[i].minor_faults;
- }
-
- printf("userfaults: ");
- if (miss_total) {
- printf("%llu missing (", miss_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].missing_faults);
- printf("\b) ");
- }
- if (wp_total) {
- printf("%llu wp (", wp_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].wp_faults);
- printf("\b) ");
- }
- if (minor_total) {
- printf("%llu minor (", minor_total);
- for (i = 0; i < n_cpus; i++)
- printf("%lu+", stats[i].minor_faults);
- printf("\b)");
- }
- printf("\n");
-}
-
-static void anon_release_pages(char *rel_area)
-{
- if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
- err("madvise(MADV_DONTNEED) failed");
-}
-
-static void anon_allocate_area(void **alloc_area, bool is_src)
-{
- *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-}
-
-static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
-}
-
-static void hugetlb_release_pages(char *rel_area)
-{
- if (!map_shared) {
- if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
- err("madvise(MADV_DONTNEED) failed");
- } else {
- if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
- err("madvise(MADV_REMOVE) failed");
- }
-}
-
-static void hugetlb_allocate_area(void **alloc_area, bool is_src)
-{
- off_t size = nr_pages * page_size;
- off_t offset = is_src ? 0 : size;
- void *area_alias = NULL;
- char **alloc_area_alias;
-
- *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
- (map_shared ? MAP_SHARED : MAP_PRIVATE) |
- (is_src ? 0 : MAP_NORESERVE),
- mem_fd, offset);
- if (*alloc_area == MAP_FAILED)
- err("mmap of hugetlbfs file failed");
-
- if (map_shared) {
- area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED, mem_fd, offset);
- if (area_alias == MAP_FAILED)
- err("mmap of hugetlb file alias failed");
- }
-
- if (is_src) {
- alloc_area_alias = &area_src_alias;
- } else {
- alloc_area_alias = &area_dst_alias;
- }
- if (area_alias)
- *alloc_area_alias = area_alias;
-}
-
-static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
- if (!map_shared)
- return;
-
- *start = (unsigned long) area_dst_alias + offset;
-}
-
-static void shmem_release_pages(char *rel_area)
-{
- if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
- err("madvise(MADV_REMOVE) failed");
-}
-
-static void shmem_allocate_area(void **alloc_area, bool is_src)
-{
- void *area_alias = NULL;
- size_t bytes = nr_pages * page_size;
- unsigned long offset = is_src ? 0 : bytes;
- char *p = NULL, *p_alias = NULL;
-
- if (test_collapse) {
- p = BASE_PMD_ADDR;
- if (!is_src)
- /* src map + alias + interleaved hpages */
- p += 2 * (bytes + hpage_size);
- p_alias = p;
- p_alias += bytes;
- p_alias += hpage_size; /* Prevent src/dst VMA merge */
- }
-
- *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_fd, offset);
- if (*alloc_area == MAP_FAILED)
- err("mmap of memfd failed");
- if (test_collapse && *alloc_area != p)
- err("mmap of memfd failed at %p", p);
-
- area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_fd, offset);
- if (area_alias == MAP_FAILED)
- err("mmap of memfd alias failed");
- if (test_collapse && area_alias != p_alias)
- err("mmap of anonymous memory failed at %p", p_alias);
-
- if (is_src)
- area_src_alias = area_alias;
- else
- area_dst_alias = area_alias;
-}
-
-static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
-{
- *start = (unsigned long)area_dst_alias + offset;
-}
-
-static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
-{
- if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size))
- err("Did not find expected %d number of hugepages",
- expect_nr_hpages);
-}
-
-struct uffd_test_ops {
- void (*allocate_area)(void **alloc_area, bool is_src);
- void (*release_pages)(char *rel_area);
- void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
- void (*check_pmd_mapping)(void *p, int expect_nr_hpages);
-};
-
-static struct uffd_test_ops anon_uffd_test_ops = {
- .allocate_area = anon_allocate_area,
- .release_pages = anon_release_pages,
- .alias_mapping = noop_alias_mapping,
- .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops shmem_uffd_test_ops = {
- .allocate_area = shmem_allocate_area,
- .release_pages = shmem_release_pages,
- .alias_mapping = shmem_alias_mapping,
- .check_pmd_mapping = shmem_check_pmd_mapping,
-};
-
-static struct uffd_test_ops hugetlb_uffd_test_ops = {
- .allocate_area = hugetlb_allocate_area,
- .release_pages = hugetlb_release_pages,
- .alias_mapping = hugetlb_alias_mapping,
- .check_pmd_mapping = NULL,
-};
-
-static struct uffd_test_ops *uffd_test_ops;
-
static inline uint64_t uffd_minor_feature(void)
{
if (test_type == TEST_HUGETLB && map_shared)
@@ -388,171 +113,6 @@ static inline uint64_t uffd_minor_feature(void)
return 0;
}
-static uint64_t get_expected_ioctls(uint64_t mode)
-{
- uint64_t ioctls = UFFD_API_RANGE_IOCTLS;
-
- if (test_type == TEST_HUGETLB)
- ioctls &= ~(1 << _UFFDIO_ZEROPAGE);
-
- if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp))
- ioctls &= ~(1 << _UFFDIO_WRITEPROTECT);
-
- if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor))
- ioctls &= ~(1 << _UFFDIO_CONTINUE);
-
- return ioctls;
-}
-
-static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
-{
- uint64_t expected = get_expected_ioctls(mode);
- uint64_t actual = ioctls & expected;
-
- if (actual != expected) {
- err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64,
- expected, actual);
- }
-}
-
-static int __userfaultfd_open_dev(void)
-{
- int fd, _uffd;
-
- fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
- if (fd < 0)
- errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
-
- _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
- if (_uffd < 0)
- errexit(errno == ENOTTY ? KSFT_SKIP : 1,
- "creating userfaultfd failed");
- close(fd);
- return _uffd;
-}
-
-static void userfaultfd_open(uint64_t *features)
-{
- struct uffdio_api uffdio_api;
-
- if (test_dev_userfaultfd)
- uffd = __userfaultfd_open_dev();
- else {
- uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
- if (uffd < 0)
- errexit(errno == ENOSYS ? KSFT_SKIP : 1,
- "creating userfaultfd failed");
- }
- uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
- uffdio_api.api = UFFD_API;
- uffdio_api.features = *features;
- if (ioctl(uffd, UFFDIO_API, &uffdio_api))
- err("UFFDIO_API failed.\nPlease make sure to "
- "run with either root or ptrace capability.");
- if (uffdio_api.api != UFFD_API)
- err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
-
- *features = uffdio_api.features;
-}
-
-static inline void munmap_area(void **area)
-{
- if (*area)
- if (munmap(*area, nr_pages * page_size))
- err("munmap");
-
- *area = NULL;
-}
-
-static void uffd_test_ctx_clear(void)
-{
- size_t i;
-
- if (pipefd) {
- for (i = 0; i < nr_cpus * 2; ++i) {
- if (close(pipefd[i]))
- err("close pipefd");
- }
- free(pipefd);
- pipefd = NULL;
- }
-
- if (count_verify) {
- free(count_verify);
- count_verify = NULL;
- }
-
- if (uffd != -1) {
- if (close(uffd))
- err("close uffd");
- uffd = -1;
- }
-
- munmap_area((void **)&area_src);
- munmap_area((void **)&area_src_alias);
- munmap_area((void **)&area_dst);
- munmap_area((void **)&area_dst_alias);
- munmap_area((void **)&area_remap);
-}
-
-static void uffd_test_ctx_init(uint64_t features)
-{
- unsigned long nr, cpu;
-
- uffd_test_ctx_clear();
-
- uffd_test_ops->allocate_area((void **)&area_src, true);
- uffd_test_ops->allocate_area((void **)&area_dst, false);
-
- userfaultfd_open(&features);
-
- count_verify = malloc(nr_pages * sizeof(unsigned long long));
- if (!count_verify)
- err("count_verify");
-
- for (nr = 0; nr < nr_pages; nr++) {
- *area_mutex(area_src, nr) =
- (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
- count_verify[nr] = *area_count(area_src, nr) = 1;
- /*
- * In the transition between 255 to 256, powerpc will
- * read out of order in my_bcmp and see both bytes as
- * zero, so leave a placeholder below always non-zero
- * after the count, to avoid my_bcmp to trigger false
- * positives.
- */
- *(area_count(area_src, nr) + 1) = 1;
- }
-
- /*
- * After initialization of area_src, we must explicitly release pages
- * for area_dst to make sure it's fully empty. Otherwise we could have
- * some area_dst pages be errornously initialized with zero pages,
- * hence we could hit memory corruption later in the test.
- *
- * One example is when THP is globally enabled, above allocate_area()
- * calls could have the two areas merged into a single VMA (as they
- * will have the same VMA flags so they're mergeable). When we
- * initialize the area_src above, it's possible that some part of
- * area_dst could have been faulted in via one huge THP that will be
- * shared between area_src and area_dst. It could cause some of the
- * area_dst won't be trapped by missing userfaults.
- *
- * This release_pages() will guarantee even if that happened, we'll
- * proactively split the thp and drop any accidentally initialized
- * pages within area_dst.
- */
- uffd_test_ops->release_pages(area_dst);
-
- pipefd = malloc(sizeof(int) * nr_cpus * 2);
- if (!pipefd)
- err("pipefd");
- for (cpu = 0; cpu < nr_cpus; cpu++)
- if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
- err("pipe");
-}
-
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
@@ -562,47 +122,6 @@ static int my_bcmp(char *str1, char *str2, size_t n)
return 0;
}
-static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
-{
- struct uffdio_writeprotect prms;
-
- /* Write protection page faults */
- prms.range.start = start;
- prms.range.len = len;
- /* Undo write-protect, do wakeup after that */
- prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
-
- if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
- err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
-}
-
-static void continue_range(int ufd, __u64 start, __u64 len)
-{
- struct uffdio_continue req;
- int ret;
-
- req.range.start = start;
- req.range.len = len;
- req.mode = 0;
- if (test_uffdio_wp)
- req.mode |= UFFDIO_CONTINUE_MODE_WP;
-
- if (ioctl(ufd, UFFDIO_CONTINUE, &req))
- err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
- (uint64_t)start);
-
- /*
- * Error handling within the kernel for continue is subtly different
- * from copy or zeropage, so it may be a source of bugs. Trigger an
- * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
- */
- req.mapped = 0;
- ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
- if (ret >= 0 || req.mapped != -EEXIST)
- err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
- ret, (int64_t) req.mapped);
-}
-
static void *locking_thread(void *arg)
{
unsigned long cpu = (unsigned long) arg;
@@ -635,222 +154,11 @@ static void *locking_thread(void *arg)
return NULL;
}
-static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
- unsigned long offset)
-{
- uffd_test_ops->alias_mapping(&uffdio_copy->dst,
- uffdio_copy->len,
- offset);
- if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
- /* real retval in ufdio_copy.copy */
- if (uffdio_copy->copy != -EEXIST)
- err("UFFDIO_COPY retry error: %"PRId64,
- (int64_t)uffdio_copy->copy);
- } else {
- err("UFFDIO_COPY retry unexpected: %"PRId64,
- (int64_t)uffdio_copy->copy);
- }
-}
-
-static void wake_range(int ufd, unsigned long addr, unsigned long len)
-{
- struct uffdio_range uffdio_wake;
-
- uffdio_wake.start = addr;
- uffdio_wake.len = len;
-
- if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
- fprintf(stderr, "error waking %lu\n",
- addr), exit(1);
-}
-
-static int __copy_page(int ufd, unsigned long offset, bool retry)
-{
- struct uffdio_copy uffdio_copy;
-
- if (offset >= nr_pages * page_size)
- err("unexpected offset %lu\n", offset);
- uffdio_copy.dst = (unsigned long) area_dst + offset;
- uffdio_copy.src = (unsigned long) area_src + offset;
- uffdio_copy.len = page_size;
- if (test_uffdio_wp)
- uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
- else
- uffdio_copy.mode = 0;
- uffdio_copy.copy = 0;
- if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
- /* real retval in ufdio_copy.copy */
- if (uffdio_copy.copy != -EEXIST)
- err("UFFDIO_COPY error: %"PRId64,
- (int64_t)uffdio_copy.copy);
- wake_range(ufd, uffdio_copy.dst, page_size);
- } else if (uffdio_copy.copy != page_size) {
- err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
- } else {
- if (test_uffdio_copy_eexist && retry) {
- test_uffdio_copy_eexist = false;
- retry_copy_page(ufd, &uffdio_copy, offset);
- }
- return 1;
- }
- return 0;
-}
-
static int copy_page_retry(int ufd, unsigned long offset)
{
return __copy_page(ufd, offset, true);
}
-static int copy_page(int ufd, unsigned long offset)
-{
- return __copy_page(ufd, offset, false);
-}
-
-static int uffd_read_msg(int ufd, struct uffd_msg *msg)
-{
- int ret = read(uffd, msg, sizeof(*msg));
-
- if (ret != sizeof(*msg)) {
- if (ret < 0) {
- if (errno == EAGAIN || errno == EINTR)
- return 1;
- err("blocking read error");
- } else {
- err("short read");
- }
- }
-
- return 0;
-}
-
-static void uffd_handle_page_fault(struct uffd_msg *msg,
- struct uffd_stats *stats)
-{
- unsigned long offset;
-
- if (msg->event != UFFD_EVENT_PAGEFAULT)
- err("unexpected msg event %u", msg->event);
-
- if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
- /* Write protect page faults */
- wp_range(uffd, msg->arg.pagefault.address, page_size, false);
- stats->wp_faults++;
- } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
- uint8_t *area;
- int b;
-
- /*
- * Minor page faults
- *
- * To prove we can modify the original range for testing
- * purposes, we're going to bit flip this range before
- * continuing.
- *
- * Note that this requires all minor page fault tests operate on
- * area_dst (non-UFFD-registered) and area_dst_alias
- * (UFFD-registered).
- */
-
- area = (uint8_t *)(area_dst +
- ((char *)msg->arg.pagefault.address -
- area_dst_alias));
- for (b = 0; b < page_size; ++b)
- area[b] = ~area[b];
- continue_range(uffd, msg->arg.pagefault.address, page_size);
- stats->minor_faults++;
- } else {
- /*
- * Missing page faults.
- *
- * Here we force a write check for each of the missing mode
- * faults. It's guaranteed because the only threads that
- * will trigger uffd faults are the locking threads, and
- * their first instruction to touch the missing page will
- * always be pthread_mutex_lock().
- *
- * Note that here we relied on an NPTL glibc impl detail to
- * always read the lock type at the entry of the lock op
- * (pthread_mutex_t.__data.__type, offset 0x10) before
- * doing any locking operations to guarantee that. It's
- * actually not good to rely on this impl detail because
- * logically a pthread-compatible lib can implement the
- * locks without types and we can fail when linking with
- * them. However since we used to find bugs with this
- * strict check we still keep it around. Hopefully this
- * could be a good hint when it fails again. If one day
- * it'll break on some other impl of glibc we'll revisit.
- */
- if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
- err("unexpected write fault");
-
- offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
- offset &= ~(page_size-1);
-
- if (copy_page(uffd, offset))
- stats->missing_faults++;
- }
-}
-
-static void *uffd_poll_thread(void *arg)
-{
- struct uffd_stats *stats = (struct uffd_stats *)arg;
- unsigned long cpu = stats->cpu;
- struct pollfd pollfd[2];
- struct uffd_msg msg;
- struct uffdio_register uffd_reg;
- int ret;
- char tmp_chr;
-
- pollfd[0].fd = uffd;
- pollfd[0].events = POLLIN;
- pollfd[1].fd = pipefd[cpu*2];
- pollfd[1].events = POLLIN;
-
- for (;;) {
- ret = poll(pollfd, 2, -1);
- if (ret <= 0) {
- if (errno == EINTR || errno == EAGAIN)
- continue;
- err("poll error: %d", ret);
- }
- if (pollfd[1].revents & POLLIN) {
- if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
- err("read pipefd error");
- break;
- }
- if (!(pollfd[0].revents & POLLIN))
- err("pollfd[0].revents %d", pollfd[0].revents);
- if (uffd_read_msg(uffd, &msg))
- continue;
- switch (msg.event) {
- default:
- err("unexpected msg event %u\n", msg.event);
- break;
- case UFFD_EVENT_PAGEFAULT:
- uffd_handle_page_fault(&msg, stats);
- break;
- case UFFD_EVENT_FORK:
- close(uffd);
- uffd = msg.arg.fork.ufd;
- pollfd[0].fd = uffd;
- break;
- case UFFD_EVENT_REMOVE:
- uffd_reg.range.start = msg.arg.remove.start;
- uffd_reg.range.len = msg.arg.remove.end -
- msg.arg.remove.start;
- if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
- err("remove failure");
- break;
- case UFFD_EVENT_REMAP:
- area_remap = area_dst; /* save for later unmap */
- area_dst = (char *)(unsigned long)msg.arg.remap.to;
- break;
- }
- }
-
- return NULL;
-}
-
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *uffd_read_thread(void *arg)