[2/2] selftests: KVM: Add page splitting test

Message ID 20230119212510.3938454-3-bgardon@google.com
State New
Headers
Series selftests: KVM: Add a test for eager page splitting |

Commit Message

Ben Gardon Jan. 19, 2023, 9:25 p.m. UTC
  Add a test for page splitting during dirty logging and for hugepage
recovery after dirty logging.

Page splitting represents non-trivial behavior, which is complicated
by MANUAL_PROTECT mode, which causes pages to be split on the first
clear, instead of when dirty logging is enabled.

Add a test which makes assertions about page counts to help define the
expected behavior of page splitting and to provide needed coverage of the
behavior. This also helps ensure that a failure in eager page splitting
is not covered up by splitting in the vCPU path.

Tested by running the test on an Intel Haswell machine w/wo
MANUAL_PROTECT.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 tools/testing/selftests/kvm/Makefile          |   1 +
 .../selftests/kvm/include/kvm_util_base.h     |   2 +
 tools/testing/selftests/kvm/lib/kvm_util.c    |   5 +
 .../kvm/x86_64/page_splitting_test.c          | 314 ++++++++++++++++++
 4 files changed, 322 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/page_splitting_test.c
  

Comments

David Matlack Jan. 19, 2023, 10:55 p.m. UTC | #1
On Thu, Jan 19, 2023 at 09:25:10PM +0000, Ben Gardon wrote:
> Add a test for page splitting during dirty logging and for hugepage
> recovery after dirty logging.
> 
> Page splitting represents non-trivial behavior, which is complicated
> by MANUAL_PROTECT mode, which causes pages to be split on the first
> clear, instead of when dirty logging is enabled.
> 
> Add a test which makes asserions about page counts to help define the
> expected behavior of page splitting and to provid needed coverage of the
> behavior. This also helps ensure that a failure in eager page splitting
> is not covered up by splitting in the vCPU path.
> 
> Tested by running the test on an Intel Haswell machine w/wo
> MANUAL_PROTECT.
> 
> Signed-off-by: Ben Gardon <bgardon@google.com>
> ---
>  tools/testing/selftests/kvm/Makefile          |   1 +
>  .../selftests/kvm/include/kvm_util_base.h     |   2 +
>  tools/testing/selftests/kvm/lib/kvm_util.c    |   5 +
>  .../kvm/x86_64/page_splitting_test.c          | 314 ++++++++++++++++++
>  4 files changed, 322 insertions(+)
>  create mode 100644 tools/testing/selftests/kvm/x86_64/page_splitting_test.c
> 
> diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
> index 1750f91dd9362..057ebd709e77a 100644
> --- a/tools/testing/selftests/kvm/Makefile
> +++ b/tools/testing/selftests/kvm/Makefile
> @@ -76,6 +76,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
>  TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
>  TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
>  TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
> +TEST_GEN_PROGS_x86_64 += x86_64/page_splitting_test
>  TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
>  TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
>  TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
> diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
> index fbc2a79369b8b..4e38a771fa5d1 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util_base.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
> @@ -213,6 +213,7 @@ extern const struct vm_guest_mode_params vm_guest_mode_params[];
>  int open_path_or_exit(const char *path, int flags);
>  int open_kvm_dev_path_or_exit(void);
>  
> +bool get_kvm_param_bool(const char *param);
>  bool get_kvm_intel_param_bool(const char *param);
>  bool get_kvm_amd_param_bool(const char *param);
>  
> @@ -402,6 +403,7 @@ static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name)
>  	uint64_t data;
>  
>  	__vm_get_stat(vm, stat_name, &data, 1);
> +

Stray newline.

>  	return data;
>  }
>  
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 56d5ea949cbbe..fa6d69f731990 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -80,6 +80,11 @@ static bool get_module_param_bool(const char *module_name, const char *param)
>  	TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
>  }
>  
> +bool get_kvm_param_bool(const char *param)
> +{
> +	return get_module_param_bool("kvm", param);
> +}
> +
>  bool get_kvm_intel_param_bool(const char *param)
>  {
>  	return get_module_param_bool("kvm_intel", param);
> diff --git a/tools/testing/selftests/kvm/x86_64/page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/page_splitting_test.c
> new file mode 100644
> index 0000000000000..3e2ee20adf036
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/x86_64/page_splitting_test.c
> @@ -0,0 +1,314 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * KVM dirty logging page splitting test
> + *
> + * Based on dirty_log_perf.c
> + *
> + * Copyright (C) 2018, Red Hat, Inc.
> + * Copyright (C) 2020, Google, Inc.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <time.h>
> +#include <pthread.h>
> +#include <linux/bitmap.h>
> +
> +#include "kvm_util.h"
> +#include "test_util.h"
> +#include "memstress.h"
> +#include "guest_modes.h"
> +
> +/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/
> +#define TEST_HOST_LOOP_N		2UL
> +
> +static int NR_VCPUS = 2;
> +static int NR_SLOTS = 2;
> +static int NR_ITERATIONS = 2;

These should be macros or at least const?

> +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
> +
> +/* Host variables */

What does "Host variables" mean? (And why is guest_percpu_mem_size not a
"Host variable"?)

I imagine this is copy-pasta from a test that has some global variables
that are used by guest code? If that's correct, it's probably best to
just drop this comment.

> +static u64 dirty_log_manual_caps;
> +static bool host_quit;
> +static int iteration;
> +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
> +
> +struct kvm_page_stats {
> +	uint64_t pages_4k;
> +	uint64_t pages_2m;
> +	uint64_t pages_1g;
> +	uint64_t hugepages;
> +};
> +
> +static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats)
> +{
> +	stats->pages_4k = vm_get_stat(vm, "pages_4k");
> +	stats->pages_2m = vm_get_stat(vm, "pages_2m");
> +	stats->pages_1g = vm_get_stat(vm, "pages_1g");
> +	stats->hugepages = stats->pages_2m + stats->pages_1g;
> +
> +	pr_debug("Page stats - 4K: %ld 2M: %ld 1G: %ld huge: %ld\n",
> +		 stats->pages_4k, stats->pages_2m, stats->pages_1g,
> +		 stats->hugepages);
> +}
> +
> +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
> +{
> +	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
> +	int vcpu_idx = vcpu_args->vcpu_idx;
> +	uint64_t pages_count = 0;
> +	struct kvm_run *run;
> +	int ret;
> +
> +	run = vcpu->run;
> +
> +	while (!READ_ONCE(host_quit)) {
> +		int current_iteration = READ_ONCE(iteration);
> +
> +		ret = _vcpu_run(vcpu);
> +
> +		TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);

vcpu_run() handles this assert for you.

> +		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
> +			    "Invalid guest sync status: exit_reason=%s\n",
> +			    exit_reason_str(run->exit_reason));

optional: ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);

(You'll lose the exit reason logging.)

> +
> +		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
> +
> +		if (current_iteration)
> +			pages_count += vcpu_args->pages;

pages_count looks unnecessary

> +
> +		/* Wait for the start of the next iteration to be signaled. */
> +		while (current_iteration == READ_ONCE(iteration) &&
> +		       READ_ONCE(iteration) >= 0 &&
> +		       !READ_ONCE(host_quit))
> +			;
> +	}
> +}
> +
> +struct test_params {
> +	enum vm_mem_backing_src_type backing_src;
> +};
> +
> +static void run_test(struct test_params *p)
> +{
> +	struct kvm_vm *vm;
> +	unsigned long **bitmaps;
> +	uint64_t guest_num_pages;
> +	uint64_t host_num_pages;
> +	uint64_t pages_per_slot;
> +	int i;
> +	uint64_t total_4k_pages;
> +	struct kvm_page_stats stats_populated;
> +	struct kvm_page_stats stats_dirty_logging_enabled;
> +	struct kvm_page_stats stats_dirty_pass[NR_ITERATIONS];
> +	struct kvm_page_stats stats_clear_pass[NR_ITERATIONS];
> +	struct kvm_page_stats stats_dirty_logging_disabled;
> +	struct kvm_page_stats stats_repopulated;
> +
> +	vm = memstress_create_vm(VM_MODE_DEFAULT, NR_VCPUS, guest_percpu_mem_size,
> +				 NR_SLOTS, p->backing_src, false);
> +
> +	guest_num_pages = (NR_VCPUS * guest_percpu_mem_size) >> vm->page_shift;
> +	guest_num_pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, guest_num_pages);
> +	host_num_pages = vm_num_host_pages(VM_MODE_DEFAULT, guest_num_pages);
> +	pages_per_slot = host_num_pages / NR_SLOTS;
> +
> +	bitmaps = memstress_alloc_bitmaps(NR_SLOTS, pages_per_slot);
> +
> +	if (dirty_log_manual_caps)
> +		vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
> +			      dirty_log_manual_caps);
> +
> +	/* Start the iterations */
> +	iteration = 0;
> +	host_quit = false;
> +
> +	for (i = 0; i < NR_VCPUS; i++)
> +		vcpu_last_completed_iteration[i] = -1;
> +
> +	memstress_start_vcpu_threads(NR_VCPUS, vcpu_worker);
> +
> +	/* Allow the vCPUs to populate memory */
> +	for (i = 0; i < NR_VCPUS; i++) {
> +		while (READ_ONCE(vcpu_last_completed_iteration[i]) !=
> +		       iteration)
> +			;
> +	}
> +
> +	pr_debug("\nGetting stats after populating memory:\n");
> +	get_page_stats(vm, &stats_populated);
> +
> +	/* Enable dirty logging */
> +	memstress_enable_dirty_logging(vm, NR_SLOTS);
> +
> +	pr_debug("\nGetting stats after enabling dirty logging:\n");
> +	get_page_stats(vm, &stats_dirty_logging_enabled);
> +
> +	while (iteration < NR_ITERATIONS) {
> +		/*
> +		 * Incrementing the iteration number will start the vCPUs
> +		 * dirtying memory again.
> +		 */
> +		iteration++;
> +
> +		for (i = 0; i < NR_VCPUS; i++) {
> +			while (READ_ONCE(vcpu_last_completed_iteration[i])
> +			       != iteration)
> +				;
> +		}
> +
> +		pr_debug("\nGetting stats after dirtying memory on pass %d:\n", iteration);
> +		get_page_stats(vm, &stats_dirty_pass[iteration - 1]);

Incrementing iteration, waiting for vCPUs, and grabbing stats is
repeated below. Throw it in a helper function?

> +
> +		memstress_get_dirty_log(vm, bitmaps, NR_SLOTS);
> +
> +		if (dirty_log_manual_caps) {
> +			memstress_clear_dirty_log(vm, bitmaps, NR_SLOTS, pages_per_slot);
> +
> +			pr_debug("\nGetting stats after clearing dirty log pass %d:\n", iteration);
> +			get_page_stats(vm, &stats_clear_pass[iteration - 1]);
> +		}
> +	}
> +
> +	/* Disable dirty logging */
> +	memstress_disable_dirty_logging(vm, NR_SLOTS);
> +
> +	pr_debug("\nGetting stats after disabling dirty logging:\n");
> +	get_page_stats(vm, &stats_dirty_logging_disabled);
> +
> +	/* Run vCPUs again to fault pages back in. */
> +	iteration++;
> +	for (i = 0; i < NR_VCPUS; i++) {
> +		while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration)
> +			;
> +	}
> +
> +	pr_debug("\nGetting stats after repopulating memory:\n");
> +	get_page_stats(vm, &stats_repopulated);
> +
> +	/*
> +	 * Tell the vCPU threads to quit.  No need to manually check that vCPUs
> +	 * have stopped running after disabling dirty logging, the join will
> +	 * wait for them to exit.
> +	 */
> +	host_quit = true;
> +	memstress_join_vcpu_threads(NR_VCPUS);
> +
> +	memstress_free_bitmaps(bitmaps, NR_SLOTS);
> +	memstress_destroy_vm(vm);
> +
> +	/* Make assertions about the page counts. */
> +	total_4k_pages = stats_populated.pages_4k;
> +	total_4k_pages += stats_populated.pages_2m * 512;
> +	total_4k_pages += stats_populated.pages_1g * 512 * 512;
> +
> +	/*
> +	 * Check that all huge pages were split. Since large pages can only
> +	 * exist in the data slot, and the vCPUs should have dirtied all pages
> +	 * in the data slot, there should be no huge pages left after splitting.
> +	 * Splitting happens at dirty log enable time without
> +	 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
> +	 * with that capability.
> +	 */
> +	if (dirty_log_manual_caps) {
> +		TEST_ASSERT(stats_clear_pass[0].hugepages == 0,

Consider using ASSERT_EQ() to simplify these checks. It will
automatically print out the values for you, but you'll lose the
contextual error message ("Unexpected huge page count after
splitting..."). But maybe we could add support for a custom extra error
string?

__ASSERT_EQ(stats_clear_pass[0].hugepages, 0,
            "Expected 0 hugepages after splitting");

Or use a comment to document the context for the assertion. Whoever is
debugging a failure is going to come look at the selftest code no matter
what.

I think I prefer ASSERT_EQ() + comment, especially since the comment
pretty much already exists above.

> +			    "Unexpected huge page count after splitting. Expected 0, got %ld",
> +			    stats_clear_pass[0].hugepages);
> +		TEST_ASSERT(stats_clear_pass[0].pages_4k == total_4k_pages,
> +			    "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
> +			    total_4k_pages, stats_clear_pass[0].pages_4k);

Also assert that huge pages are *not* split when dirty logging is first
enabled.

> +	} else {
> +		TEST_ASSERT(stats_dirty_logging_enabled.hugepages == 0,
> +			    "Unexpected huge page count after splitting. Expected 0, got %ld",
> +			    stats_dirty_logging_enabled.hugepages);
> +		TEST_ASSERT(stats_dirty_logging_enabled.pages_4k == total_4k_pages,
> +			    "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
> +			    total_4k_pages, stats_dirty_logging_enabled.pages_4k);
> +	}
> +
> +	/*
> +	 * Once dirty logging is disabled and the vCPUs have touched all their
> +	 * memory again, the page counts should be the same as they were
> +	 * right after initial population of memory.
> +	 */
> +	TEST_ASSERT(stats_populated.pages_4k == stats_repopulated.pages_4k,
> +		    "4k page count did not return to its initial value after "
> +		    "dirty logging. Expected %ld, got %ld",
> +		    stats_populated.pages_4k, stats_repopulated.pages_4k);
> +	TEST_ASSERT(stats_populated.pages_2m == stats_repopulated.pages_2m,
> +		    "2m page count did not return to its initial value after "
> +		    "dirty logging. Expected %ld, got %ld",
> +		    stats_populated.pages_2m, stats_repopulated.pages_2m);
> +	TEST_ASSERT(stats_populated.pages_1g == stats_repopulated.pages_1g,
> +		    "1g page count did not return to its initial value after "
> +		    "dirty logging. Expected %ld, got %ld",
> +		    stats_populated.pages_1g, stats_repopulated.pages_1g);

Nice!

> +}
> +
> +static void help(char *name)
> +{
> +	puts("");
> +	printf("usage: %s [-h] [-g] [-m mode] [-b vcpu bytes] [-s mem type]\n",
                                    ^^^^^^^^^
				    -m is not used
> +	       name);
> +	puts("");
> +	printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n"
> +	       "     makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n"
> +	       "     KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n"
> +	       "     and writes will be tracked as soon as dirty logging is\n"
> +	       "     enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n"
> +	       "     is not enabled).\n");
> +	printf(" -b: specify the size of the memory region which should be\n"
> +	       "     dirtied by each vCPU. e.g. 10M or 3G.\n"
> +	       "     (default: 1G)\n");
> +	backing_src_help("-s");
> +	puts("");
> +	exit(0);
> +}
> +
> +int main(int argc, char *argv[])
> +{
> +	struct test_params p = {
> +		.backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB,
> +	};
> +	int opt;
> +
> +	TEST_REQUIRE(get_kvm_param_bool("eager_page_split"));
> +	TEST_REQUIRE(get_kvm_param_bool("tdp_mmu"));
> +
> +	dirty_log_manual_caps =
> +		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> +	dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> +				  KVM_DIRTY_LOG_INITIALLY_SET);

Since this is a correctness test I think the test should, by default,
test both KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE and 0, to ensure we get
test coverage of both.

And with that in place, there's probably no need for the -g flag.

> +
> +	guest_modes_append_default();
> +
> +	while ((opt = getopt(argc, argv, "b:ghs:")) != -1) {
> +		switch (opt) {
> +		case 'b':
> +			guest_percpu_mem_size = parse_size(optarg);
> +			break;
> +		case 'g':
> +			dirty_log_manual_caps = 0;
> +			break;
> +		case 'h':
> +			help(argv[0]);
> +			break;

nit: Move this down so it can fallthrough to the default case.

> +		case 's':
> +			p.backing_src = parse_backing_src_type(optarg);

It's odd that backing_src is in test_params but guest_percpu_mem_size
and dirty_log_manual_caps are global variables.

Personally I prefer using global variables for constants that do not
change across run_test(). i.e. Let's make backing_src a global.

> +			break;
> +		default:
> +			help(argv[0]);
> +			break;
> +		}
> +	}
> +
> +	if (!is_backing_src_hugetlb(p.backing_src)) {
> +		pr_info("This test will only work reliably with HugeTLB memory. "
> +			"It can work with THP, but that is best effort.");
> +		return KSFT_SKIP;
> +	}

backing_src only controls the memstress data slots. The rest of guest
memory could be a source of noise for this test.

> +
> +	run_test(&p);

Use for_each_guest_mode() to run against all supported guest modes.

> +
> +	return 0;
> +}
> -- 
> 2.39.1.405.gd4c25cc71f-goog
>
  
Ben Gardon Jan. 19, 2023, 11:48 p.m. UTC | #2
On Thu, Jan 19, 2023 at 2:56 PM David Matlack <dmatlack@google.com> wrote:
...
> > +static int NR_VCPUS = 2;
> > +static int NR_SLOTS = 2;
> > +static int NR_ITERATIONS = 2;
>
> These should be macros or at least const?

Yikes, woops, that was a basic mistake.

>
> > +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
> > +
> > +/* Host variables */
>
> What does "Host variables" mean? (And why is guest_percpu_mem_size not a
> "Host variable"?)
>
> I imagine this is copy-pasta from a test that has some global variables
> that are used by guest code? If that's correct, it's probably best to
> just drop this comment.

Yeah, shameful copypasta. I'll drop it.

>
> > +static u64 dirty_log_manual_caps;
...

> > +             /*
> > +              * Incrementing the iteration number will start the vCPUs
> > +              * dirtying memory again.
> > +              */
> > +             iteration++;
> > +
> > +             for (i = 0; i < NR_VCPUS; i++) {
> > +                     while (READ_ONCE(vcpu_last_completed_iteration[i])
> > +                            != iteration)
> > +                             ;
> > +             }
> > +
> > +             pr_debug("\nGetting stats after dirtying memory on pass %d:\n", iteration);
> > +             get_page_stats(vm, &stats_dirty_pass[iteration - 1]);
>
> Incrementing iteration, waiting for vCPUs, and grabbing stats is
> repeated below. Throw it in a helper function?

Good call.

>
> > +
> > +             memstress_get_dirty_log(vm, bitmaps, NR_SLOTS);
> > +
> > +             if (dirty_log_manual_caps) {
> > +                     memstress_clear_dirty_log(vm, bitmaps, NR_SLOTS, pages_per_slot);
> > +
> > +                     pr_debug("\nGetting stats after clearing dirty log pass %d:\n", iteration);
> > +                     get_page_stats(vm, &stats_clear_pass[iteration - 1]);
> > +             }
> > +     }
> > +
> > +     /* Disable dirty logging */
> > +     memstress_disable_dirty_logging(vm, NR_SLOTS);
> > +
> > +     pr_debug("\nGetting stats after disabling dirty logging:\n");
> > +     get_page_stats(vm, &stats_dirty_logging_disabled);
> > +
> > +     /* Run vCPUs again to fault pages back in. */
> > +     iteration++;
> > +     for (i = 0; i < NR_VCPUS; i++) {
> > +             while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration)
> > +                     ;
> > +     }
> > +
> > +     pr_debug("\nGetting stats after repopulating memory:\n");
> > +     get_page_stats(vm, &stats_repopulated);
> > +
> > +     /*
> > +      * Tell the vCPU threads to quit.  No need to manually check that vCPUs
> > +      * have stopped running after disabling dirty logging, the join will
> > +      * wait for them to exit.
> > +      */
> > +     host_quit = true;
> > +     memstress_join_vcpu_threads(NR_VCPUS);
> > +
> > +     memstress_free_bitmaps(bitmaps, NR_SLOTS);
> > +     memstress_destroy_vm(vm);
> > +
> > +     /* Make assertions about the page counts. */
> > +     total_4k_pages = stats_populated.pages_4k;
> > +     total_4k_pages += stats_populated.pages_2m * 512;
> > +     total_4k_pages += stats_populated.pages_1g * 512 * 512;
> > +
> > +     /*
> > +      * Check that all huge pages were split. Since large pages can only
> > +      * exist in the data slot, and the vCPUs should have dirtied all pages
> > +      * in the data slot, there should be no huge pages left after splitting.
> > +      * Splitting happens at dirty log enable time without
> > +      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
> > +      * with that capability.
> > +      */
> > +     if (dirty_log_manual_caps) {
> > +             TEST_ASSERT(stats_clear_pass[0].hugepages == 0,
>
> Consider using ASSERT_EQ() to simplify these checks. It will
> automatically print out the values for you, but you'll lose the
> contextual error message ("Unexpected huge page count after
> splitting..."). But maybe we could add support for a custom extra error
> string?
>
> __ASSERT_EQ(stats_clear_pass[0].hugepages, 0,
>             "Expected 0 hugepages after splitting");
>
> Or use a comment to document the context for the assertion. Whoever is
> debugging a failure is going to come look at the selftest code no matter
> what.
>
> I think I prefer ASSERT_EQ() + comment, especially since the comment
> pretty much already exists above.

That's fair. I prefer the way it is because the resulting error
message is a lot easier to read and I don't need to look at the test
code to decrypt it. If I'm developing a feature and just running all
tests, it's nice to not have to track down the test source code.

>
> > +                         "Unexpected huge page count after splitting. Expected 0, got %ld",
> > +                         stats_clear_pass[0].hugepages);
> > +             TEST_ASSERT(stats_clear_pass[0].pages_4k == total_4k_pages,
> > +                         "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
> > +                         total_4k_pages, stats_clear_pass[0].pages_4k);
>
> Also assert that huge pages are *not* split when dirty logging is first
> enabled.

Ah great idea. I felt like I was a little light on the assertions.
That'll be a good addition.

>
> > +     } else {
...
> > +
> > +     dirty_log_manual_caps =
> > +             kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> > +     dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> > +                               KVM_DIRTY_LOG_INITIALLY_SET);
>
> Since this is a correctness test I think the test should, by default,
> test both KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE and 0, to ensure we get
> test coverage of both.
>
> And with that in place, there's probably no need for the -g flag.

Good idea.

>
> > +
> > +     guest_modes_append_default();
...
> > +
> > +     if (!is_backing_src_hugetlb(p.backing_src)) {
> > +             pr_info("This test will only work reliably with HugeTLB memory. "
> > +                     "It can work with THP, but that is best effort.");
> > +             return KSFT_SKIP;
> > +     }
>
> backing_src only controls the memstress data slots. The rest of guest
> memory could be a source of noise for this test.

That's true, but we compensate for that noise by taking a measurement
after the population pass. At that point the guest has executed all
it's code (or at least from all it's code pages) and touched every
page in the data slot. Since the other slots aren't backed with huge
pages, NX hugepages shouldn't be an issue. As a result, I would be
surprised if noise from the other memory became a problem.

>
> > +
> > +     run_test(&p);
>
> Use for_each_guest_mode() to run against all supported guest modes.

I'm not sure that would actually improve coverage. None of the page
splitting behavior depends on the mode AFAICT.

>
> > +
> > +     return 0;
> > +}
> > --
> > 2.39.1.405.gd4c25cc71f-goog
> >
  
Ricardo Koller Jan. 20, 2023, 2:23 p.m. UTC | #3
On Thu, Jan 19, 2023 at 03:48:14PM -0800, Ben Gardon wrote:
> On Thu, Jan 19, 2023 at 2:56 PM David Matlack <dmatlack@google.com> wrote:
> ...
> > > +static int NR_VCPUS = 2;
> > > +static int NR_SLOTS = 2;
> > > +static int NR_ITERATIONS = 2;
> >
> > These should be macros or at least const?
> 
> Yikes, woops, that was a basic mistake.
> 
> >
> > > +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
> > > +
> > > +/* Host variables */
> >
> > What does "Host variables" mean? (And why is guest_percpu_mem_size not a
> > "Host variable"?)
> >
> > I imagine this is copy-pasta from a test that has some global variables
> > that are used by guest code? If that's correct, it's probably best to
> > just drop this comment.
> 
> Yeah, shameful copypasta. I'll drop it.
> 
> >
> > > +static u64 dirty_log_manual_caps;
> ...
> 
> > > +             /*
> > > +              * Incrementing the iteration number will start the vCPUs
> > > +              * dirtying memory again.
> > > +              */
> > > +             iteration++;
> > > +
> > > +             for (i = 0; i < NR_VCPUS; i++) {
> > > +                     while (READ_ONCE(vcpu_last_completed_iteration[i])
> > > +                            != iteration)
> > > +                             ;
> > > +             }
> > > +
> > > +             pr_debug("\nGetting stats after dirtying memory on pass %d:\n", iteration);
> > > +             get_page_stats(vm, &stats_dirty_pass[iteration - 1]);
> >
> > Incrementing iteration, waiting for vCPUs, and grabbing stats is
> > repeated below. Throw it in a helper function?
> 
> Good call.
> 
> >
> > > +
> > > +             memstress_get_dirty_log(vm, bitmaps, NR_SLOTS);
> > > +
> > > +             if (dirty_log_manual_caps) {
> > > +                     memstress_clear_dirty_log(vm, bitmaps, NR_SLOTS, pages_per_slot);
> > > +
> > > +                     pr_debug("\nGetting stats after clearing dirty log pass %d:\n", iteration);
> > > +                     get_page_stats(vm, &stats_clear_pass[iteration - 1]);
> > > +             }
> > > +     }
> > > +
> > > +     /* Disable dirty logging */
> > > +     memstress_disable_dirty_logging(vm, NR_SLOTS);
> > > +
> > > +     pr_debug("\nGetting stats after disabling dirty logging:\n");
> > > +     get_page_stats(vm, &stats_dirty_logging_disabled);
> > > +
> > > +     /* Run vCPUs again to fault pages back in. */
> > > +     iteration++;
> > > +     for (i = 0; i < NR_VCPUS; i++) {
> > > +             while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration)
> > > +                     ;
> > > +     }
> > > +
> > > +     pr_debug("\nGetting stats after repopulating memory:\n");
> > > +     get_page_stats(vm, &stats_repopulated);
> > > +
> > > +     /*
> > > +      * Tell the vCPU threads to quit.  No need to manually check that vCPUs
> > > +      * have stopped running after disabling dirty logging, the join will
> > > +      * wait for them to exit.
> > > +      */
> > > +     host_quit = true;
> > > +     memstress_join_vcpu_threads(NR_VCPUS);
> > > +
> > > +     memstress_free_bitmaps(bitmaps, NR_SLOTS);
> > > +     memstress_destroy_vm(vm);
> > > +
> > > +     /* Make assertions about the page counts. */
> > > +     total_4k_pages = stats_populated.pages_4k;
> > > +     total_4k_pages += stats_populated.pages_2m * 512;
> > > +     total_4k_pages += stats_populated.pages_1g * 512 * 512;
> > > +
> > > +     /*
> > > +      * Check that all huge pages were split. Since large pages can only
> > > +      * exist in the data slot, and the vCPUs should have dirtied all pages
> > > +      * in the data slot, there should be no huge pages left after splitting.
> > > +      * Splitting happens at dirty log enable time without
> > > +      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
> > > +      * with that capability.
> > > +      */
> > > +     if (dirty_log_manual_caps) {
> > > +             TEST_ASSERT(stats_clear_pass[0].hugepages == 0,
> >
> > Consider using ASSERT_EQ() to simplify these checks. It will
> > automatically print out the values for you, but you'll lose the
> > contextual error message ("Unexpected huge page count after
> > splitting..."). But maybe we could add support for a custom extra error
> > string?
> >
> > __ASSERT_EQ(stats_clear_pass[0].hugepages, 0,
> >             "Expected 0 hugepages after splitting");
> >
> > Or use a comment to document the context for the assertion. Whoever is
> > debugging a failure is going to come look at the selftest code no matter
> > what.
> >
> > I think I prefer ASSERT_EQ() + comment, especially since the comment
> > pretty much already exists above.
> 
> That's fair. I prefer the way it is because the resulting error
> message is a lot easier to read and I don't need to look at the test
> code to decrypt it. If I'm developing a feature and just running all
> tests, it's nice to not have to track down the test source code.
> 
> >
> > > +                         "Unexpected huge page count after splitting. Expected 0, got %ld",
> > > +                         stats_clear_pass[0].hugepages);
> > > +             TEST_ASSERT(stats_clear_pass[0].pages_4k == total_4k_pages,
> > > +                         "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
> > > +                         total_4k_pages, stats_clear_pass[0].pages_4k);
> >
> > Also assert that huge pages are *not* split when dirty logging is first
> > enabled.
> 
> Ah great idea. I felt like I was a little light on the assertions.
> That'll be a good addition.
> 
> >
> > > +     } else {
> ...
> > > +
> > > +     dirty_log_manual_caps =
> > > +             kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> > > +     dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> > > +                               KVM_DIRTY_LOG_INITIALLY_SET);
> >
> > Since this is a correctness test I think the test should, by default,
> > test both KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE and 0, to ensure we get
> > test coverage of both.
> >
> > And with that in place, there's probably no need for the -g flag.
> 
> Good idea.
> 
> >
> > > +
> > > +     guest_modes_append_default();
> ...
> > > +
> > > +     if (!is_backing_src_hugetlb(p.backing_src)) {
> > > +             pr_info("This test will only work reliably with HugeTLB memory. "
> > > +                     "It can work with THP, but that is best effort.");
> > > +             return KSFT_SKIP;
> > > +     }
> >
> > backing_src only controls the memstress data slots. The rest of guest
> > memory could be a source of noise for this test.
> 
> That's true, but we compensate for that noise by taking a measurement
> after the population pass. At that point the guest has executed all
> it's code (or at least from all it's code pages) and touched every
> page in the data slot. Since the other slots aren't backed with huge
> pages, NX hugepages shouldn't be an issue. As a result, I would be
> surprised if noise from the other memory became a problem.
> 
> >
> > > +
> > > +     run_test(&p);
> >
> > Use for_each_guest_mode() to run against all supported guest modes.
> 
> I'm not sure that would actually improve coverage. None of the page
> splitting behavior depends on the mode AFAICT.

You need to use for_each_guest_mode() for the ARM case. The issue is
that whatever mode (guest page size and VA size) you pick might not be
supported by the host. So, you first to explore what's available (via
for_each_guest_mode()).

Thanks,
Ricardo

> 
> >
> > > +
> > > +     return 0;
> > > +}
> > > --
> > > 2.39.1.405.gd4c25cc71f-goog
> > >
  
Ricardo Koller Jan. 20, 2023, 2:33 p.m. UTC | #4
On Fri, Jan 20, 2023 at 6:23 AM Ricardo Koller <ricarkol@google.com> wrote:
>
> On Thu, Jan 19, 2023 at 03:48:14PM -0800, Ben Gardon wrote:
> > On Thu, Jan 19, 2023 at 2:56 PM David Matlack <dmatlack@google.com> wrote:
> > ...
> > > > +static int NR_VCPUS = 2;
> > > > +static int NR_SLOTS = 2;
> > > > +static int NR_ITERATIONS = 2;
> > >
> > > These should be macros or at least const?
> >
> > Yikes, woops, that was a basic mistake.
> >
> > >
> > > > +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
> > > > +
> > > > +/* Host variables */
> > >
> > > What does "Host variables" mean? (And why is guest_percpu_mem_size not a
> > > "Host variable"?)
> > >
> > > I imagine this is copy-pasta from a test that has some global variables
> > > that are used by guest code? If that's correct, it's probably best to
> > > just drop this comment.
> >
> > Yeah, shameful copypasta. I'll drop it.
> >
> > >
> > > > +static u64 dirty_log_manual_caps;
> > ...
> >
> > > > +             /*
> > > > +              * Incrementing the iteration number will start the vCPUs
> > > > +              * dirtying memory again.
> > > > +              */
> > > > +             iteration++;
> > > > +
> > > > +             for (i = 0; i < NR_VCPUS; i++) {
> > > > +                     while (READ_ONCE(vcpu_last_completed_iteration[i])
> > > > +                            != iteration)
> > > > +                             ;
> > > > +             }
> > > > +
> > > > +             pr_debug("\nGetting stats after dirtying memory on pass %d:\n", iteration);
> > > > +             get_page_stats(vm, &stats_dirty_pass[iteration - 1]);
> > >
> > > Incrementing iteration, waiting for vCPUs, and grabbing stats is
> > > repeated below. Throw it in a helper function?
> >
> > Good call.
> >
> > >
> > > > +
> > > > +             memstress_get_dirty_log(vm, bitmaps, NR_SLOTS);
> > > > +
> > > > +             if (dirty_log_manual_caps) {
> > > > +                     memstress_clear_dirty_log(vm, bitmaps, NR_SLOTS, pages_per_slot);
> > > > +
> > > > +                     pr_debug("\nGetting stats after clearing dirty log pass %d:\n", iteration);
> > > > +                     get_page_stats(vm, &stats_clear_pass[iteration - 1]);
> > > > +             }
> > > > +     }
> > > > +
> > > > +     /* Disable dirty logging */
> > > > +     memstress_disable_dirty_logging(vm, NR_SLOTS);
> > > > +
> > > > +     pr_debug("\nGetting stats after disabling dirty logging:\n");
> > > > +     get_page_stats(vm, &stats_dirty_logging_disabled);
> > > > +
> > > > +     /* Run vCPUs again to fault pages back in. */
> > > > +     iteration++;
> > > > +     for (i = 0; i < NR_VCPUS; i++) {
> > > > +             while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration)
> > > > +                     ;
> > > > +     }
> > > > +
> > > > +     pr_debug("\nGetting stats after repopulating memory:\n");
> > > > +     get_page_stats(vm, &stats_repopulated);
> > > > +
> > > > +     /*
> > > > +      * Tell the vCPU threads to quit.  No need to manually check that vCPUs
> > > > +      * have stopped running after disabling dirty logging, the join will
> > > > +      * wait for them to exit.
> > > > +      */
> > > > +     host_quit = true;
> > > > +     memstress_join_vcpu_threads(NR_VCPUS);
> > > > +
> > > > +     memstress_free_bitmaps(bitmaps, NR_SLOTS);
> > > > +     memstress_destroy_vm(vm);
> > > > +
> > > > +     /* Make assertions about the page counts. */
> > > > +     total_4k_pages = stats_populated.pages_4k;
> > > > +     total_4k_pages += stats_populated.pages_2m * 512;
> > > > +     total_4k_pages += stats_populated.pages_1g * 512 * 512;
> > > > +
> > > > +     /*
> > > > +      * Check that all huge pages were split. Since large pages can only
> > > > +      * exist in the data slot, and the vCPUs should have dirtied all pages
> > > > +      * in the data slot, there should be no huge pages left after splitting.
> > > > +      * Splitting happens at dirty log enable time without
> > > > +      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
> > > > +      * with that capability.
> > > > +      */
> > > > +     if (dirty_log_manual_caps) {
> > > > +             TEST_ASSERT(stats_clear_pass[0].hugepages == 0,
> > >
> > > Consider using ASSERT_EQ() to simplify these checks. It will
> > > automatically print out the values for you, but you'll lose the
> > > contextual error message ("Unexpected huge page count after
> > > splitting..."). But maybe we could add support for a custom extra error
> > > string?
> > >
> > > __ASSERT_EQ(stats_clear_pass[0].hugepages, 0,
> > >             "Expected 0 hugepages after splitting");
> > >
> > > Or use a comment to document the context for the assertion. Whoever is
> > > debugging a failure is going to come look at the selftest code no matter
> > > what.
> > >
> > > I think I prefer ASSERT_EQ() + comment, especially since the comment
> > > pretty much already exists above.
> >
> > That's fair. I prefer the way it is because the resulting error
> > message is a lot easier to read and I don't need to look at the test
> > code to decrypt it. If I'm developing a feature and just running all
> > tests, it's nice to not have to track down the test source code.
> >
> > >
> > > > +                         "Unexpected huge page count after splitting. Expected 0, got %ld",
> > > > +                         stats_clear_pass[0].hugepages);
> > > > +             TEST_ASSERT(stats_clear_pass[0].pages_4k == total_4k_pages,
> > > > +                         "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
> > > > +                         total_4k_pages, stats_clear_pass[0].pages_4k);
> > >
> > > Also assert that huge pages are *not* split when dirty logging is first
> > > enabled.
> >
> > Ah great idea. I felt like I was a little light on the assertions.
> > That'll be a good addition.
> >
> > >
> > > > +     } else {
> > ...
> > > > +
> > > > +     dirty_log_manual_caps =
> > > > +             kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
> > > > +     dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
> > > > +                               KVM_DIRTY_LOG_INITIALLY_SET);
> > >
> > > Since this is a correctness test I think the test should, by default,
> > > test both KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE and 0, to ensure we get
> > > test coverage of both.
> > >
> > > And with that in place, there's probably no need for the -g flag.
> >
> > Good idea.
> >
> > >
> > > > +
> > > > +     guest_modes_append_default();
> > ...
> > > > +
> > > > +     if (!is_backing_src_hugetlb(p.backing_src)) {
> > > > +             pr_info("This test will only work reliably with HugeTLB memory. "
> > > > +                     "It can work with THP, but that is best effort.");
> > > > +             return KSFT_SKIP;
> > > > +     }
> > >
> > > backing_src only controls the memstress data slots. The rest of guest
> > > memory could be a source of noise for this test.
> >
> > That's true, but we compensate for that noise by taking a measurement
> > after the population pass. At that point the guest has executed all
> > it's code (or at least from all it's code pages) and touched every
> > page in the data slot. Since the other slots aren't backed with huge
> > pages, NX hugepages shouldn't be an issue. As a result, I would be
> > surprised if noise from the other memory became a problem.
> >
> > >
> > > > +
> > > > +     run_test(&p);
> > >
> > > Use for_each_guest_mode() to run against all supported guest modes.
> >
> > I'm not sure that would actually improve coverage. None of the page
> > splitting behavior depends on the mode AFAICT.
>
> You need to use for_each_guest_mode() for the ARM case. The issue is
> that whatever mode (guest page size and VA size) you pick might not be
> supported by the host. So, you first to explore what's available (via
> for_each_guest_mode()).

Actually, that's fixed by using the default mode, which picks the
first available mode. I would still prefer to use for_each_guest_mode(),
though — who knows, something might fail with some specific guest page
size for some reason.

>
> Thanks,
> Ricardo
>
> >
> > >
> > > > +
> > > > +     return 0;
> > > > +}
> > > > --
> > > > 2.39.1.405.gd4c25cc71f-goog
> > > >
  
Ben Gardon Jan. 20, 2023, 8:04 p.m. UTC | #5
On Fri, Jan 20, 2023 at 6:34 AM Ricardo Koller <ricarkol@google.com> wrote:
>
...
> > > > > +
> > > > > +     run_test(&p);
> > > >
> > > > Use for_each_guest_mode() to run against all supported guest modes.
> > >
> > > I'm not sure that would actually improve coverage. None of the page
> > > splitting behavior depends on the mode AFAICT.
> >
> > You need to use for_each_guest_mode() for the ARM case. The issue is
> > that whatever mode (guest page size and VA size) you pick might not be
> > supported by the host. So, you first to explore what's available (via
> > for_each_guest_mode()).
>
> Actually, that's fixed by using the default mode, which picks the
> first available
> mode. I would prefer to use for_each_guest_mode() though, who knows and
> something fails with some specific guest page size for some reason.

Okay, will do. I wasn't sure if we did eager page splitting on ARM, so
I was only planning on making this test for x86_64 initially, hence it
being in that directory. If ARM rolls with the same behavior, then
I'll add the for_each_mode bit and move the test up a directory.

>
> >
> > Thanks,
> > Ricardo
> >
> > >
> > > >
> > > > > +
> > > > > +     return 0;
> > > > > +}
> > > > > --
> > > > > 2.39.1.405.gd4c25cc71f-goog
> > > > >
  
David Matlack Jan. 23, 2023, 6:41 p.m. UTC | #6
On Fri, Jan 20, 2023 at 12:04:04PM -0800, Ben Gardon wrote:
> On Fri, Jan 20, 2023 at 6:34 AM Ricardo Koller <ricarkol@google.com> wrote:
> >
> ...
> > > > > > +
> > > > > > +     run_test(&p);
> > > > >
> > > > > Use for_each_guest_mode() to run against all supported guest modes.
> > > >
> > > > I'm not sure that would actually improve coverage. None of the page
> > > > splitting behavior depends on the mode AFAICT.
> > >
> > > You need to use for_each_guest_mode() for the ARM case. The issue is
> > > that whatever mode (guest page size and VA size) you pick might not be
> > > supported by the host. So, you first to explore what's available (via
> > > for_each_guest_mode()).
> >
> > Actually, that's fixed by using the default mode, which picks the
> > first available
> > mode. I would prefer to use for_each_guest_mode() though, who knows and
> > something fails with some specific guest page size for some reason.
> 
> Okay, will do. I wasn't sure if we did eager page splitting on ARM, so

Ricardo is in the process of upstreaming eager page splitting for ARM:

https://lore.kernel.org/kvm/20230113035000.480021-1-ricarkol@google.com/

> I was only planning on making this test for x86_64 initially, hence it
> being in that directory. If ARM rolls with the same behavior, then
> I'll add the for_each_mode bit and move the test up a directory.

In addition to for_each_guest_mode(), KVM/ARM will need to expose page
size stats so the test can verify the splitting (yet another reason to
have a common MMU).

Ricardo, if you're interested in adding page size stats to KVM/ARM ahead
of the Common MMU, e.g. to test eager page splitting, let me know. I
want to make sure we align on the userspace-visible stat names to avoid
churn down the road. Specifically, do we want to expose neutral names
like pages_{pte,pmd,pud} or expand the KVM/x86 list to include all of
ARM's possible pages sizes like pages_{4k,16k,64k,...} (or both)?
  
Ricardo Koller Jan. 24, 2023, 3:40 p.m. UTC | #7
On Fri, Jan 20, 2023 at 12:04:04PM -0800, Ben Gardon wrote:
> On Fri, Jan 20, 2023 at 6:34 AM Ricardo Koller <ricarkol@google.com> wrote:
> >
> ...
> > > > > > +
> > > > > > +     run_test(&p);
> > > > >
> > > > > Use for_each_guest_mode() to run against all supported guest modes.
> > > >
> > > > I'm not sure that would actually improve coverage. None of the page
> > > > splitting behavior depends on the mode AFAICT.
> > >
> > > You need to use for_each_guest_mode() for the ARM case. The issue is
> > > that whatever mode (guest page size and VA size) you pick might not be
> > > supported by the host. So, you first to explore what's available (via
> > > for_each_guest_mode()).
> >
> > Actually, that's fixed by using the default mode, which picks the
> > first available
> > mode. I would prefer to use for_each_guest_mode() though, who knows and
> > something fails with some specific guest page size for some reason.
> 
> Okay, will do. I wasn't sure if we did eager page splitting on ARM, so
> I was only planning on making this test for x86_64 initially, hence it

Thanks Ben, just saw that V2 has the for_each_guest_mode() change.
> being in that directory. If ARM rolls with the same behavior, then
> I'll add the for_each_mode bit and move the test up a directory.

No worries, I can take care of that as part of my ARM changes, maybe even
as part of the eager splitting series.

> >
> > >
> > > Thanks,
> > > Ricardo
> > >
> > > >
> > > > >
> > > > > > +
> > > > > > +     return 0;
> > > > > > +}
> > > > > > --
> > > > > > 2.39.1.405.gd4c25cc71f-goog
> > > > > >
  
Ricardo Koller Jan. 24, 2023, 3:54 p.m. UTC | #8
On Mon, Jan 23, 2023 at 10:41:18AM -0800, David Matlack wrote:
> On Fri, Jan 20, 2023 at 12:04:04PM -0800, Ben Gardon wrote:
> > On Fri, Jan 20, 2023 at 6:34 AM Ricardo Koller <ricarkol@google.com> wrote:
> > >
> > ...
> > > > > > > +
> > > > > > > +     run_test(&p);
> > > > > >
> > > > > > Use for_each_guest_mode() to run against all supported guest modes.
> > > > >
> > > > > I'm not sure that would actually improve coverage. None of the page
> > > > > splitting behavior depends on the mode AFAICT.
> > > >
> > > > You need to use for_each_guest_mode() for the ARM case. The issue is
> > > > that whatever mode (guest page size and VA size) you pick might not be
> > > > supported by the host. So, you first to explore what's available (via
> > > > for_each_guest_mode()).
> > >
> > > Actually, that's fixed by using the default mode, which picks the
> > > first available
> > > mode. I would prefer to use for_each_guest_mode() though, who knows and
> > > something fails with some specific guest page size for some reason.
> > 
> > Okay, will do. I wasn't sure if we did eager page splitting on ARM, so
> 
> Ricardo is in the process of upstreaming eager page splitting for ARM:
> 
> https://lore.kernel.org/kvm/20230113035000.480021-1-ricarkol@google.com/
> 
> > I was only planning on making this test for x86_64 initially, hence it
> > being in that directory. If ARM rolls with the same behavior, then
> > I'll add the for_each_mode bit and move the test up a directory.
> 
> In addition to for_each_guest_mode(), KVM/ARM will need to expose page
> size stats so the test can verify the splitting (yet another reason to
> have a common MMU).
> 
> Ricardo, if you're interested in adding page size stats to KVM/ARM ahead
> of the Common MMU, e.g. to test eager page splitting, let me know.

Sure, I can do that. Sounds pretty useful too.

> I
> want to make sure we align on the userspace-visible stat names to avoid
> churn down the road. Specifically, do we want to expose neutral names
> like pages_{pte,pmd,pud} or expand the KVM/x86 list to include all of
> ARM's possible pages sizes like pages_{4k,16k,64k,...} (or both)?

I would prefer the latter, mainly to match the x86 names:

	+	stats->pages_4k = vm_get_stat(vm, "pages_4k");
	+	stats->pages_2m = vm_get_stat(vm, "pages_2m");
	+	stats->pages_1g = vm_get_stat(vm, "pages_1g");
	(from this patch)

but pages_{pte,pmd,pud} would certainly make this test simpler
as it would handle all guest page sizes:

	+	stats->pages_pte = vm_get_stat(vm, "pages_pte");
  
David Matlack Jan. 24, 2023, 5:15 p.m. UTC | #9
On Tue, Jan 24, 2023 at 7:54 AM Ricardo Koller <ricarkol@google.com> wrote:
>
> On Mon, Jan 23, 2023 at 10:41:18AM -0800, David Matlack wrote:
> >
> > Ricardo, if you're interested in adding page size stats to KVM/ARM ahead
> > of the Common MMU, e.g. to test eager page splitting, let me know.
>
> Sure, I can do that. Sounds pretty useful too.
>
> > I
> > want to make sure we align on the userspace-visible stat names to avoid
> > churn down the road. Specifically, do we want to expose neutral names
> > like pages_{pte,pmd,pud} or expand the KVM/x86 list to include all of
> > ARM's possible pages sizes like pages_{4k,16k,64k,...} (or both)?
>
> I would prefer the latter, mainly to match the x86 names:
>
>         +       stats->pages_4k = vm_get_stat(vm, "pages_4k");
>         +       stats->pages_2m = vm_get_stat(vm, "pages_2m");
>         +       stats->pages_1g = vm_get_stat(vm, "pages_1g");
>         (from this patch)

We can always add pages_{pte,pmd,pud} to x86 as aliases of the
existing stats. The series I recently sent out to allow custom names
for stats [1] would make adding the aliases trivial actually.

[1] https://lore.kernel.org/kvm/20230118175300.790835-1-dmatlack@google.com/

>
> but pages_{pte,pmd,pud} would certainly make this test simpler
> as it would handle all guest page sizes:
>
>         +       stats->pages_pte = vm_get_stat(vm, "pages_pte");
>

Yeah pages_{pte,pmd,pud} would certainly make the test simpler.

At this point I'm leaning toward pages_{pte,pmd,pud} to unblock this
testing in an architecture neutral way, and we can add
pages_{4k,16k,...} to ARM in the future using aliases.
  

Patch

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1750f91dd9362..057ebd709e77a 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -76,6 +76,7 @@  TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
 TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
 TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test
 TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test
+TEST_GEN_PROGS_x86_64 += x86_64/page_splitting_test
 TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
 TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
 TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h
index fbc2a79369b8b..4e38a771fa5d1 100644
--- a/tools/testing/selftests/kvm/include/kvm_util_base.h
+++ b/tools/testing/selftests/kvm/include/kvm_util_base.h
@@ -213,6 +213,7 @@  extern const struct vm_guest_mode_params vm_guest_mode_params[];
 int open_path_or_exit(const char *path, int flags);
 int open_kvm_dev_path_or_exit(void);
 
+bool get_kvm_param_bool(const char *param);
 bool get_kvm_intel_param_bool(const char *param);
 bool get_kvm_amd_param_bool(const char *param);
 
@@ -402,6 +403,7 @@  static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name)
 	uint64_t data;
 
 	__vm_get_stat(vm, stat_name, &data, 1);
+
 	return data;
 }
 
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 56d5ea949cbbe..fa6d69f731990 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -80,6 +80,11 @@  static bool get_module_param_bool(const char *module_name, const char *param)
 	TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
 }
 
+/*
+ * Read a boolean module parameter of the base "kvm" module (i.e.
+ * /sys/module/kvm/parameters/<param>); counterpart of the existing
+ * get_kvm_intel_param_bool()/get_kvm_amd_param_bool() helpers below.
+ */
+bool get_kvm_param_bool(const char *param)
+{
+	return get_module_param_bool("kvm", param);
+}
+
 bool get_kvm_intel_param_bool(const char *param)
 {
 	return get_module_param_bool("kvm_intel", param);
diff --git a/tools/testing/selftests/kvm/x86_64/page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/page_splitting_test.c
new file mode 100644
index 0000000000000..3e2ee20adf036
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/page_splitting_test.c
@@ -0,0 +1,314 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty logging page splitting test
+ *
+ * Based on dirty_log_perf.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+
+#include "kvm_util.h"
+#include "test_util.h"
+#include "memstress.h"
+#include "guest_modes.h"
+
+/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/
+#define TEST_HOST_LOOP_N		2UL
+
+static int NR_VCPUS = 2;
+static int NR_SLOTS = 2;
+static int NR_ITERATIONS = 2;
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+/* KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 caps to enable; 0 means legacy GET_DIRTY_LOG behavior. */
+static u64 dirty_log_manual_caps;
+/* Set by the main thread to tell the vCPU worker threads to exit. */
+static bool host_quit;
+/* Current pass number; incremented by the main thread to restart the vCPUs. */
+static int iteration;
+/* Last pass each vCPU completed; written by the vCPU threads, polled by main. */
+static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
+
+/* Snapshot of KVM's per-VM page-size stat counters at one point in the test. */
+struct kvm_page_stats {
+	uint64_t pages_4k;
+	uint64_t pages_2m;
+	uint64_t pages_1g;
+	/* Derived convenience total: pages_2m + pages_1g. */
+	uint64_t hugepages;
+};
+
+/*
+ * Snapshot the VM's 4K/2M/1G page counters into @stats and derive the
+ * total huge page count, dumping the values at pr_debug level.
+ */
+static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats)
+{
+	uint64_t pages_4k = vm_get_stat(vm, "pages_4k");
+	uint64_t pages_2m = vm_get_stat(vm, "pages_2m");
+	uint64_t pages_1g = vm_get_stat(vm, "pages_1g");
+
+	*stats = (struct kvm_page_stats) {
+		.pages_4k = pages_4k,
+		.pages_2m = pages_2m,
+		.pages_1g = pages_1g,
+		.hugepages = pages_2m + pages_1g,
+	};
+
+	pr_debug("Page stats - 4K: %ld 2M: %ld 1G: %ld huge: %ld\n",
+		 stats->pages_4k, stats->pages_2m, stats->pages_1g,
+		 stats->hugepages);
+}
+
+/*
+ * vCPU thread body: run the guest (one pass of dirtying its memory,
+ * ending in a UCALL_SYNC), record the completed pass in
+ * vcpu_last_completed_iteration[], then spin until the main thread
+ * either starts the next pass (by bumping 'iteration') or sets
+ * 'host_quit'.
+ *
+ * Note: the pages_count accumulator from dirty_log_perf has been
+ * dropped; this test only checks page-size stats, never throughput.
+ */
+static void vcpu_worker(struct memstress_vcpu_args *vcpu_args)
+{
+	struct kvm_vcpu *vcpu = vcpu_args->vcpu;
+	int vcpu_idx = vcpu_args->vcpu_idx;
+	struct kvm_run *run;
+	int ret;
+
+	run = vcpu->run;
+
+	while (!READ_ONCE(host_quit)) {
+		int current_iteration = READ_ONCE(iteration);
+
+		ret = _vcpu_run(vcpu);
+
+		TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+		TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC,
+			    "Invalid guest sync status: exit_reason=%s\n",
+			    exit_reason_str(run->exit_reason));
+
+		vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
+
+		/* Wait for the start of the next iteration to be signaled. */
+		while (current_iteration == READ_ONCE(iteration) &&
+		       READ_ONCE(iteration) >= 0 &&
+		       !READ_ONCE(host_quit))
+			;
+	}
+}
+
+/* Command-line-configurable parameters handed to run_test(). */
+struct test_params {
+	/* Backing source for the memstress data slots (-s); HugeTLB by default. */
+	enum vm_mem_backing_src_type backing_src;
+};
+
+/* Spin until every vCPU thread has finished the current iteration. */
+static void wait_for_vcpus(void)
+{
+	int i;
+
+	for (i = 0; i < NR_VCPUS; i++) {
+		while (READ_ONCE(vcpu_last_completed_iteration[i]) != iteration)
+			;
+	}
+}
+
+/*
+ * Start the vCPUs on another pass of dirtying memory and wait for all
+ * of them to complete it.
+ */
+static void run_vcpus_iteration(void)
+{
+	/*
+	 * Incrementing the iteration number will start the vCPUs
+	 * dirtying memory again.
+	 */
+	iteration++;
+	wait_for_vcpus();
+}
+
+/*
+ * Core of the test:
+ *
+ *  1. Populate guest memory and snapshot the page-size stats.
+ *  2. Enable dirty logging and run NR_ITERATIONS dirty/collect passes,
+ *     snapshotting the stats after each dirty pass and (when
+ *     MANUAL_PROTECT is in use) after each clear pass.
+ *  3. Disable dirty logging, re-fault memory, and snapshot again.
+ *  4. Assert that huge pages were fully split at the expected point
+ *     (enable time, or the first clear pass with MANUAL_PROTECT) and
+ *     that the page counts return to their initial values afterwards.
+ */
+static void run_test(struct test_params *p)
+{
+	struct kvm_vm *vm;
+	unsigned long **bitmaps;
+	uint64_t guest_num_pages;
+	uint64_t host_num_pages;
+	uint64_t pages_per_slot;
+	int i;
+	uint64_t total_4k_pages;
+	struct kvm_page_stats stats_populated;
+	struct kvm_page_stats stats_dirty_logging_enabled;
+	struct kvm_page_stats stats_dirty_pass[NR_ITERATIONS];
+	struct kvm_page_stats stats_clear_pass[NR_ITERATIONS];
+	struct kvm_page_stats stats_dirty_logging_disabled;
+	struct kvm_page_stats stats_repopulated;
+
+	vm = memstress_create_vm(VM_MODE_DEFAULT, NR_VCPUS, guest_percpu_mem_size,
+				 NR_SLOTS, p->backing_src, false);
+
+	guest_num_pages = (NR_VCPUS * guest_percpu_mem_size) >> vm->page_shift;
+	guest_num_pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, guest_num_pages);
+	host_num_pages = vm_num_host_pages(VM_MODE_DEFAULT, guest_num_pages);
+	pages_per_slot = host_num_pages / NR_SLOTS;
+
+	bitmaps = memstress_alloc_bitmaps(NR_SLOTS, pages_per_slot);
+
+	if (dirty_log_manual_caps)
+		vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+			      dirty_log_manual_caps);
+
+	/* Start the iterations */
+	iteration = 0;
+	host_quit = false;
+
+	for (i = 0; i < NR_VCPUS; i++)
+		vcpu_last_completed_iteration[i] = -1;
+
+	memstress_start_vcpu_threads(NR_VCPUS, vcpu_worker);
+
+	/* Allow the vCPUs to populate memory (pass 0). */
+	wait_for_vcpus();
+
+	pr_debug("\nGetting stats after populating memory:\n");
+	get_page_stats(vm, &stats_populated);
+
+	/* Enable dirty logging */
+	memstress_enable_dirty_logging(vm, NR_SLOTS);
+
+	pr_debug("\nGetting stats after enabling dirty logging:\n");
+	get_page_stats(vm, &stats_dirty_logging_enabled);
+
+	while (iteration < NR_ITERATIONS) {
+		run_vcpus_iteration();
+
+		pr_debug("\nGetting stats after dirtying memory on pass %d:\n", iteration);
+		get_page_stats(vm, &stats_dirty_pass[iteration - 1]);
+
+		memstress_get_dirty_log(vm, bitmaps, NR_SLOTS);
+
+		if (dirty_log_manual_caps) {
+			memstress_clear_dirty_log(vm, bitmaps, NR_SLOTS, pages_per_slot);
+
+			pr_debug("\nGetting stats after clearing dirty log pass %d:\n", iteration);
+			get_page_stats(vm, &stats_clear_pass[iteration - 1]);
+		}
+	}
+
+	/* Disable dirty logging */
+	memstress_disable_dirty_logging(vm, NR_SLOTS);
+
+	pr_debug("\nGetting stats after disabling dirty logging:\n");
+	get_page_stats(vm, &stats_dirty_logging_disabled);
+
+	/* Run vCPUs again to fault pages back in. */
+	run_vcpus_iteration();
+
+	pr_debug("\nGetting stats after repopulating memory:\n");
+	get_page_stats(vm, &stats_repopulated);
+
+	/*
+	 * Tell the vCPU threads to quit.  No need to manually check that vCPUs
+	 * have stopped running after disabling dirty logging, the join will
+	 * wait for them to exit.
+	 */
+	host_quit = true;
+	memstress_join_vcpu_threads(NR_VCPUS);
+
+	memstress_free_bitmaps(bitmaps, NR_SLOTS);
+	memstress_destroy_vm(vm);
+
+	/* Make assertions about the page counts. */
+	total_4k_pages = stats_populated.pages_4k;
+	total_4k_pages += stats_populated.pages_2m * 512;
+	total_4k_pages += stats_populated.pages_1g * 512 * 512;
+
+	/*
+	 * Check that all huge pages were split. Since large pages can only
+	 * exist in the data slot, and the vCPUs should have dirtied all pages
+	 * in the data slot, there should be no huge pages left after splitting.
+	 * Splitting happens at dirty log enable time without
+	 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass
+	 * with that capability.
+	 */
+	if (dirty_log_manual_caps) {
+		TEST_ASSERT(stats_clear_pass[0].hugepages == 0,
+			    "Unexpected huge page count after splitting. Expected 0, got %ld",
+			    stats_clear_pass[0].hugepages);
+		TEST_ASSERT(stats_clear_pass[0].pages_4k == total_4k_pages,
+			    "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
+			    total_4k_pages, stats_clear_pass[0].pages_4k);
+	} else {
+		TEST_ASSERT(stats_dirty_logging_enabled.hugepages == 0,
+			    "Unexpected huge page count after splitting. Expected 0, got %ld",
+			    stats_dirty_logging_enabled.hugepages);
+		TEST_ASSERT(stats_dirty_logging_enabled.pages_4k == total_4k_pages,
+			    "All memory should be mapped at 4k. Expected %ld 4k pages, got %ld",
+			    total_4k_pages, stats_dirty_logging_enabled.pages_4k);
+	}
+
+	/*
+	 * Once dirty logging is disabled and the vCPUs have touched all their
+	 * memory again, the page counts should be the same as they were
+	 * right after initial population of memory.
+	 */
+	TEST_ASSERT(stats_populated.pages_4k == stats_repopulated.pages_4k,
+		    "4k page count did not return to its initial value after "
+		    "dirty logging. Expected %ld, got %ld",
+		    stats_populated.pages_4k, stats_repopulated.pages_4k);
+	TEST_ASSERT(stats_populated.pages_2m == stats_repopulated.pages_2m,
+		    "2m page count did not return to its initial value after "
+		    "dirty logging. Expected %ld, got %ld",
+		    stats_populated.pages_2m, stats_repopulated.pages_2m);
+	TEST_ASSERT(stats_populated.pages_1g == stats_repopulated.pages_1g,
+		    "1g page count did not return to its initial value after "
+		    "dirty logging. Expected %ld, got %ld",
+		    stats_populated.pages_1g, stats_repopulated.pages_1g);
+}
+
+/*
+ * Print usage and exit.  Note: the original usage line advertised a
+ * "-m mode" flag that is not accepted by getopt ("b:ghs:"); it has been
+ * dropped here to match the actual option set.
+ */
+static void help(char *name)
+{
+	puts("");
+	printf("usage: %s [-h] [-g] [-b vcpu bytes] [-s mem type]\n",
+	       name);
+	puts("");
+	printf(" -g: Do not enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. This\n"
+	       "     makes KVM_GET_DIRTY_LOG clear the dirty log (i.e.\n"
+	       "     KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE is not enabled)\n"
+	       "     and writes will be tracked as soon as dirty logging is\n"
+	       "     enabled on the memslot (i.e. KVM_DIRTY_LOG_INITIALLY_SET\n"
+	       "     is not enabled).\n");
+	printf(" -b: specify the size of the memory region which should be\n"
+	       "     dirtied by each vCPU. e.g. 10M or 3G.\n"
+	       "     (default: 1G)\n");
+	backing_src_help("-s");
+	puts("");
+	exit(0);
+}
+
+int main(int argc, char *argv[])
+{
+	struct test_params p = {
+		/* Default to HugeTLB; the test relies on huge mappings existing. */
+		.backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB,
+	};
+	int opt;
+
+	/* The test is meaningless unless KVM is eagerly splitting huge pages. */
+	TEST_REQUIRE(get_kvm_param_bool("eager_page_split"));
+	TEST_REQUIRE(get_kvm_param_bool("tdp_mmu"));
+
+	/*
+	 * Default to MANUAL_PROTECT mode (split on first clear) when the
+	 * kernel supports it; -g below forces split-at-enable behavior.
+	 */
+	dirty_log_manual_caps =
+		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+	dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+				  KVM_DIRTY_LOG_INITIALLY_SET);
+
+	guest_modes_append_default();
+
+	while ((opt = getopt(argc, argv, "b:ghs:")) != -1) {
+		switch (opt) {
+		case 'b':
+			guest_percpu_mem_size = parse_size(optarg);
+			break;
+		case 'g':
+			dirty_log_manual_caps = 0;
+			break;
+		case 'h':
+			help(argv[0]);
+			break;
+		case 's':
+			p.backing_src = parse_backing_src_type(optarg);
+			break;
+		default:
+			help(argv[0]);
+			break;
+		}
+	}
+
+	if (!is_backing_src_hugetlb(p.backing_src)) {
+		/* Added the missing trailing newline to the skip message. */
+		pr_info("This test will only work reliably with HugeTLB memory. "
+			"It can work with THP, but that is best effort.\n");
+		return KSFT_SKIP;
+	}
+
+	run_test(&p);
+
+	return 0;
+}