From: Sean Christopherson <seanjc@google.com>
Extend guest_memfd to allow backing guest memory with transparent
hugepages. Require userspace to opt in via a flag even though there's no
known/anticipated use case for forcing small pages as THP is optional,
i.e. to avoid ending up in a situation where userspace is unaware that
KVM can't provide hugepages.
For simplicity, require the guest_memfd size to be a multiple of the
hugepage size, e.g. so that KVM doesn't need to do bounds checking when
deciding whether or not to allocate a huge folio.
When reporting the max order when KVM gets a pfn from guest_memfd, force
order-0 pages if the hugepage is not fully contained by the memslot
binding, e.g. if userspace requested hugepages but punches a hole in the
memslot bindings in order to emulate x86's VGA hole.
Signed-off-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20231027182217.3615211-18-seanjc@google.com
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
Documentation/virt/kvm/api.rst | 7 ++++
include/uapi/linux/kvm.h | 2 +
virt/kvm/guest_memfd.c | 73 ++++++++++++++++++++++++++++++----
3 files changed, 75 insertions(+), 7 deletions(-)
@@ -6312,6 +6312,8 @@ and cannot be resized (guest_memfd files do however support PUNCH_HOLE).
__u64 reserved[6];
};
+ #define KVM_GUEST_MEMFD_ALLOW_HUGEPAGE (1ULL << 0)
+
Conceptually, the inode backing a guest_memfd file represents physical memory,
i.e. is coupled to the virtual machine as a thing, not to a "struct kvm". The
file itself, which is bound to a "struct kvm", is that instance's view of the
@@ -6328,6 +6330,11 @@ most one mapping per page, i.e. binding multiple memory regions to a single
guest_memfd range is not allowed (any number of memory regions can be bound to
a single guest_memfd file, but the bound ranges must not overlap).
+If KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is set in flags, KVM will attempt to allocate
+and map hugepages for the guest_memfd file. This is currently best effort. If
+KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is set, the size must be aligned to the maximum
+transparent hugepage size supported by the kernel.
+
See KVM_SET_USER_MEMORY_REGION2 for additional details.
4.143 KVM_MEMORY_MAPPING
@@ -2317,6 +2317,8 @@ struct kvm_create_guest_memfd {
__u64 reserved[6];
};
+#define KVM_GUEST_MEMFD_ALLOW_HUGEPAGE (1ULL << 0)
+
#define KVM_MEMORY_MAPPING _IOWR(KVMIO, 0xd5, struct kvm_memory_mapping)
struct kvm_memory_mapping {
@@ -13,14 +13,47 @@ struct kvm_gmem {
struct list_head entry;
};
-static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+static struct folio *kvm_gmem_get_huge_folio(struct inode *inode, pgoff_t index)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ unsigned long huge_index = round_down(index, HPAGE_PMD_NR);
+ unsigned long flags = (unsigned long)inode->i_private;
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp = mapping_gfp_mask(mapping);
struct folio *folio;
- /* TODO: Support huge pages. */
- folio = filemap_grab_folio(inode->i_mapping, index);
- if (IS_ERR_OR_NULL(folio))
+ if (!(flags & KVM_GUEST_MEMFD_ALLOW_HUGEPAGE))
+ return NULL;
+
+ if (filemap_range_has_page(mapping, huge_index << PAGE_SHIFT,
+ (huge_index + HPAGE_PMD_NR - 1) << PAGE_SHIFT))
+ return NULL;
+
+ folio = filemap_alloc_folio(gfp, HPAGE_PMD_ORDER);
+ if (!folio)
+ return NULL;
+
+ if (filemap_add_folio(mapping, folio, huge_index, gfp)) {
+ folio_put(folio);
return NULL;
+ }
+
+ return folio;
+#else
+ return NULL;
+#endif
+}
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
+{
+ struct folio *folio;
+
+ folio = kvm_gmem_get_huge_folio(inode, index);
+ if (!folio) {
+ folio = filemap_grab_folio(inode->i_mapping, index);
+ if (IS_ERR_OR_NULL(folio))
+ return NULL;
+ }
/*
* Use the up-to-date flag to track whether or not the memory has been
@@ -363,6 +396,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
inode->i_mode |= S_IFREG;
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_large_folios(inode->i_mapping);
mapping_set_unmovable(inode->i_mapping);
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
@@ -388,12 +422,21 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
u64 flags = args->flags;
u64 valid_flags = 0;
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ valid_flags |= KVM_GUEST_MEMFD_ALLOW_HUGEPAGE;
+
if (flags & ~valid_flags)
return -EINVAL;
if (size <= 0 || !PAGE_ALIGNED(size))
return -EINVAL;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if ((flags & KVM_GUEST_MEMFD_ALLOW_HUGEPAGE) &&
+ !IS_ALIGNED(size, HPAGE_PMD_SIZE))
+ return -EINVAL;
+#endif
+
return __kvm_gmem_create(kvm, size, flags);
}
@@ -488,7 +531,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
{
- pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+ pgoff_t index, huge_index;
struct kvm_gmem *gmem;
struct folio *folio;
struct page *page;
@@ -501,6 +544,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gmem = file->private_data;
+ index = gfn - slot->base_gfn + slot->gmem.pgoff;
if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
r = -EIO;
goto out_fput;
@@ -520,9 +564,24 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
page = folio_file_page(folio, index);
*pfn = page_to_pfn(page);
- if (max_order)
- *max_order = 0;
+ if (!max_order)
+ goto success;
+
+ *max_order = compound_order(compound_head(page));
+ if (!*max_order)
+ goto success;
+ /*
+ * The folio can be mapped with a hugepage if and only if the folio is
+ * fully contained by the range the memslot is bound to. Note, the
+ * caller is responsible for handling gfn alignment, this only deals
+ * with the file binding.
+ */
+ huge_index = ALIGN(index, 1ull << *max_order);
+ if (huge_index < ALIGN(slot->gmem.pgoff, 1ull << *max_order) ||
+ huge_index + (1ull << *max_order) > slot->gmem.pgoff + slot->npages)
+ *max_order = 0;
+success:
r = 0;
out_unlock: