[v13,10/24] gunyah: vm_mgr: Add/remove user memory regions
Commit Message
When launching a virtual machine, Gunyah userspace allocates memory for
the guest and informs Gunyah about these memory regions through
the SET_USER_MEMORY_REGION ioctl.
Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
drivers/virt/gunyah/Makefile | 2 +-
drivers/virt/gunyah/vm_mgr.c | 59 +++++++-
drivers/virt/gunyah/vm_mgr.h | 26 ++++
drivers/virt/gunyah/vm_mgr_mm.c | 236 ++++++++++++++++++++++++++++++++
include/uapi/linux/gunyah.h | 37 +++++
5 files changed, 356 insertions(+), 4 deletions(-)
create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
Comments
Hi Elliot,
On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> the SET_USER_MEMORY_REGION ioctl.
>
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
> drivers/virt/gunyah/Makefile | 2 +-
> drivers/virt/gunyah/vm_mgr.c | 59 +++++++-
> drivers/virt/gunyah/vm_mgr.h | 26 ++++
> drivers/virt/gunyah/vm_mgr_mm.c | 236 ++++++++++++++++++++++++++++++++
> include/uapi/linux/gunyah.h | 37 +++++
> 5 files changed, 356 insertions(+), 4 deletions(-)
> create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
[...]
> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
> +{
> + struct gh_vm_mem *mapping, *tmp_mapping;
> + struct page *curr_page, *prev_page;
> + struct gh_rm_mem_parcel *parcel;
> + int i, j, pinned, ret = 0;
> + unsigned int gup_flags;
> + size_t entry_size;
> + u16 vmid;
> +
> + if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
> + !PAGE_ALIGNED(region->userspace_addr) ||
> + !PAGE_ALIGNED(region->guest_phys_addr))
> + return -EINVAL;
> +
> + if (overflows_type(region->guest_phys_addr + region->memory_size, u64))
> + return -EOVERFLOW;
> +
> + ret = mutex_lock_interruptible(&ghvm->mm_lock);
> + if (ret)
> + return ret;
> +
> + mapping = __gh_vm_mem_find_by_label(ghvm, region->label);
> + if (mapping) {
> + ret = -EEXIST;
> + goto unlock;
> + }
> +
> + list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
> + if (gh_vm_mem_overlap(tmp_mapping, region->guest_phys_addr,
> + region->memory_size)) {
> + ret = -EEXIST;
> + goto unlock;
> + }
> + }
> +
> + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL_ACCOUNT);
> + if (!mapping) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + mapping->guest_phys_addr = region->guest_phys_addr;
> + mapping->npages = region->memory_size >> PAGE_SHIFT;
> + parcel = &mapping->parcel;
> + parcel->label = region->label;
> + parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
> + parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
> +
> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
> + if (ret)
> + goto free_mapping;
> +
> + mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
> + if (!mapping->pages) {
> + ret = -ENOMEM;
> + mapping->npages = 0; /* update npages for reclaim */
> + goto unlock_pages;
> + }
> +
> + gup_flags = FOLL_LONGTERM;
> + if (region->flags & GH_MEM_ALLOW_WRITE)
> + gup_flags |= FOLL_WRITE;
> +
> + pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
> + gup_flags, mapping->pages);
> + if (pinned < 0) {
> + ret = pinned;
> + goto free_pages;
> + } else if (pinned != mapping->npages) {
> + ret = -EFAULT;
> + mapping->npages = pinned; /* update npages for reclaim */
> + goto unpin_pages;
> + }
Sorry if I missed it, but I still don't see where you reject file mappings
here.
This is also the wrong interface for upstream. Please get involved with
the fd-based guest memory discussions [1] and port your series to that.
This patch cannot be merged in its current form.
Will
[1] https://lore.kernel.org/kvm/20221202061347.1070246-1-chao.p.peng@linux.intel.com/
On 5/19/2023 4:59 AM, Will Deacon wrote:
> Hi Elliot,
>
> On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
>> When launching a virtual machine, Gunyah userspace allocates memory for
>> the guest and informs Gunyah about these memory regions through
>> the SET_USER_MEMORY_REGION ioctl.
>>
>> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>> drivers/virt/gunyah/Makefile | 2 +-
>> drivers/virt/gunyah/vm_mgr.c | 59 +++++++-
>> drivers/virt/gunyah/vm_mgr.h | 26 ++++
>> drivers/virt/gunyah/vm_mgr_mm.c | 236 ++++++++++++++++++++++++++++++++
>> include/uapi/linux/gunyah.h | 37 +++++
>> 5 files changed, 356 insertions(+), 4 deletions(-)
>> create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
>
> [...]
>
>> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
>> +{
>> + struct gh_vm_mem *mapping, *tmp_mapping;
>> + struct page *curr_page, *prev_page;
>> + struct gh_rm_mem_parcel *parcel;
>> + int i, j, pinned, ret = 0;
>> + unsigned int gup_flags;
>> + size_t entry_size;
>> + u16 vmid;
>> +
>> + if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
>> + !PAGE_ALIGNED(region->userspace_addr) ||
>> + !PAGE_ALIGNED(region->guest_phys_addr))
>> + return -EINVAL;
>> +
>> + if (overflows_type(region->guest_phys_addr + region->memory_size, u64))
>> + return -EOVERFLOW;
>> +
>> + ret = mutex_lock_interruptible(&ghvm->mm_lock);
>> + if (ret)
>> + return ret;
>> +
>> + mapping = __gh_vm_mem_find_by_label(ghvm, region->label);
>> + if (mapping) {
>> + ret = -EEXIST;
>> + goto unlock;
>> + }
>> +
>> + list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
>> + if (gh_vm_mem_overlap(tmp_mapping, region->guest_phys_addr,
>> + region->memory_size)) {
>> + ret = -EEXIST;
>> + goto unlock;
>> + }
>> + }
>> +
>> + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL_ACCOUNT);
>> + if (!mapping) {
>> + ret = -ENOMEM;
>> + goto unlock;
>> + }
>> +
>> + mapping->guest_phys_addr = region->guest_phys_addr;
>> + mapping->npages = region->memory_size >> PAGE_SHIFT;
>> + parcel = &mapping->parcel;
>> + parcel->label = region->label;
>> + parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
>> + parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
>> +
>> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
>> + if (ret)
>> + goto free_mapping;
>> +
>> + mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
>> + if (!mapping->pages) {
>> + ret = -ENOMEM;
>> + mapping->npages = 0; /* update npages for reclaim */
>> + goto unlock_pages;
>> + }
>> +
>> + gup_flags = FOLL_LONGTERM;
>> + if (region->flags & GH_MEM_ALLOW_WRITE)
>> + gup_flags |= FOLL_WRITE;
>> +
>> + pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
>> + gup_flags, mapping->pages);
>> + if (pinned < 0) {
>> + ret = pinned;
>> + goto free_pages;
>> + } else if (pinned != mapping->npages) {
>> + ret = -EFAULT;
>> + mapping->npages = pinned; /* update npages for reclaim */
>> + goto unpin_pages;
>> + }
>
> Sorry if I missed it, but I still don't see where you reject file mappings
> here.
>
Sure, I can reject file mappings. I didn't catch that was the ask
previously and thought it was only a comment about behavior of file
mappings.
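
A minimal sketch of the file-mapping rejection being discussed could look like
the following. This is illustrative only: the helper name and its placement
ahead of the pin_user_pages_fast() call are assumptions, not part of the
posted patch.

/* Walk the VMAs covering the region and refuse anything that is not
 * anonymous memory; file-backed or unmapped ranges are rejected.
 */
static int gh_vm_mem_check_anonymous(unsigned long start, unsigned long size)
{
	struct vm_area_struct *vma;
	unsigned long addr = start;
	int ret = 0;

	mmap_read_lock(current->mm);
	while (addr < start + size) {
		vma = find_vma(current->mm, addr);
		if (!vma || vma->vm_start > addr || !vma_is_anonymous(vma)) {
			ret = -EINVAL;
			break;
		}
		addr = vma->vm_end;
	}
	mmap_read_unlock(current->mm);

	return ret;
}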
> This is also the wrong interface for upstream. Please get involved with
> the fd-based guest memory discussions [1] and port your series to that.
>
The user interface design for *shared* memory aligns with
KVM_SET_USER_MEMORY_REGION.
I understood we want to use restricted memfd for giving guest-private
memory (Gunyah calls this "lending memory"). When I went through the
changes, I gathered KVM is using restricted memfd only for guest-private
memory and not for shared memory. Thus, I dropped support for lending
memory to the guest VM and only retained the shared memory support in
this series. I'd like to merge what we can today and introduce the
guest-private memory support in tandem with the restricted memfd; I
don't see much reason to delay the series.
I briefly evaluated and picked up the arm64/pKVM support that Fuad shared
[2] and found it should work for Gunyah, although I only did a build test
at the time. I don't have any comments on the base restricted_memfd
support, and as far as I can tell Fuad has not posted [2] on the mailing
lists yet.
> This patch cannot be merged in its current form.
>
I am a little confused about why the implementation for sharing memory
with the VM is being rejected. Besides rejecting file mappings, are any
other changes needed for this to be accepted?
- Elliot
> Will
>
> [1] https://lore.kernel.org/kvm/20221202061347.1070246-1-chao.p.peng@linux.intel.com/
[2]:
https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/fdmem-v10-core
Hi Elliot,
[+Quentin since he's looked at the MMU notifiers]
Sorry for the slow response, I got buried in email during a week away.
On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
> On 5/19/2023 4:59 AM, Will Deacon wrote:
> > On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
> > > + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
> > > + if (ret)
> > > + goto free_mapping;
> > > +
> > > + mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
> > > + if (!mapping->pages) {
> > > + ret = -ENOMEM;
> > > + mapping->npages = 0; /* update npages for reclaim */
> > > + goto unlock_pages;
> > > + }
> > > +
> > > + gup_flags = FOLL_LONGTERM;
> > > + if (region->flags & GH_MEM_ALLOW_WRITE)
> > > + gup_flags |= FOLL_WRITE;
> > > +
> > > + pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
> > > + gup_flags, mapping->pages);
> > > + if (pinned < 0) {
> > > + ret = pinned;
> > > + goto free_pages;
> > > + } else if (pinned != mapping->npages) {
> > > + ret = -EFAULT;
> > > + mapping->npages = pinned; /* update npages for reclaim */
> > > + goto unpin_pages;
> > > + }
> >
> > Sorry if I missed it, but I still don't see where you reject file mappings
> > here.
> >
>
> Sure, I can reject file mappings. I didn't catch that was the ask previously
> and thought it was only a comment about behavior of file mappings.
I thought the mention of filesystem corruption was clear enough! It's
definitely something we shouldn't allow.
> > This is also the wrong interface for upstream. Please get involved with
> > the fd-based guest memory discussions [1] and port your series to that.
> >
>
> The user interface design for *shared* memory aligns with
> KVM_SET_USER_MEMORY_REGION.
I don't think it does. For example, file mappings don't work (as above),
you're placing additional rlimit requirements on the caller, read-only
memslots are not functional, the memory cannot be swapped or migrated,
dirty logging doesn't work etc. pKVM is in the same boat, but that's why
we're not upstreaming this part in its current form.
> I understood we want to use restricted memfd for giving guest-private memory
> (Gunyah calls this "lending memory"). When I went through the changes, I
> gathered KVM is using restricted memfd only for guest-private memory and not
> for shared memory. Thus, I dropped support for lending memory to the guest
> VM and only retained the shared memory support in this series. I'd like to
> merge what we can today and introduce the guest-private memory support in
> tandem with the restricted memfd; I don't see much reason to delay the
> series.
Right, protected guests will use the new restricted memfd ("guest mem"
now, I think?), but non-protected guests should implement the existing
interface *without* the need for the GUP pin on guest memory pages. Yes,
that means full support for MMU notifiers so that these pages can be
managed properly by the host kernel. We're working on that for pKVM, but
it requires a more flexible form of memory sharing over what we currently
have so that e.g. the zero page can be shared between multiple entities.
Will
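
For reference, the MMU-notifier hooks referred to above take roughly the
following shape. This is a sketch only: the mn field in struct gh_vm and the
gh_vm_unmap_gfn_range() helper are hypothetical and not part of this series.

#include <linux/mmu_notifier.h>

static int gh_mn_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct gh_vm *ghvm = container_of(mn, struct gh_vm, mn);

	/* Tear down the stage-2 mappings for [range->start, range->end) so
	 * the host mm is free to migrate, swap, or replace those pages. A
	 * real implementation must also honor mmu_notifier_range_blockable().
	 */
	return gh_vm_unmap_gfn_range(ghvm, range->start, range->end);
}

static const struct mmu_notifier_ops gh_mn_ops = {
	.invalidate_range_start = gh_mn_invalidate_range_start,
};

static int gh_vm_register_mn(struct gh_vm *ghvm)
{
	ghvm->mn.ops = &gh_mn_ops;
	return mmu_notifier_register(&ghvm->mn, ghvm->mm);
}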
On 5/9/23 3:47 PM, Elliot Berman wrote:
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> the SET_USER_MEMORY_REGION ioctl.
>
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Two minor comments below. In any case:
Reviewed-by: Alex Elder <elder@linaro.org>
> ---
> drivers/virt/gunyah/Makefile | 2 +-
> drivers/virt/gunyah/vm_mgr.c | 59 +++++++-
> drivers/virt/gunyah/vm_mgr.h | 26 ++++
> drivers/virt/gunyah/vm_mgr_mm.c | 236 ++++++++++++++++++++++++++++++++
> include/uapi/linux/gunyah.h | 37 +++++
> 5 files changed, 356 insertions(+), 4 deletions(-)
> create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
>
> diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
> index e47e25895299..bacf78b8fa33 100644
> --- a/drivers/virt/gunyah/Makefile
> +++ b/drivers/virt/gunyah/Makefile
> @@ -1,4 +1,4 @@
> # SPDX-License-Identifier: GPL-2.0
>
> -gunyah-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
> +gunyah-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
> obj-$(CONFIG_GUNYAH) += gunyah.o
> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
> index a43401cb34f7..297427952b8c 100644
> --- a/drivers/virt/gunyah/vm_mgr.c
> +++ b/drivers/virt/gunyah/vm_mgr.c
> @@ -15,6 +15,8 @@
>
> #include "vm_mgr.h"
>
> +static void gh_vm_free(struct work_struct *work);
> +
You could just define gh_vm_free() here rather than declaring
and defining it later.
> static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
> {
> struct gh_vm *ghvm;
. . .
> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
> new file mode 100644
> index 000000000000..91109bbf36b3
> --- /dev/null
> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
> @@ -0,0 +1,236 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
> + */
> +
> +#define pr_fmt(fmt) "gh_vm_mgr: " fmt
> +
> +#include <linux/gunyah_rsc_mgr.h>
> +#include <linux/mm.h>
> +
> +#include <uapi/linux/gunyah.h>
> +
> +#include "vm_mgr.h"
> +
> +static bool pages_are_mergeable(struct page *a, struct page *b)
> +{
> + if (page_to_pfn(a) + 1 != page_to_pfn(b))
> + return false;
> + if (!zone_device_pages_have_same_pgmap(a, b))
> + return false;
> + return true;
Maybe just:
return zone_device_pages_have_same_pgmap(a, b);
> +}
> +
> +static bool gh_vm_mem_overlap(struct gh_vm_mem *a, u64 addr, u64 size)
> +{
> + u64 a_end = a->guest_phys_addr + (a->npages << PAGE_SHIFT);
> + u64 end = addr + size;
> +
> + return a->guest_phys_addr < end && addr < a_end;
> +}
> +
. . .
On 6/5/2023 7:18 AM, Will Deacon wrote:
> Hi Elliot,
>
> [+Quentin since he's looked at the MMU notifiers]
>
> Sorry for the slow response, I got buried in email during a week away.
>
> On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
>> On 5/19/2023 4:59 AM, Will Deacon wrote:
>>> On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
>>>> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
>>>> + if (ret)
>>>> + goto free_mapping;
>>>> +
>>>> + mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
>>>> + if (!mapping->pages) {
>>>> + ret = -ENOMEM;
>>>> + mapping->npages = 0; /* update npages for reclaim */
>>>> + goto unlock_pages;
>>>> + }
>>>> +
>>>> + gup_flags = FOLL_LONGTERM;
>>>> + if (region->flags & GH_MEM_ALLOW_WRITE)
>>>> + gup_flags |= FOLL_WRITE;
>>>> +
>>>> + pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
>>>> + gup_flags, mapping->pages);
>>>> + if (pinned < 0) {
>>>> + ret = pinned;
>>>> + goto free_pages;
>>>> + } else if (pinned != mapping->npages) {
>>>> + ret = -EFAULT;
>>>> + mapping->npages = pinned; /* update npages for reclaim */
>>>> + goto unpin_pages;
>>>> + }
>>>
>>> Sorry if I missed it, but I still don't see where you reject file mappings
>>> here.
>>>
>>
>> Sure, I can reject file mappings. I didn't catch that was the ask previously
>> and thought it was only a comment about behavior of file mappings.
>
> I thought the mention of filesystem corruption was clear enough! It's
> definitely something we shouldn't allow.
>
>>> This is also the wrong interface for upstream. Please get involved with
>>> the fd-based guest memory discussions [1] and port your series to that.
>>>
>>
>> The user interface design for *shared* memory aligns with
>> KVM_SET_USER_MEMORY_REGION.
>
> I don't think it does. For example, file mappings don't work (as above),
> you're placing additional rlimit requirements on the caller, read-only
> memslots are not functional, the memory cannot be swapped or migrated,
> dirty logging doesn't work etc. pKVM is in the same boat, but that's why
> we're not upstreaming this part in its current form.
>
I thought pKVM was only holding off on upstreaming changes related to
guest-private memory?
>> I understood we want to use restricted memfd for giving guest-private memory
>> (Gunyah calls this "lending memory"). When I went through the changes, I
>> gathered KVM is using restricted memfd only for guest-private memory and not
>> for shared memory. Thus, I dropped support for lending memory to the guest
>> VM and only retained the shared memory support in this series. I'd like to
>> merge what we can today and introduce the guest-private memory support in
>> tandem with the restricted memfd; I don't see much reason to delay the
>> series.
>
> Right, protected guests will use the new restricted memfd ("guest mem"
> now, I think?), but non-protected guests should implement the existing
> interface *without* the need for the GUP pin on guest memory pages. Yes,
> that means full support for MMU notifiers so that these pages can be
> managed properly by the host kernel. We're working on that for pKVM, but
> it requires a more flexible form of memory sharing over what we currently
> have so that e.g. the zero page can be shared between multiple entities.
Gunyah doesn't support swapping pages out while the guest is running and
the design of Gunyah isn't made to give host kernel full control over
the S2 page table for its guests. As best I can tell from reading the
respective drivers, ACRN and Nitro Enclaves both GUP pin guest memory
pages prior to giving them to the guest, so I don't think this
requirement from Gunyah is particularly unusual.
On 6/5/2023 7:18 AM, Will Deacon wrote:
> Hi Elliot,
>
> [+Quentin since he's looked at the MMU notifiers]
>
> Sorry for the slow response, I got buried in email during a week away.
>
> On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
>> On 5/19/2023 4:59 AM, Will Deacon wrote:
>>> On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
>>>> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
>>>> + if (ret)
>>>> + goto free_mapping;
>>>> +
>>>> + mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
>>>> + if (!mapping->pages) {
>>>> + ret = -ENOMEM;
>>>> + mapping->npages = 0; /* update npages for reclaim */
>>>> + goto unlock_pages;
>>>> + }
>>>> +
>>>> + gup_flags = FOLL_LONGTERM;
>>>> + if (region->flags & GH_MEM_ALLOW_WRITE)
>>>> + gup_flags |= FOLL_WRITE;
>>>> +
>>>> + pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
>>>> + gup_flags, mapping->pages);
>>>> + if (pinned < 0) {
>>>> + ret = pinned;
>>>> + goto free_pages;
>>>> + } else if (pinned != mapping->npages) {
>>>> + ret = -EFAULT;
>>>> + mapping->npages = pinned; /* update npages for reclaim */
>>>> + goto unpin_pages;
>>>> + }
>>>
>>> Sorry if I missed it, but I still don't see where you reject file mappings
>>> here.
>>>
>>
>> Sure, I can reject file mappings. I didn't catch that was the ask previously
>> and thought it was only a comment about behavior of file mappings.
>
> I thought the mention of filesystem corruption was clear enough! It's
> definitely something we shouldn't allow.
>
I tried rejecting file mappings, but this breaks the memfd-backed memory
that crosvm uses. I didn't understand the corruption vector you were
concerned about for filesystem-backed mappings. I ran a few basic
experiments with memory mappings backed by a real filesystem and didn't
observe corruption, but maybe my experiments weren't right.
[snip; response to other comments in
https://lore.kernel.org/all/3bd86221-ee2e-d157-009b-11f6ada98537@quicinc.com/]
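
The tension here is that crosvm's guest memory comes from memfd_create(),
which is file-backed (shmem) as far as the VMA is concerned, so a blanket
file-mapping check would also reject it. A check along the lines below could
tell the two apart; the helper name is hypothetical and this is only a
sketch, not something from the posted series.

/* Allow anonymous and memfd/shmem-backed VMAs; reject mappings backed by
 * a regular filesystem, where long-term pins can interfere with writeback.
 */
static bool gh_vma_is_pinnable(struct vm_area_struct *vma)
{
	if (vma_is_anonymous(vma))
		return true;
	if (vma_is_shmem(vma))		/* covers memfd_create() mappings */
		return true;
	return false;
}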
On 6/7/2023 8:54 AM, Elliot Berman wrote:
>
>
> On 6/5/2023 7:18 AM, Will Deacon wrote:
>> Hi Elliot,
>>
>> [+Quentin since he's looked at the MMU notifiers]
>>
>> Sorry for the slow response, I got buried in email during a week away.
>>
>> On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
>>> On 5/19/2023 4:59 AM, Will Deacon wrote:
>>>> On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
>>>>> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
>>>>> + if (ret)
>>>>> + goto free_mapping;
>>>>> +
>>>>> + mapping->pages = kcalloc(mapping->npages,
>>>>> sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
>>>>> + if (!mapping->pages) {
>>>>> + ret = -ENOMEM;
>>>>> + mapping->npages = 0; /* update npages for reclaim */
>>>>> + goto unlock_pages;
>>>>> + }
>>>>> +
>>>>> + gup_flags = FOLL_LONGTERM;
>>>>> + if (region->flags & GH_MEM_ALLOW_WRITE)
>>>>> + gup_flags |= FOLL_WRITE;
>>>>> +
>>>>> + pinned = pin_user_pages_fast(region->userspace_addr,
>>>>> mapping->npages,
>>>>> + gup_flags, mapping->pages);
>>>>> + if (pinned < 0) {
>>>>> + ret = pinned;
>>>>> + goto free_pages;
>>>>> + } else if (pinned != mapping->npages) {
>>>>> + ret = -EFAULT;
>>>>> + mapping->npages = pinned; /* update npages for reclaim */
>>>>> + goto unpin_pages;
>>>>> + }
>>>>
>>>> Sorry if I missed it, but I still don't see where you reject file
>>>> mappings
>>>> here.
>>>>
>>>
>>> Sure, I can reject file mappings. I didn't catch that was the ask
>>> previously
>>> and thought it was only a comment about behavior of file mappings.
>>
>> I thought the mention of filesystem corruption was clear enough! It's
>> definitely something we shouldn't allow.
>>
>>>> This is also the wrong interface for upstream. Please get involved with
>>>> the fd-based guest memory discussions [1] and port your series to that.
>>>>
>>>
>>> The user interface design for *shared* memory aligns with
>>> KVM_SET_USER_MEMORY_REGION.
>>
>> I don't think it does. For example, file mappings don't work (as above),
>> you're placing additional rlimit requirements on the caller, read-only
>> memslots are not functional, the memory cannot be swapped or migrated,
>> dirty logging doesn't work etc. pKVM is in the same boat, but that's why
>> we're not upstreaming this part in its current form.
>>
>
> I thought pKVM was only holding off on upstreaming changes related to
> guest-private memory?
>
>>> I understood we want to use restricted memfd for giving guest-private
>>> memory
>>> (Gunyah calls this "lending memory"). When I went through the changes, I
>>> gathered KVM is using restricted memfd only for guest-private memory
>>> and not
>>> for shared memory. Thus, I dropped support for lending memory to the
>>> guest
>>> VM and only retained the shared memory support in this series. I'd
>>> like to
>>> merge what we can today and introduce the guest-private memory
>>> support in
>>> tandem with the restricted memfd; I don't see much reason to delay the
>>> series.
>>
>> Right, protected guests will use the new restricted memfd ("guest mem"
>> now, I think?), but non-protected guests should implement the existing
>> interface *without* the need for the GUP pin on guest memory pages. Yes,
>> that means full support for MMU notifiers so that these pages can be
>> managed properly by the host kernel. We're working on that for pKVM, but
>> it requires a more flexible form of memory sharing over what we currently
>> have so that e.g. the zero page can be shared between multiple entities.
>
> Gunyah doesn't support swapping pages out while the guest is running and
> the design of Gunyah isn't made to give host kernel full control over
> the S2 page table for its guests. As best I can tell from reading the
> respective drivers, ACRN and Nitro Enclaves both GUP pin guest memory
> pages prior to giving them to the guest, so I don't think this
> requirement from Gunyah is particularly unusual.
>
I read/dug into mmu notifiers more and I don't think it matches with
Gunyah's features today. We don't allow the host to freely manage VM's
pages because it requires the guest VM to have a level of trust on the
host. Once a page is given to the guest, it's done for the lifetime of
the VM. Allowing the host to replace pages in the guest memory map isn't
part of any VM's security model that we run in Gunyah. With that
requirement, longterm pinning looks like the correct approach to me.
Thanks,
Elliot
Hi Will,
On 6/22/2023 4:56 PM, Elliot Berman wrote:
>
>
> On 6/7/2023 8:54 AM, Elliot Berman wrote:
>>
>>
>> On 6/5/2023 7:18 AM, Will Deacon wrote:
>>> Hi Elliot,
>>>
>>> [+Quentin since he's looked at the MMU notifiers]
>>>
>>> Sorry for the slow response, I got buried in email during a week away.
>>>
>>> On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
>>>> On 5/19/2023 4:59 AM, Will Deacon wrote:
>>>>> On Tue, May 09, 2023 at 01:47:47PM -0700, Elliot Berman wrote:
>>>>>> + ret = account_locked_vm(ghvm->mm, mapping->npages, true);
>>>>>> + if (ret)
>>>>>> + goto free_mapping;
>>>>>> +
>>>>>> + mapping->pages = kcalloc(mapping->npages,
>>>>>> sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
>>>>>> + if (!mapping->pages) {
>>>>>> + ret = -ENOMEM;
>>>>>> + mapping->npages = 0; /* update npages for reclaim */
>>>>>> + goto unlock_pages;
>>>>>> + }
>>>>>> +
>>>>>> + gup_flags = FOLL_LONGTERM;
>>>>>> + if (region->flags & GH_MEM_ALLOW_WRITE)
>>>>>> + gup_flags |= FOLL_WRITE;
>>>>>> +
>>>>>> + pinned = pin_user_pages_fast(region->userspace_addr,
>>>>>> mapping->npages,
>>>>>> + gup_flags, mapping->pages);
>>>>>> + if (pinned < 0) {
>>>>>> + ret = pinned;
>>>>>> + goto free_pages;
>>>>>> + } else if (pinned != mapping->npages) {
>>>>>> + ret = -EFAULT;
>>>>>> + mapping->npages = pinned; /* update npages for reclaim */
>>>>>> + goto unpin_pages;
>>>>>> + }
>>>>>
>>>>> Sorry if I missed it, but I still don't see where you reject file
>>>>> mappings
>>>>> here.
>>>>>
>>>>
>>>> Sure, I can reject file mappings. I didn't catch that was the ask
>>>> previously
>>>> and thought it was only a comment about behavior of file mappings.
>>>
>>> I thought the mention of filesystem corruption was clear enough! It's
>>> definitely something we shouldn't allow.
>>>
>>>>> This is also the wrong interface for upstream. Please get involved
>>>>> with
>>>>> the fd-based guest memory discussions [1] and port your series to
>>>>> that.
>>>>>
>>>>
>>>> The user interface design for *shared* memory aligns with
>>>> KVM_SET_USER_MEMORY_REGION.
>>>
>>> I don't think it does. For example, file mappings don't work (as above),
>>> you're placing additional rlimit requirements on the caller, read-only
>>> memslots are not functional, the memory cannot be swapped or migrated,
>>> dirty logging doesn't work etc. pKVM is in the same boat, but that's why
>>> we're not upstreaming this part in its current form.
>>>
>>
>> I thought pKVM was only holding off on upstreaming changes related to
>> guest-private memory?
>>
>>>> I understood we want to use restricted memfd for giving
>>>> guest-private memory
>>>> (Gunyah calls this "lending memory"). When I went through the
>>>> changes, I
>>>> gathered KVM is using restricted memfd only for guest-private memory
>>>> and not
>>>> for shared memory. Thus, I dropped support for lending memory to the
>>>> guest
>>>> VM and only retained the shared memory support in this series. I'd
>>>> like to
>>>> merge what we can today and introduce the guest-private memory
>>>> support in
>>>> tandem with the restricted memfd; I don't see much reason to delay the
>>>> series.
>>>
>>> Right, protected guests will use the new restricted memfd ("guest mem"
>>> now, I think?), but non-protected guests should implement the existing
>>> interface *without* the need for the GUP pin on guest memory pages. Yes,
>>> that means full support for MMU notifiers so that these pages can be
>>> managed properly by the host kernel. We're working on that for pKVM, but
>>> it requires a more flexible form of memory sharing over what we
>>> currently
>>> have so that e.g. the zero page can be shared between multiple entities.
>>
>> Gunyah doesn't support swapping pages out while the guest is running
>> and the design of Gunyah isn't made to give host kernel full control
>> over the S2 page table for its guests. As best I can tell from reading
>> the respective drivers, ACRN and Nitro Enclaves both GUP pin guest
>> memory pages prior to giving them to the guest, so I don't think this
>> requirement from Gunyah is particularly unusual.
>>
>
> I read/dug into mmu notifiers more and I don't think it matches with
> Gunyah's features today. We don't allow the host to freely manage VM's
> pages because it requires the guest VM to have a level of trust on the
> host. Once a page is given to the guest, it's done for the lifetime of
> the VM. Allowing the host to replace pages in the guest memory map isn't
> part of any VM's security model that we run in Gunyah. With that
> requirement, longterm pinning looks like the correct approach to me.
Is my approach of longterm pinning correct given that Gunyah doesn't
allow host to freely swap pages?
On Thu, Jul 13, 2023 at 01:28:34PM -0700, Elliot Berman wrote:
> On 6/22/2023 4:56 PM, Elliot Berman wrote:
> > On 6/7/2023 8:54 AM, Elliot Berman wrote:
> > > On 6/5/2023 7:18 AM, Will Deacon wrote:
> > > > On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
> > > > > The user interface design for *shared* memory aligns with
> > > > > KVM_SET_USER_MEMORY_REGION.
> > > >
> > > > I don't think it does. For example, file mappings don't work (as above),
> > > > you're placing additional rlimit requirements on the caller, read-only
> > > > memslots are not functional, the memory cannot be swapped or migrated,
> > > > dirty logging doesn't work etc. pKVM is in the same boat, but that's why
> > > > we're not upstreaming this part in its current form.
> > > >
> > >
> > > I thought pKVM was only holding off on upstreaming changes related
> > > to guest-private memory?
> > >
> > > > > I understood we want to use restricted memfd for giving
> > > > > guest-private memory
> > > > > (Gunyah calls this "lending memory"). When I went through
> > > > > the changes, I
> > > > > gathered KVM is using restricted memfd only for
> > > > > guest-private memory and not
> > > > > for shared memory. Thus, I dropped support for lending
> > > > > memory to the guest
> > > > > VM and only retained the shared memory support in this
> > > > > series. I'd like to
> > > > > merge what we can today and introduce the guest-private
> > > > > memory support in
> > > > > tandem with the restricted memfd; I don't see much reason to delay the
> > > > > series.
> > > >
> > > > Right, protected guests will use the new restricted memfd ("guest mem"
> > > > now, I think?), but non-protected guests should implement the existing
> > > > interface *without* the need for the GUP pin on guest memory pages. Yes,
> > > > that means full support for MMU notifiers so that these pages can be
> > > > managed properly by the host kernel. We're working on that for pKVM, but
> > > > it requires a more flexible form of memory sharing over what we
> > > > currently
> > > > have so that e.g. the zero page can be shared between multiple entities.
> > >
> > > Gunyah doesn't support swapping pages out while the guest is running
> > > and the design of Gunyah isn't made to give host kernel full control
> > > over the S2 page table for its guests. As best I can tell from
> > > reading the respective drivers, ACRN and Nitro Enclaves both GUP pin
> > > guest memory pages prior to giving them to the guest, so I don't
> > > think this requirement from Gunyah is particularly unusual.
> > >
> >
> > I read/dug into mmu notifiers more and I don't think it matches with
> > Gunyah's features today. We don't allow the host to freely manage VM's
> > pages because it requires the guest VM to have a level of trust on the
> > host. Once a page is given to the guest, it's done for the lifetime of
> > the VM. Allowing the host to replace pages in the guest memory map isn't
> > part of any VM's security model that we run in Gunyah. With that
> > requirement, longterm pinning looks like the correct approach to me.
>
> Is my approach of longterm pinning correct given that Gunyah doesn't allow
> host to freely swap pages?
No, I really don't think a longterm GUP pin is the right approach for this.
GUP pins in general are horrible for the mm layer, but required for cases
such as DMA where I/O faults are unrecoverable. Gunyah is not a good
justification for such a hack, and I don't think you get to choose which
parts of the Linux mm you want and which bits you don't.
In other words, either carve out your memory and pin it that way, or
implement the proper hooks for the mm to do its job.
Will
Hi Will,
On 7/14/2023 5:13 AM, Will Deacon wrote:
> On Thu, Jul 13, 2023 at 01:28:34PM -0700, Elliot Berman wrote:
>> On 6/22/2023 4:56 PM, Elliot Berman wrote:
>>> On 6/7/2023 8:54 AM, Elliot Berman wrote:
>>>> On 6/5/2023 7:18 AM, Will Deacon wrote:
>>>>> On Fri, May 19, 2023 at 10:02:29AM -0700, Elliot Berman wrote:
>>>>>> The user interface design for *shared* memory aligns with
>>>>>> KVM_SET_USER_MEMORY_REGION.
>>>>>
>>>>> I don't think it does. For example, file mappings don't work (as above),
>>>>> you're placing additional rlimit requirements on the caller, read-only
>>>>> memslots are not functional, the memory cannot be swapped or migrated,
>>>>> dirty logging doesn't work etc. pKVM is in the same boat, but that's why
>>>>> we're not upstreaming this part in its current form.
>>>>>
>>>>
>>>> I thought pKVM was only holding off on upstreaming changes related
>>>> to guest-private memory?
>>>>
>>>>>> I understood we want to use restricted memfd for giving
>>>>>> guest-private memory
>>>>>> (Gunyah calls this "lending memory"). When I went through
>>>>>> the changes, I
>>>>>> gathered KVM is using restricted memfd only for
>>>>>> guest-private memory and not
>>>>>> for shared memory. Thus, I dropped support for lending
>>>>>> memory to the guest
>>>>>> VM and only retained the shared memory support in this
>>>>>> series. I'd like to
>>>>>> merge what we can today and introduce the guest-private
>>>>>> memory support in
>>>>>> tandem with the restricted memfd; I don't see much reason to delay the
>>>>>> series.
>>>>>
>>>>> Right, protected guests will use the new restricted memfd ("guest mem"
>>>>> now, I think?), but non-protected guests should implement the existing
>>>>> interface *without* the need for the GUP pin on guest memory pages. Yes,
>>>>> that means full support for MMU notifiers so that these pages can be
>>>>> managed properly by the host kernel. We're working on that for pKVM, but
>>>>> it requires a more flexible form of memory sharing over what we
>>>>> currently
>>>>> have so that e.g. the zero page can be shared between multiple entities.
>>>>
>>>> Gunyah doesn't support swapping pages out while the guest is running
>>>> and the design of Gunyah isn't made to give host kernel full control
>>>> over the S2 page table for its guests. As best I can tell from
>>>> reading the respective drivers, ACRN and Nitro Enclaves both GUP pin
>>>> guest memory pages prior to giving them to the guest, so I don't
>>>> think this requirement from Gunyah is particularly unusual.
>>>>
>>>
>>> I read/dug into mmu notifiers more and I don't think it matches with
>>> Gunyah's features today. We don't allow the host to freely manage VM's
>>> pages because it requires the guest VM to have a level of trust on the
>>> host. Once a page is given to the guest, it's done for the lifetime of
>>> the VM. Allowing the host to replace pages in the guest memory map isn't
>>> part of any VM's security model that we run in Gunyah. With that
>>> requirement, longterm pinning looks like the correct approach to me.
>>
>> Is my approach of longterm pinning correct given that Gunyah doesn't allow
>> host to freely swap pages?
>
> No, I really don't think a longterm GUP pin is the right approach for this.
> GUP pins in general are horrible for the mm layer, but required for cases
> such as DMA where I/O faults are unrecoverable. Gunyah is not a good
> justification for such a hack, and I don't think you get to choose which
> parts of the Linux mm you want and which bits you don't.
>
> In other words, either carve out your memory and pin it that way, or
> implement the proper hooks for the mm to do its job.
I talked to the team about whether we can extend the Gunyah support for
this. We have plans to support sharing/lending individual pages when the
guest faults on them. The support also allows (unprotected) pages to be
removed from the VM. We'll need to temporarily pin the pages of the VM
configuration device tree blob while the VM is being created and those
pages can be unpinned once the VM starts. I'll work on this.
Thanks for the feedback!
- Elliot
On Tue, Jul 18, 2023 at 07:28:49PM -0700, Elliot Berman wrote:
> On 7/14/2023 5:13 AM, Will Deacon wrote:
> > On Thu, Jul 13, 2023 at 01:28:34PM -0700, Elliot Berman wrote:
> > > On 6/22/2023 4:56 PM, Elliot Berman wrote:
> > > > On 6/7/2023 8:54 AM, Elliot Berman wrote:
> > > > > On 6/5/2023 7:18 AM, Will Deacon wrote:
> > > > > > Right, protected guests will use the new restricted memfd ("guest mem"
> > > > > > now, I think?), but non-protected guests should implement the existing
> > > > > > interface *without* the need for the GUP pin on guest memory pages. Yes,
> > > > > > that means full support for MMU notifiers so that these pages can be
> > > > > > managed properly by the host kernel. We're working on that for pKVM, but
> > > > > > it requires a more flexible form of memory sharing over what we
> > > > > > currently
> > > > > > have so that e.g. the zero page can be shared between multiple entities.
> > > > >
> > > > > Gunyah doesn't support swapping pages out while the guest is running
> > > > > and the design of Gunyah isn't made to give host kernel full control
> > > > > over the S2 page table for its guests. As best I can tell from
> > > > > reading the respective drivers, ACRN and Nitro Enclaves both GUP pin
> > > > > guest memory pages prior to giving them to the guest, so I don't
> > > > > think this requirement from Gunyah is particularly unusual.
> > > > >
> > > >
> > > > I read/dug into mmu notifiers more and I don't think it matches with
> > > > Gunyah's features today. We don't allow the host to freely manage VM's
> > > > pages because it requires the guest VM to have a level of trust on the
> > > > host. Once a page is given to the guest, it's done for the lifetime of
> > > > the VM. Allowing the host to replace pages in the guest memory map isn't
> > > > part of any VM's security model that we run in Gunyah. With that
> > > > requirement, longterm pinning looks like the correct approach to me.
> > >
> > > Is my approach of longterm pinning correct given that Gunyah doesn't allow
> > > host to freely swap pages?
> >
> > No, I really don't think a longterm GUP pin is the right approach for this.
> > GUP pins in general are horrible for the mm layer, but required for cases
> > such as DMA where I/O faults are unrecoverable. Gunyah is not a good
> > justification for such a hack, and I don't think you get to choose which
> > parts of the Linux mm you want and which bits you don't.
> >
> > In other words, either carve out your memory and pin it that way, or
> > implement the proper hooks for the mm to do its job.
>
> I talked to the team about whether we can extend the Gunyah support for
> this. We have plans to support sharing/lending individual pages when the
> guest faults on them. The support also allows (unprotected) pages to be
> removed from the VM. We'll need to temporarily pin the pages of the VM
> configuration device tree blob while the VM is being created and those pages
> can be unpinned once the VM starts. I'll work on this.
That's pleasantly unexpected, thanks for pursuing this!
Will
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-gunyah-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
+gunyah-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
obj-$(CONFIG_GUNYAH) += gunyah.o
@@ -15,6 +15,8 @@
#include "vm_mgr.h"
+static void gh_vm_free(struct work_struct *work);
+
static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
{
struct gh_vm *ghvm;
@@ -26,20 +28,72 @@ static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
ghvm->parent = gh_rm_get(rm);
ghvm->rm = rm;
+ mmgrab(current->mm);
+ ghvm->mm = current->mm;
+ mutex_init(&ghvm->mm_lock);
+ INIT_LIST_HEAD(&ghvm->memory_mappings);
+ INIT_WORK(&ghvm->free_work, gh_vm_free);
+
return ghvm;
}
-static int gh_vm_release(struct inode *inode, struct file *filp)
+static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct gh_vm *ghvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ switch (cmd) {
+ case GH_VM_SET_USER_MEM_REGION: {
+ struct gh_userspace_memory_region region;
+
+ /* only allow owner task to add memory */
+ if (ghvm->mm != current->mm)
+ return -EPERM;
+
+ if (copy_from_user(®ion, argp, sizeof(region)))
+ return -EFAULT;
+
+ /* All other flag bits are reserved for future use */
+ if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC))
+ return -EINVAL;
+
> + r = gh_vm_mem_alloc(ghvm, &region);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+ return r;
+}
+
+static void gh_vm_free(struct work_struct *work)
+{
+ struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
+
+ gh_vm_mem_reclaim(ghvm);
gh_rm_put(ghvm->rm);
+ mmdrop(ghvm->mm);
kfree(ghvm);
+}
+
+static int gh_vm_release(struct inode *inode, struct file *filp)
+{
+ struct gh_vm *ghvm = filp->private_data;
+
+ /* The VM will be reset and make RM calls, which can sleep interruptibly.
+ * Defer this to a work item so the current thread can receive signals.
+ */
+ schedule_work(&ghvm->free_work);
return 0;
}
static const struct file_operations gh_vm_fops = {
.owner = THIS_MODULE,
+ .unlocked_ioctl = gh_vm_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.release = gh_vm_release,
.llseek = noop_llseek,
};
@@ -77,8 +131,7 @@ static long gh_dev_ioctl_create_vm(struct gh_rm *rm, unsigned long arg)
err_put_fd:
put_unused_fd(fd);
err_destroy_vm:
- gh_rm_put(ghvm->rm);
- kfree(ghvm);
+ gh_vm_free(&ghvm->free_work);
return err;
}
@@ -7,14 +7,40 @@
#define _GH_VM_MGR_H
#include <linux/gunyah_rsc_mgr.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
#include <uapi/linux/gunyah.h>
long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, unsigned long arg);
+enum gh_vm_mem_share_type {
+ VM_MEM_SHARE,
+ VM_MEM_LEND,
+};
+
+struct gh_vm_mem {
+ struct list_head list;
+ enum gh_vm_mem_share_type share_type;
+ struct gh_rm_mem_parcel parcel;
+
+ __u64 guest_phys_addr;
+ struct page **pages;
+ unsigned long npages;
+};
+
struct gh_vm {
struct gh_rm *rm;
struct device *parent;
+
+ struct work_struct free_work;
+ struct mm_struct *mm; /* userspace tied to this vm */
+ struct mutex mm_lock;
+ struct list_head memory_mappings;
};
+int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region);
+void gh_vm_mem_reclaim(struct gh_vm *ghvm);
+
#endif
new file mode 100644
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#define pr_fmt(fmt) "gh_vm_mgr: " fmt
+
+#include <linux/gunyah_rsc_mgr.h>
+#include <linux/mm.h>
+
+#include <uapi/linux/gunyah.h>
+
+#include "vm_mgr.h"
+
+static bool pages_are_mergeable(struct page *a, struct page *b)
+{
+ if (page_to_pfn(a) + 1 != page_to_pfn(b))
+ return false;
+ if (!zone_device_pages_have_same_pgmap(a, b))
+ return false;
+ return true;
+}
+
+static bool gh_vm_mem_overlap(struct gh_vm_mem *a, u64 addr, u64 size)
+{
+ u64 a_end = a->guest_phys_addr + (a->npages << PAGE_SHIFT);
+ u64 end = addr + size;
+
+ return a->guest_phys_addr < end && addr < a_end;
+}
+
+static struct gh_vm_mem *__gh_vm_mem_find_by_label(struct gh_vm *ghvm, u32 label)
+ __must_hold(&ghvm->mm_lock)
+{
+ struct gh_vm_mem *mapping;
+
+ list_for_each_entry(mapping, &ghvm->memory_mappings, list)
+ if (mapping->parcel.label == label)
+ return mapping;
+
+ return NULL;
+}
+
+static void gh_vm_mem_reclaim_mapping(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
+ __must_hold(&ghvm->mm_lock)
+{
+ int ret = 0;
+
+ if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
+ ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
+ if (ret)
+ pr_warn("Failed to reclaim memory parcel for label %d: %d\n",
+ mapping->parcel.label, ret);
+ }
+
+ if (!ret) {
+ unpin_user_pages(mapping->pages, mapping->npages);
+ account_locked_vm(ghvm->mm, mapping->npages, false);
+ }
+
+ kfree(mapping->pages);
+ kfree(mapping->parcel.acl_entries);
+ kfree(mapping->parcel.mem_entries);
+
+ list_del(&mapping->list);
+}
+
+void gh_vm_mem_reclaim(struct gh_vm *ghvm)
+{
+ struct gh_vm_mem *mapping, *tmp;
+
+ mutex_lock(&ghvm->mm_lock);
+
+ list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
+ gh_vm_mem_reclaim_mapping(ghvm, mapping);
+ kfree(mapping);
+ }
+
+ mutex_unlock(&ghvm->mm_lock);
+}
+
+int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
+{
+ struct gh_vm_mem *mapping, *tmp_mapping;
+ struct page *curr_page, *prev_page;
+ struct gh_rm_mem_parcel *parcel;
+ int i, j, pinned, ret = 0;
+ unsigned int gup_flags;
+ size_t entry_size;
+ u16 vmid;
+
+ if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
+ !PAGE_ALIGNED(region->userspace_addr) ||
+ !PAGE_ALIGNED(region->guest_phys_addr))
+ return -EINVAL;
+
+ if (overflows_type(region->guest_phys_addr + region->memory_size, u64))
+ return -EOVERFLOW;
+
+ ret = mutex_lock_interruptible(&ghvm->mm_lock);
+ if (ret)
+ return ret;
+
+ mapping = __gh_vm_mem_find_by_label(ghvm, region->label);
+ if (mapping) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+
+ list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
+ if (gh_vm_mem_overlap(tmp_mapping, region->guest_phys_addr,
+ region->memory_size)) {
+ ret = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ mapping = kzalloc(sizeof(*mapping), GFP_KERNEL_ACCOUNT);
+ if (!mapping) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ mapping->guest_phys_addr = region->guest_phys_addr;
+ mapping->npages = region->memory_size >> PAGE_SHIFT;
+ parcel = &mapping->parcel;
+ parcel->label = region->label;
+ parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
+ parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
+
+ ret = account_locked_vm(ghvm->mm, mapping->npages, true);
+ if (ret)
+ goto free_mapping;
+
+ mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL_ACCOUNT);
+ if (!mapping->pages) {
+ ret = -ENOMEM;
+ mapping->npages = 0; /* update npages for reclaim */
+ goto unlock_pages;
+ }
+
+ gup_flags = FOLL_LONGTERM;
+ if (region->flags & GH_MEM_ALLOW_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
+ gup_flags, mapping->pages);
+ if (pinned < 0) {
+ ret = pinned;
+ goto free_pages;
+ } else if (pinned != mapping->npages) {
+ ret = -EFAULT;
+ mapping->npages = pinned; /* update npages for reclaim */
+ goto unpin_pages;
+ }
+
+ parcel->n_acl_entries = 2;
+ mapping->share_type = VM_MEM_SHARE;
+ parcel->acl_entries = kcalloc(parcel->n_acl_entries, sizeof(*parcel->acl_entries),
+ GFP_KERNEL);
+ if (!parcel->acl_entries) {
+ ret = -ENOMEM;
+ goto unpin_pages;
+ }
+
+ /* acl_entries[0].vmid will be this VM's vmid. We'll fill it in when the
+ * VM is starting and its vmid is known.
+ */
+ if (region->flags & GH_MEM_ALLOW_READ)
+ parcel->acl_entries[0].perms |= GH_RM_ACL_R;
+ if (region->flags & GH_MEM_ALLOW_WRITE)
+ parcel->acl_entries[0].perms |= GH_RM_ACL_W;
+ if (region->flags & GH_MEM_ALLOW_EXEC)
+ parcel->acl_entries[0].perms |= GH_RM_ACL_X;
+
+ ret = gh_rm_get_vmid(ghvm->rm, &vmid);
+ if (ret)
+ goto free_acl;
+
+ parcel->acl_entries[1].vmid = cpu_to_le16(vmid);
+ /* The host is assumed to have all these permissions; Gunyah will not
+ * grant new permissions if the host actually had less than RWX.
+ */
+ parcel->acl_entries[1].perms = GH_RM_ACL_R | GH_RM_ACL_W | GH_RM_ACL_X;
+
+ parcel->n_mem_entries = 1;
+ for (i = 1; i < mapping->npages; i++) {
+ if (!pages_are_mergeable(mapping->pages[i - 1], mapping->pages[i]))
+ parcel->n_mem_entries++;
+ }
+
+ parcel->mem_entries = kcalloc(parcel->n_mem_entries,
+ sizeof(parcel->mem_entries[0]),
+ GFP_KERNEL_ACCOUNT);
+ if (!parcel->mem_entries) {
+ ret = -ENOMEM;
+ goto free_acl;
+ }
+
+ /* reduce number of entries by combining contiguous pages into single memory entry */
+ prev_page = mapping->pages[0];
+ parcel->mem_entries[0].phys_addr = cpu_to_le64(page_to_phys(prev_page));
+ entry_size = PAGE_SIZE;
+ for (i = 1, j = 0; i < mapping->npages; i++) {
+ curr_page = mapping->pages[i];
+ if (pages_are_mergeable(prev_page, curr_page)) {
+ entry_size += PAGE_SIZE;
+ } else {
+ parcel->mem_entries[j].size = cpu_to_le64(entry_size);
+ j++;
+ parcel->mem_entries[j].phys_addr =
+ cpu_to_le64(page_to_phys(curr_page));
+ entry_size = PAGE_SIZE;
+ }
+
+ prev_page = curr_page;
+ }
+ parcel->mem_entries[j].size = cpu_to_le64(entry_size);
+
+ list_add(&mapping->list, &ghvm->memory_mappings);
+ mutex_unlock(&ghvm->mm_lock);
+ return 0;
+free_acl:
+ kfree(parcel->acl_entries);
+unpin_pages:
+ unpin_user_pages(mapping->pages, pinned);
+free_pages:
+ kfree(mapping->pages);
+unlock_pages:
+ account_locked_vm(ghvm->mm, mapping->npages, false);
+free_mapping:
+ kfree(mapping);
+unlock:
+ mutex_unlock(&ghvm->mm_lock);
+ return ret;
+}
@@ -20,4 +20,41 @@
*/
#define GH_CREATE_VM _IO(GH_IOCTL_TYPE, 0x0) /* Returns a Gunyah VM fd */
+/*
+ * ioctls for VM fds
+ */
+
+/**
+ * enum gh_mem_flags - Possible flags on &struct gh_userspace_memory_region
+ * @GH_MEM_ALLOW_READ: Allow guest to read the memory
+ * @GH_MEM_ALLOW_WRITE: Allow guest to write to the memory
+ * @GH_MEM_ALLOW_EXEC: Allow guest to execute instructions in the memory
+ */
+enum gh_mem_flags {
+ GH_MEM_ALLOW_READ = 1UL << 0,
+ GH_MEM_ALLOW_WRITE = 1UL << 1,
+ GH_MEM_ALLOW_EXEC = 1UL << 2,
+};
+
+/**
+ * struct gh_userspace_memory_region - Userspace memory description for GH_VM_SET_USER_MEM_REGION
+ * @label: Identifier for the region, unique to the VM.
+ * @flags: Flags for memory parcel behavior. See &enum gh_mem_flags.
+ * @guest_phys_addr: Location of the memory region in the guest's memory space (page-aligned)
+ * @memory_size: Size of the region (page-aligned)
+ * @userspace_addr: Location of the memory region in the caller's (userspace) address space
+ *
+ * See Documentation/virt/gunyah/vm-manager.rst for further details.
+ */
+struct gh_userspace_memory_region {
+ __u32 label;
+ __u32 flags;
+ __u64 guest_phys_addr;
+ __u64 memory_size;
+ __u64 userspace_addr;
+};
+
+#define GH_VM_SET_USER_MEM_REGION _IOW(GH_IOCTL_TYPE, 0x1, \
+ struct gh_userspace_memory_region)
+
#endif
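
For illustration, a minimal userspace sequence for exercising these ioctls
might look like the following. The /dev/gunyah node name and the guest
physical address are assumptions made for the example, not taken from this
patch.

/* Create a VM and register one anonymous, page-aligned buffer as guest memory. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/gunyah.h>

int main(void)
{
	size_t size = 16 * 1024 * 1024;
	int gunyah_fd, vm_fd;
	void *mem;

	gunyah_fd = open("/dev/gunyah", O_RDWR);
	if (gunyah_fd < 0)
		return 1;

	vm_fd = ioctl(gunyah_fd, GH_CREATE_VM, 0);	/* returns a Gunyah VM fd */
	if (vm_fd < 0)
		return 1;

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED)
		return 1;

	struct gh_userspace_memory_region region = {
		.label = 0,
		.flags = GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE,
		.guest_phys_addr = 0x80000000,	/* example guest address */
		.memory_size = size,
		.userspace_addr = (__u64)(unsigned long)mem,
	};

	if (ioctl(vm_fd, GH_VM_SET_USER_MEM_REGION, &region) < 0) {
		perror("GH_VM_SET_USER_MEM_REGION");
		return 1;
	}

	return 0;
}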