[v10,12/26] gunyah: vm_mgr: Add/remove user memory regions

Message ID 20230214212417.3315422-1-quic_eberman@quicinc.com
State New
Series Drivers for Gunyah hypervisor

Commit Message

Elliot Berman Feb. 14, 2023, 9:24 p.m. UTC
When launching a virtual machine, Gunyah userspace allocates memory for
the guest and informs Gunyah about these memory regions through
SET_USER_MEMORY_REGION ioctl.

Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 drivers/virt/gunyah/Makefile    |   2 +-
 drivers/virt/gunyah/vm_mgr.c    |  44 ++++++
 drivers/virt/gunyah/vm_mgr.h    |  25 ++++
 drivers/virt/gunyah/vm_mgr_mm.c | 235 ++++++++++++++++++++++++++++++++
 include/uapi/linux/gunyah.h     |  33 +++++
 5 files changed, 338 insertions(+), 1 deletion(-)
 create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
  

Comments

Greg KH Feb. 16, 2023, 6:38 a.m. UTC | #1
On Tue, Feb 14, 2023 at 01:24:16PM -0800, Elliot Berman wrote:
> 
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> SET_USER_MEMORY_REGION ioctl.

It also frees memory, see below.  Why not document that?

> +	case GH_VM_SET_USER_MEM_REGION: {
> +		struct gh_userspace_memory_region region;
> +
> +		if (copy_from_user(&region, argp, sizeof(region)))
> +			return -EFAULT;
> +
> +		/* All other flag bits are reserved for future use */
> +		if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC |
> +			GH_MEM_LENT))
> +			return -EINVAL;

Nice, thanks for validating that.


> +
> +

Nit, 2 blank lines are not needed :(


> +		if (region.memory_size)
> +			r = gh_vm_mem_alloc(ghvm, &region);
> +		else
> +			r = gh_vm_mem_free(ghvm, region.label);

So if you set the size to 0 it is freed?  Wouldn't a separate ioctl make
more sense?  Where is this logic documented to userspace?

thanks,

greg k-h
  
Elliot Berman Feb. 16, 2023, 5:24 p.m. UTC | #2
On 2/15/2023 10:38 PM, Greg Kroah-Hartman wrote:
> On Tue, Feb 14, 2023 at 01:24:16PM -0800, Elliot Berman wrote:
>>
>> When launching a virtual machine, Gunyah userspace allocates memory for
>> the guest and informs Gunyah about these memory regions through
>> SET_USER_MEMORY_REGION ioctl.
> 
> It also frees memory, see below.  Why not document that?
> 

I can mention it in the commit text, too.

>> +	case GH_VM_SET_USER_MEM_REGION: {
>> +		struct gh_userspace_memory_region region;
>> +
>> +		if (copy_from_user(&region, argp, sizeof(region)))
>> +			return -EFAULT;
>> +
>> +		/* All other flag bits are reserved for future use */
>> +		if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC |
>> +			GH_MEM_LENT))
>> +			return -EINVAL;
> 
> Nice, thanks for validating that.
> 
> 
>> +
>> +
> 
> Nit, 2 blank lines are not needed :(
> 
> 
>> +		if (region.memory_size)
>> +			r = gh_vm_mem_alloc(ghvm, &region);
>> +		else
>> +			r = gh_vm_mem_free(ghvm, region.label);
> 
> So if you set the size to 0 it is freed?  Wouldn't a separate ioctl make
> more sense? Where is this logic documented to userspace?

We're following the KVM convention here: KVM's KVM_SET_USER_MEMORY_REGION 
likewise deletes a memslot when memory_size is 0. The logic is documented 
to userspace in patch 17/26.
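
For illustration, the flow in userspace is roughly this (sketch only; 
vm_fd, mem, and the addresses/sizes here are placeholders):

	struct gh_userspace_memory_region region = {
		.label = 0,
		.flags = GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE,
		.guest_phys_addr = 0x80000000,		/* page-aligned */
		.memory_size = mem_size,		/* page-aligned, nonzero */
		.userspace_addr = (__u64)(uintptr_t)mem,/* page-aligned */
	};

	ret = ioctl(vm_fd, GH_VM_SET_USER_MEM_REGION, &region);

	/* setting memory_size to 0 removes the region with that label */
	region.memory_size = 0;
	ret = ioctl(vm_fd, GH_VM_SET_USER_MEM_REGION, &region);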

Thanks,
Elliot
  
Srinivas Kandagatla Feb. 21, 2023, 12:28 p.m. UTC | #3
On 14/02/2023 21:24, Elliot Berman wrote:
> 
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> SET_USER_MEMORY_REGION ioctl.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>   drivers/virt/gunyah/Makefile    |   2 +-
>   drivers/virt/gunyah/vm_mgr.c    |  44 ++++++
>   drivers/virt/gunyah/vm_mgr.h    |  25 ++++
>   drivers/virt/gunyah/vm_mgr_mm.c | 235 ++++++++++++++++++++++++++++++++
>   include/uapi/linux/gunyah.h     |  33 +++++
>   5 files changed, 338 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
> 
> diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
> index 03951cf82023..ff8bc4925392 100644
> --- a/drivers/virt/gunyah/Makefile
> +++ b/drivers/virt/gunyah/Makefile
> @@ -2,5 +2,5 @@
>   
>   obj-$(CONFIG_GUNYAH) += gunyah.o
>   
> -gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
> +gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
>   obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
> index fd890a57172e..84102bac03cc 100644
> --- a/drivers/virt/gunyah/vm_mgr.c
> +++ b/drivers/virt/gunyah/vm_mgr.c
> @@ -18,8 +18,16 @@
>   static void gh_vm_free(struct work_struct *work)
>   {
>   	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
> +	struct gh_vm_mem *mapping, *tmp;
>   	int ret;
>   
> +	mutex_lock(&ghvm->mm_lock);
> +	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> +		gh_vm_mem_reclaim(ghvm, mapping);
> +		kfree(mapping);
> +	}
> +	mutex_unlock(&ghvm->mm_lock);
> +
>   	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>   	if (ret)
>   		pr_warn("Failed to deallocate vmid: %d\n", ret);
> @@ -48,11 +56,46 @@ static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   	ghvm->vmid = vmid;
>   	ghvm->rm = rm;
>   
> +	mutex_init(&ghvm->mm_lock);
> +	INIT_LIST_HEAD(&ghvm->memory_mappings);
>   	INIT_WORK(&ghvm->free_work, gh_vm_free);
>   
>   	return ghvm;
>   }
>   
> +static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> +{
> +	struct gh_vm *ghvm = filp->private_data;
> +	void __user *argp = (void __user *)arg;
> +	long r;
> +
> +	switch (cmd) {
> +	case GH_VM_SET_USER_MEM_REGION: {
> +		struct gh_userspace_memory_region region;
> +
> +		if (copy_from_user(&region, argp, sizeof(region)))
> +			return -EFAULT;
> +
> +		/* All other flag bits are reserved for future use */
> +		if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC |
> +			GH_MEM_LENT))
> +			return -EINVAL;
> +
> +
> +		if (region.memory_size)
> +			r = gh_vm_mem_alloc(ghvm, &region);
> +		else
> +			r = gh_vm_mem_free(ghvm, region.label);

Looks like we are repurposing GH_VM_SET_USER_MEM_REGION for allocation 
and freeing.

Should we have a corresponding GH_VM_UN_SET_USER_MEM_REGION instead for 
freeing, given that label is the only relevant member of struct 
gh_userspace_memory_region in the free case?
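
Something like this, for example (hypothetical; the ioctl number is for 
illustration only):

/* removes the region identified by label */
#define GH_VM_UN_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x2, __u32)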


> +		break;
> +	}
> +	default:
> +		r = -ENOTTY;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
>   static int gh_vm_release(struct inode *inode, struct file *filp)
>   {
>   	struct gh_vm *ghvm = filp->private_data;
> @@ -65,6 +108,7 @@ static int gh_vm_release(struct inode *inode, struct file *filp)
>   }
>   
>   static const struct file_operations gh_vm_fops = {
> +	.unlocked_ioctl = gh_vm_ioctl,
>   	.release = gh_vm_release,
>   	.compat_ioctl	= compat_ptr_ioctl,
>   	.llseek = noop_llseek,
> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
> index 76954da706e9..97bc00c34878 100644
> --- a/drivers/virt/gunyah/vm_mgr.h
> +++ b/drivers/virt/gunyah/vm_mgr.h
> @@ -7,16 +7,41 @@
>   #define _GH_PRIV_VM_MGR_H
>   
>   #include <linux/gunyah_rsc_mgr.h>
> +#include <linux/list.h>
> +#include <linux/miscdevice.h>
> +#include <linux/mutex.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
>   long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, unsigned long arg);
>   
> +enum gh_vm_mem_share_type {
> +	VM_MEM_SHARE,
> +	VM_MEM_LEND,
> +};
> +
> +struct gh_vm_mem {
> +	struct list_head list;
> +	enum gh_vm_mem_share_type share_type;
> +	struct gh_rm_mem_parcel parcel;
> +
> +	__u64 guest_phys_addr;
> +	struct page **pages;
> +	unsigned long npages;
> +};
> +
>   struct gh_vm {
>   	u16 vmid;
>   	struct gh_rm *rm;
>   
>   	struct work_struct free_work;
> +	struct mutex mm_lock;
> +	struct list_head memory_mappings;
>   };
>   
> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region);
> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
> +
>   #endif
> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
> new file mode 100644
> index 000000000000..03e71a36ea3b
> --- /dev/null
> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
> @@ -0,0 +1,235 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
> + */
> +
> +#define pr_fmt(fmt) "gh_vm_mgr: " fmt
> +
> +#include <linux/gunyah_rsc_mgr.h>
> +#include <linux/mm.h>
> +
> +#include <uapi/linux/gunyah.h>
> +
> +#include "vm_mgr.h"
> +
> +static inline bool page_contiguous(phys_addr_t p, phys_addr_t t)
> +{
> +	return t - p == PAGE_SIZE;
> +}
> +
> +static struct gh_vm_mem *__gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
> +	__must_hold(&ghvm->mm_lock)
> +{
> +	struct gh_vm_mem *mapping;
> +
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list)
> +		if (mapping->parcel.label == label)
> +			return mapping;
> +
> +	return NULL;
> +}
> +
> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
> +	__must_hold(&ghvm->mm_lock)
> +{
> +	int i, ret = 0;
> +
> +	if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
> +		ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
> +		if (ret)
> +			pr_warn("Failed to reclaim memory parcel for label %d: %d\n",
> +				mapping->parcel.label, ret);

What is the behavior of the hypervisor if we fail to reclaim the pages?

> +	}
> +
> +	if (!ret)
So we will leave the user pages pinned if the hypervisor call fails, but 
further down we free the mapping altogether.

I'm not 100% sure if this will have any side effects, but is it okay to 
leave user pages pinned with no possibility of unpinning them in such cases?


> +		for (i = 0; i < mapping->npages; i++)
> +			unpin_user_page(mapping->pages[i]);
> +
> +	kfree(mapping->pages);
> +	kfree(mapping->parcel.acl_entries);
> +	kfree(mapping->parcel.mem_entries);
> +
> +	list_del(&mapping->list);
> +}
> +
> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
> +{
> +	struct gh_vm_mem *mapping;
> +	int ret;
> +
> +	ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +	if (ret)
> +		return ERR_PTR(ret);
new line would be nice here.

> +	mapping = __gh_vm_mem_find(ghvm, label);
> +	mutex_unlock(&ghvm->mm_lock);
new line would be nice here.

> +	return mapping ? : ERR_PTR(-ENODEV);
> +}
> +
> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
> +{
> +	struct gh_vm_mem *mapping, *tmp_mapping;
> +	struct gh_rm_mem_entry *mem_entries;
> +	phys_addr_t curr_page, prev_page;
> +	struct gh_rm_mem_parcel *parcel;
> +	int i, j, pinned, ret = 0;
> +	size_t entry_size;
> +	u16 vmid;
> +
> +	if (!gh_api_has_feature(GH_API_FEATURE_MEMEXTENT))
> +		return -EOPNOTSUPP;

Should this not be the first thing to do in the ioctl, before even entering 
this function?

> +
> +	if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
> +		!PAGE_ALIGNED(region->userspace_addr) || !PAGE_ALIGNED(region->guest_phys_addr))
> +		return -EINVAL;
> +
> +	ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +	if (ret)
> +		return ret;
new line.

> +	mapping = __gh_vm_mem_find(ghvm, region->label);
> +	if (mapping) {
> +		mutex_unlock(&ghvm->mm_lock);
> +		return -EEXIST;
> +	}
> +
> +	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
> +	if (!mapping) {
> +		ret = -ENOMEM;
> +		goto free_mapping;

how about,

mutex_unlock(&ghvm->mm_lock);
return -ENOMEM;

> +	}
> +
> +	mapping->parcel.label = region->label;
> +	mapping->guest_phys_addr = region->guest_phys_addr;
> +	mapping->npages = region->memory_size >> PAGE_SHIFT;
> +	parcel = &mapping->parcel;
> +	parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
> +	parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
> +
> +	/* Check for overlap */
> +	list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
> +		if (!((mapping->guest_phys_addr + (mapping->npages << PAGE_SHIFT) <=
> +			tmp_mapping->guest_phys_addr) ||
> +			(mapping->guest_phys_addr >=
> +			tmp_mapping->guest_phys_addr + (tmp_mapping->npages << PAGE_SHIFT)))) {
> +			ret = -EEXIST;
> +			goto free_mapping;
> +		}
> +	}
> +
> +	list_add(&mapping->list, &ghvm->memory_mappings);
> +
> +	mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL);
> +	if (!mapping->pages) {
> +		ret = -ENOMEM;
> +		mapping->npages = 0; /* update npages for reclaim */
> +		goto reclaim;
> +	}
> +
> +	pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
> +					FOLL_WRITE | FOLL_LONGTERM, mapping->pages);
> +	if (pinned < 0) {
> +		ret = pinned;
> +		mapping->npages = 0; /* update npages for reclaim */
> +		goto reclaim;
> +	} else if (pinned != mapping->npages) {
> +		ret = -EFAULT;
> +		mapping->npages = pinned; /* update npages for reclaim */
> +		goto reclaim;
> +	}
> +
> +	if (region->flags & GH_MEM_LENT) {
> +		parcel->n_acl_entries = 1;
> +		mapping->share_type = VM_MEM_LEND;
> +	} else {
> +		parcel->n_acl_entries = 2;
> +		mapping->share_type = VM_MEM_SHARE;
> +	}
> +	parcel->acl_entries = kcalloc(parcel->n_acl_entries, sizeof(*parcel->acl_entries),
> +					GFP_KERNEL);
> +	if (!parcel->acl_entries) {
> +		ret = -ENOMEM;
> +		goto reclaim;
> +	}
> +
> +	parcel->acl_entries[0].vmid = cpu_to_le16(ghvm->vmid);
new line
> +	if (region->flags & GH_MEM_ALLOW_READ)
> +		parcel->acl_entries[0].perms |= GH_RM_ACL_R;
> +	if (region->flags & GH_MEM_ALLOW_WRITE)
> +		parcel->acl_entries[0].perms |= GH_RM_ACL_W;
> +	if (region->flags & GH_MEM_ALLOW_EXEC)
> +		parcel->acl_entries[0].perms |= GH_RM_ACL_X;
> +
> +	if (mapping->share_type == VM_MEM_SHARE) {
> +		ret = gh_rm_get_vmid(ghvm->rm, &vmid);
> +		if (ret)
> +			goto reclaim;
> +
> +		parcel->acl_entries[1].vmid = cpu_to_le16(vmid);
> +		/* Host assumed to have all these permissions. Gunyah will not
> +		 * grant new permissions if host actually had less than RWX
> +		 */
> +		parcel->acl_entries[1].perms |= GH_RM_ACL_R | GH_RM_ACL_W | GH_RM_ACL_X;
> +	}
> +
> +	mem_entries = kcalloc(mapping->npages, sizeof(*mem_entries), GFP_KERNEL);
> +	if (!mem_entries) {
> +		ret = -ENOMEM;
> +		goto reclaim;
> +	}
> +
> +	/* reduce number of entries by combining contiguous pages into single memory entry */
> +	prev_page = page_to_phys(mapping->pages[0]);
> +	mem_entries[0].ipa_base = cpu_to_le64(prev_page);
> +	entry_size = PAGE_SIZE;
new line
> +	for (i = 1, j = 0; i < mapping->npages; i++) {
> +		curr_page = page_to_phys(mapping->pages[i]);
> +		if (page_contiguous(prev_page, curr_page)) {
> +			entry_size += PAGE_SIZE;
> +		} else {
> +			mem_entries[j].size = cpu_to_le64(entry_size);
> +			j++;
> +			mem_entries[j].ipa_base = cpu_to_le64(curr_page);
> +			entry_size = PAGE_SIZE;
> +		}
> +
> +		prev_page = curr_page;
> +	}
> +	mem_entries[j].size = cpu_to_le64(entry_size);
> +
> +	parcel->n_mem_entries = j + 1;
> +	parcel->mem_entries = kmemdup(mem_entries, sizeof(*mem_entries) * parcel->n_mem_entries,
> +					GFP_KERNEL);
> +	kfree(mem_entries);
> +	if (!parcel->mem_entries) {
> +		ret = -ENOMEM;
> +		goto reclaim;
> +	}
> +
> +	mutex_unlock(&ghvm->mm_lock);
> +	return 0;
> +reclaim:
> +	gh_vm_mem_reclaim(ghvm, mapping);
> +free_mapping:
> +	kfree(mapping);
> +	mutex_unlock(&ghvm->mm_lock);
> +	return ret;
> +}
> +
> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label)
> +{
> +	struct gh_vm_mem *mapping;
> +	int ret;
> +
> +	ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +	if (ret)
> +		return ret;
> +
> +	mapping = __gh_vm_mem_find(ghvm, label);
> +	if (!mapping)
> +		goto out;
> +
> +	gh_vm_mem_reclaim(ghvm, mapping);
> +	kfree(mapping);
> +out:
> +	mutex_unlock(&ghvm->mm_lock);
> +	return ret;
> +}
> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index 10ba32d2b0a6..d85d12119a48 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -20,4 +20,37 @@
>    */
>   #define GH_CREATE_VM			_IO(GH_IOCTL_TYPE, 0x0) /* Returns a Gunyah VM fd */
>   
> +/*
> + * ioctls for VM fds
> + */
> +
> +/**
> + * struct gh_userspace_memory_region - Userspace memory descripion for GH_VM_SET_USER_MEM_REGION
> + * @label: Unique identifer to the region.
> + * @flags: Flags for memory parcel behavior
> + * @guest_phys_addr: Location of the memory region in guest's memory space (page-aligned)

Note about overlapping here would be useful.

> + * @memory_size: Size of the region (page-aligned)
> + * @userspace_addr: Location of the memory region in caller (userspace)'s memory
> + *
> + * See Documentation/virt/gunyah/vm-manager.rst for further details.
> + */
> +struct gh_userspace_memory_region {
> +	__u32 label;
> +#define GH_MEM_ALLOW_READ	(1UL << 0)
> +#define GH_MEM_ALLOW_WRITE	(1UL << 1)
> +#define GH_MEM_ALLOW_EXEC	(1UL << 2)
> +/*
> + * The guest will be lent the memory instead of shared.
> + * In other words, the guest has exclusive access to the memory region and the host loses access.
> + */
> +#define GH_MEM_LENT		(1UL << 3)
> +	__u32 flags;
> +	__u64 guest_phys_addr;
> +	__u64 memory_size;
> +	__u64 userspace_addr;
> +};
> +
> +#define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
> +						struct gh_userspace_memory_region)
> +
>   #endif
  
Srivatsa Vaddagiri Feb. 21, 2023, 12:43 p.m. UTC | #4
* Srinivas Kandagatla <srinivas.kandagatla@linaro.org> [2023-02-21 12:28:53]:

> > +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
> > +	__must_hold(&ghvm->mm_lock)
> > +{
> > +	int i, ret = 0;
> > +
> > +	if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
> > +		ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
> > +		if (ret)
> > +			pr_warn("Failed to reclaim memory parcel for label %d: %d\n",
> > +				mapping->parcel.label, ret);
> 
> What is the behavior of the hypervisor if we fail to reclaim the pages?
> 
> > +	}
> > +
> > +	if (!ret)
> So we will leave the user pages pinned if the hypervisor call fails, but further
> down we free the mapping altogether.

I think we should clean up and bail out here, rather than try to continue past the
error. For example, imagine userspace were to reclaim with the VM still running. We would
leave the pages pinned AFAICS (even after the VM terminates later) and also not
return any error to userspace indicating the failure to reclaim.
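
Something along these lines, perhaps (untested sketch; gh_vm_mem_reclaim()
would need to return an error code for its callers to propagate):

	if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
		ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
		if (ret) {
			pr_warn("Failed to reclaim memory parcel for label %d: %d\n",
				mapping->parcel.label, ret);
			/* bail: keep the mapping and pinned pages intact */
			return ret;
		}
	}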
  
Srivatsa Vaddagiri Feb. 21, 2023, 12:45 p.m. UTC | #5
* Elliot Berman <quic_eberman@quicinc.com> [2023-02-14 13:24:16]:

> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
> +{
> +	struct gh_vm_mem *mapping, *tmp_mapping;
> +	struct gh_rm_mem_entry *mem_entries;
> +	phys_addr_t curr_page, prev_page;
> +	struct gh_rm_mem_parcel *parcel;
> +	int i, j, pinned, ret = 0;
> +	size_t entry_size;
> +	u16 vmid;
> +
> +	if (!gh_api_has_feature(GH_API_FEATURE_MEMEXTENT))
> +		return -EOPNOTSUPP;
> +
> +	if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
> +		!PAGE_ALIGNED(region->userspace_addr) || !PAGE_ALIGNED(region->guest_phys_addr))
> +		return -EINVAL;

Also check for address wrap-around, i.e. ensure that:

        region->guest_phys_addr + region->memory_size > region->guest_phys_addr
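
i.e. something like (untested sketch; memory_size is already known to be
nonzero at this point, so equality also indicates a wrap):

	if (region->guest_phys_addr + region->memory_size <= region->guest_phys_addr)
		return -EINVAL;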
  
Alex Elder Feb. 24, 2023, 12:34 a.m. UTC | #6
On 2/14/23 3:24 PM, Elliot Berman wrote:
> 
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> SET_USER_MEMORY_REGION ioctl.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>   drivers/virt/gunyah/Makefile    |   2 +-
>   drivers/virt/gunyah/vm_mgr.c    |  44 ++++++
>   drivers/virt/gunyah/vm_mgr.h    |  25 ++++
>   drivers/virt/gunyah/vm_mgr_mm.c | 235 ++++++++++++++++++++++++++++++++
>   include/uapi/linux/gunyah.h     |  33 +++++
>   5 files changed, 338 insertions(+), 1 deletion(-)
>   create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
> 
> diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
> index 03951cf82023..ff8bc4925392 100644
> --- a/drivers/virt/gunyah/Makefile
> +++ b/drivers/virt/gunyah/Makefile
> @@ -2,5 +2,5 @@
>   
>   obj-$(CONFIG_GUNYAH) += gunyah.o
>   
> -gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
> +gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
>   obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
> index fd890a57172e..84102bac03cc 100644
> --- a/drivers/virt/gunyah/vm_mgr.c
> +++ b/drivers/virt/gunyah/vm_mgr.c
> @@ -18,8 +18,16 @@
>   static void gh_vm_free(struct work_struct *work)
>   {
>   	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
> +	struct gh_vm_mem *mapping, *tmp;
>   	int ret;
>   
> +	mutex_lock(&ghvm->mm_lock);
> +	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> +		gh_vm_mem_reclaim(ghvm, mapping);
> +		kfree(mapping);
> +	}
> +	mutex_unlock(&ghvm->mm_lock);
> +
>   	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>   	if (ret)
>   		pr_warn("Failed to deallocate vmid: %d\n", ret);
> @@ -48,11 +56,46 @@ static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   	ghvm->vmid = vmid;
>   	ghvm->rm = rm;
>   
> +	mutex_init(&ghvm->mm_lock);
> +	INIT_LIST_HEAD(&ghvm->memory_mappings);
>   	INIT_WORK(&ghvm->free_work, gh_vm_free);
>   
>   	return ghvm;
>   }
>   
> +static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> +{
> +	struct gh_vm *ghvm = filp->private_data;
> +	void __user *argp = (void __user *)arg;
> +	long r;
> +
> +	switch (cmd) {
> +	case GH_VM_SET_USER_MEM_REGION: {
> +		struct gh_userspace_memory_region region;
> +
> +		if (copy_from_user(&region, argp, sizeof(region)))
> +			return -EFAULT;
> +
> +		/* All other flag bits are reserved for future use */
> +		if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC |
> +			GH_MEM_LENT))
> +			return -EINVAL;
> +
> +
> +		if (region.memory_size)

Would there be any value in allowing a zero-size memory
region to be created?  Maybe that doesn't make sense, but
I guess I'm questioning whether giving a zero memory region
size special meaning in this interface is a good thing to
do.  You could sensibly have a separate REMOVE_USER_MEM_REGION
request, and still permit 0 to be a valid size.

> +			r = gh_vm_mem_alloc(ghvm, &region);
> +		else
> +			r = gh_vm_mem_free(ghvm, region.label);
> +		break;
> +	}
> +	default:
> +		r = -ENOTTY;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
>   static int gh_vm_release(struct inode *inode, struct file *filp)
>   {
>   	struct gh_vm *ghvm = filp->private_data;
> @@ -65,6 +108,7 @@ static int gh_vm_release(struct inode *inode, struct file *filp)
>   }
>   
>   static const struct file_operations gh_vm_fops = {
> +	.unlocked_ioctl = gh_vm_ioctl,
>   	.release = gh_vm_release,
>   	.compat_ioctl	= compat_ptr_ioctl,
>   	.llseek = noop_llseek,
> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
> index 76954da706e9..97bc00c34878 100644
> --- a/drivers/virt/gunyah/vm_mgr.h
> +++ b/drivers/virt/gunyah/vm_mgr.h
> @@ -7,16 +7,41 @@
>   #define _GH_PRIV_VM_MGR_H
>   
>   #include <linux/gunyah_rsc_mgr.h>
> +#include <linux/list.h>
> +#include <linux/miscdevice.h>
> +#include <linux/mutex.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
>   long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, unsigned long arg);
>   
> +enum gh_vm_mem_share_type {
> +	VM_MEM_SHARE,
> +	VM_MEM_LEND,

Are there any other share types anticipated?  Even if
there were, for now you could use a Boolean to distinguish
between shared or lent (at least until a third option
materializes).

> +};
> +
> +struct gh_vm_mem {
> +	struct list_head list;
> +	enum gh_vm_mem_share_type share_type;
> +	struct gh_rm_mem_parcel parcel;
> +
> +	__u64 guest_phys_addr;
> +	struct page **pages;
> +	unsigned long npages;
> +};
> +
>   struct gh_vm {
>   	u16 vmid;
>   	struct gh_rm *rm;
>   
>   	struct work_struct free_work;
> +	struct mutex mm_lock;
> +	struct list_head memory_mappings;
>   };
>   
> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region);
> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
> +
>   #endif
> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
> new file mode 100644
> index 000000000000..03e71a36ea3b
> --- /dev/null
> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
> @@ -0,0 +1,235 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
> + */
> +
> +#define pr_fmt(fmt) "gh_vm_mgr: " fmt
> +
> +#include <linux/gunyah_rsc_mgr.h>
> +#include <linux/mm.h>
> +
> +#include <uapi/linux/gunyah.h>
> +
> +#include "vm_mgr.h"
> +
> +static inline bool page_contiguous(phys_addr_t p, phys_addr_t t)

Is there not some existing function that captures this?
In any case, it's used in one place and I think it would
be clearer to just put the logic there rather than hiding
it behind this function.

> +{
> +	return t - p == PAGE_SIZE;
> +}
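
I.e. the coalescing loop would just read (adapted from this patch):

		if (curr_page - prev_page == PAGE_SIZE) {
			entry_size += PAGE_SIZE;
		} else {
			mem_entries[j].size = cpu_to_le64(entry_size);
			j++;
			mem_entries[j].ipa_base = cpu_to_le64(curr_page);
			entry_size = PAGE_SIZE;
		}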
> +
> +static struct gh_vm_mem *__gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
> +	__must_hold(&ghvm->mm_lock)
> +{
> +	struct gh_vm_mem *mapping;
> +
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list)
> +		if (mapping->parcel.label == label)
> +			return mapping;
> +
> +	return NULL;
> +}
> +

. . .

> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index 10ba32d2b0a6..d85d12119a48 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -20,4 +20,37 @@
>    */
>   #define GH_CREATE_VM			_IO(GH_IOCTL_TYPE, 0x0) /* Returns a Gunyah VM fd */
>   
> +/*
> + * ioctls for VM fds
> + */
> +
> +/**
> + * struct gh_userspace_memory_region - Userspace memory descripion for GH_VM_SET_USER_MEM_REGION
> + * @label: Unique identifer to the region.

Maybe this is described somewhere, but what is the purpose
of the label?  Who uses it?  Is it meant to be a value
only the current owner of a resource understands?  Or does
the resource manager use it internally, or what?

> + * @flags: Flags for memory parcel behavior
> + * @guest_phys_addr: Location of the memory region in guest's memory space (page-aligned)
> + * @memory_size: Size of the region (page-aligned)
> + * @userspace_addr: Location of the memory region in caller (userspace)'s memory
> + *
> + * See Documentation/virt/gunyah/vm-manager.rst for further details.
> + */
> +struct gh_userspace_memory_region {
> +	__u32 label;

Define the possible permission values separate from
the structure.

					-Alex

> +#define GH_MEM_ALLOW_READ	(1UL << 0)
> +#define GH_MEM_ALLOW_WRITE	(1UL << 1)
> +#define GH_MEM_ALLOW_EXEC	(1UL << 2)
> +/*
> + * The guest will be lent the memory instead of shared.
> + * In other words, the guest has exclusive access to the memory region and the host loses access.
> + */
> +#define GH_MEM_LENT		(1UL << 3)
> +	__u32 flags;
> +	__u64 guest_phys_addr;
> +	__u64 memory_size;
> +	__u64 userspace_addr;
> +};
> +
> +#define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
> +						struct gh_userspace_memory_region)
> +
>   #endif
  
Elliot Berman Feb. 24, 2023, 12:43 a.m. UTC | #7
On 2/21/2023 4:28 AM, Srinivas Kandagatla wrote:
> 
> 
> On 14/02/2023 21:24, Elliot Berman wrote:
>>
>> When launching a virtual machine, Gunyah userspace allocates memory for
>> the guest and informs Gunyah about these memory regions through
>> SET_USER_MEMORY_REGION ioctl.
>>
>> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>>   drivers/virt/gunyah/Makefile    |   2 +-
>>   drivers/virt/gunyah/vm_mgr.c    |  44 ++++++
>>   drivers/virt/gunyah/vm_mgr.h    |  25 ++++
>>   drivers/virt/gunyah/vm_mgr_mm.c | 235 ++++++++++++++++++++++++++++++++
>>   include/uapi/linux/gunyah.h     |  33 +++++
>>   5 files changed, 338 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
>>
>> diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
>> index 03951cf82023..ff8bc4925392 100644
>> --- a/drivers/virt/gunyah/Makefile
>> +++ b/drivers/virt/gunyah/Makefile
>> @@ -2,5 +2,5 @@
>>   obj-$(CONFIG_GUNYAH) += gunyah.o
>> -gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
>> +gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
>>   obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
>> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
>> index fd890a57172e..84102bac03cc 100644
>> --- a/drivers/virt/gunyah/vm_mgr.c
>> +++ b/drivers/virt/gunyah/vm_mgr.c
>> @@ -18,8 +18,16 @@
>>   static void gh_vm_free(struct work_struct *work)
>>   {
>>       struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>> +    struct gh_vm_mem *mapping, *tmp;
>>       int ret;
>> +    mutex_lock(&ghvm->mm_lock);
>> +    list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, 
>> list) {
>> +        gh_vm_mem_reclaim(ghvm, mapping);
>> +        kfree(mapping);
>> +    }
>> +    mutex_unlock(&ghvm->mm_lock);
>> +
>>       ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>>       if (ret)
>>           pr_warn("Failed to deallocate vmid: %d\n", ret);
>> @@ -48,11 +56,46 @@ static __must_check struct gh_vm 
>> *gh_vm_alloc(struct gh_rm *rm)
>>       ghvm->vmid = vmid;
>>       ghvm->rm = rm;
>> +    mutex_init(&ghvm->mm_lock);
>> +    INIT_LIST_HEAD(&ghvm->memory_mappings);
>>       INIT_WORK(&ghvm->free_work, gh_vm_free);
>>       return ghvm;
>>   }
>> +static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned 
>> long arg)
>> +{
>> +    struct gh_vm *ghvm = filp->private_data;
>> +    void __user *argp = (void __user *)arg;
>> +    long r;
>> +
>> +    switch (cmd) {
>> +    case GH_VM_SET_USER_MEM_REGION: {
>> +        struct gh_userspace_memory_region region;
>> +
>> +        if (copy_from_user(&region, argp, sizeof(region)))
>> +            return -EFAULT;
>> +
>> +        /* All other flag bits are reserved for future use */
>> +        if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | 
>> GH_MEM_ALLOW_EXEC |
>> +            GH_MEM_LENT))
>> +            return -EINVAL;
>> +
>> +
>> +        if (region.memory_size)
>> +            r = gh_vm_mem_alloc(ghvm, &region);
>> +        else
>> +            r = gh_vm_mem_free(ghvm, region.label);
> 
> Looks like we are repurposing GH_VM_SET_USER_MEM_REGION for allocation 
> and freeing.
> 
> Should we have a corresponding GH_VM_UN_SET_USER_MEM_REGION instead for 
> freeing, given that label is the only relevant member of struct 
> gh_userspace_memory_region in the free case?
> 
> 

I'm following the convention of KVM here, which re-uses 
KVM_SET_USER_MEMORY_REGION for deleting regions as well.

One question though --

We don't have a need to support removal of memory regions while the VM is 
running. Gunyah rejects removal of parcels that haven't been released, 
and no guests currently support releasing a memory parcel while running. 
With the current series, the only time memory parcels can be reclaimed is 
when the VM is being disposed of after shutdown. With that in mind, shall 
I drop the removal of memory regions in v11? I had added it for 
symmetry/completeness, but I'm holding GH_VM_DESTROY for now as well [1].

[1]: 
https://lore.kernel.org/all/52d944b1-3ea6-26b7-766a-2fed05dccf3a@linaro.org/

>> +        break;
>> +    }
>> +    default:
>> +        r = -ENOTTY;
>> +        break;
>> +    }
>> +
>> +    return r;
>> +}
>> +
>>   static int gh_vm_release(struct inode *inode, struct file *filp)
>>   {
>>       struct gh_vm *ghvm = filp->private_data;
>> @@ -65,6 +108,7 @@ static int gh_vm_release(struct inode *inode, 
>> struct file *filp)
>>   }
>>   static const struct file_operations gh_vm_fops = {
>> +    .unlocked_ioctl = gh_vm_ioctl,
>>       .release = gh_vm_release,
>>       .compat_ioctl    = compat_ptr_ioctl,
>>       .llseek = noop_llseek,
>> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
>> index 76954da706e9..97bc00c34878 100644
>> --- a/drivers/virt/gunyah/vm_mgr.h
>> +++ b/drivers/virt/gunyah/vm_mgr.h
>> @@ -7,16 +7,41 @@
>>   #define _GH_PRIV_VM_MGR_H
>>   #include <linux/gunyah_rsc_mgr.h>
>> +#include <linux/list.h>
>> +#include <linux/miscdevice.h>
>> +#include <linux/mutex.h>
>>   #include <uapi/linux/gunyah.h>
>>   long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, 
>> unsigned long arg);
>> +enum gh_vm_mem_share_type {
>> +    VM_MEM_SHARE,
>> +    VM_MEM_LEND,
>> +};
>> +
>> +struct gh_vm_mem {
>> +    struct list_head list;
>> +    enum gh_vm_mem_share_type share_type;
>> +    struct gh_rm_mem_parcel parcel;
>> +
>> +    __u64 guest_phys_addr;
>> +    struct page **pages;
>> +    unsigned long npages;
>> +};
>> +
>>   struct gh_vm {
>>       u16 vmid;
>>       struct gh_rm *rm;
>>       struct work_struct free_work;
>> +    struct mutex mm_lock;
>> +    struct list_head memory_mappings;
>>   };
>> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct 
>> gh_userspace_memory_region *region);
>> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
>> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
>> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
>> +
>>   #endif
>> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c 
>> b/drivers/virt/gunyah/vm_mgr_mm.c
>> new file mode 100644
>> index 000000000000..03e71a36ea3b
>> --- /dev/null
>> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
>> @@ -0,0 +1,235 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All 
>> rights reserved.
>> + */
>> +
>> +#define pr_fmt(fmt) "gh_vm_mgr: " fmt
>> +
>> +#include <linux/gunyah_rsc_mgr.h>
>> +#include <linux/mm.h>
>> +
>> +#include <uapi/linux/gunyah.h>
>> +
>> +#include "vm_mgr.h"
>> +
>> +static inline bool page_contiguous(phys_addr_t p, phys_addr_t t)
>> +{
>> +    return t - p == PAGE_SIZE;
>> +}
>> +
>> +static struct gh_vm_mem *__gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
>> +    __must_hold(&ghvm->mm_lock)
>> +{
>> +    struct gh_vm_mem *mapping;
>> +
>> +    list_for_each_entry(mapping, &ghvm->memory_mappings, list)
>> +        if (mapping->parcel.label == label)
>> +            return mapping;
>> +
>> +    return NULL;
>> +}
>> +
>> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
>> +    __must_hold(&ghvm->mm_lock)
>> +{
>> +    int i, ret = 0;
>> +
>> +    if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
>> +        ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
>> +        if (ret)
>> +            pr_warn("Failed to reclaim memory parcel for label %d: 
>> %d\n",
>> +                mapping->parcel.label, ret);
> 
> What is the behavior of the hypervisor if we fail to reclaim the pages?
> 

The hypervisor doesn't modify access to the pages.

>> +    }
>> +
>> +    if (!ret)
> So we will leave the user pages pinned if the hypervisor call fails, but 
> further down we free the mapping altogether.
> 
> I'm not 100% sure if this will have any side effects, but is it okay to 
> leave user pages pinned with no possibility of unpinning them in such cases?
> 

I think it's not okay, but the only way this could fail is if there is a 
kernel bug. I'd rather not BUG_ON?

> 
>> +        for (i = 0; i < mapping->npages; i++)
>> +            unpin_user_page(mapping->pages[i]);
>> +
>> +    kfree(mapping->pages);
>> +    kfree(mapping->parcel.acl_entries);
>> +    kfree(mapping->parcel.mem_entries);
>> +
>> +    list_del(&mapping->list);
>> +}
>> +
>> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
>> +{
>> +    struct gh_vm_mem *mapping;
>> +    int ret;
>> +
>> +    ret = mutex_lock_interruptible(&ghvm->mm_lock);
>> +    if (ret)
>> +        return ERR_PTR(ret);
> new line would be nice here.
> 
>> +    mapping = __gh_vm_mem_find(ghvm, label);
>> +    mutex_unlock(&ghvm->mm_lock);
> new line would be nice here.
> 
>> +    return mapping ? : ERR_PTR(-ENODEV);
>> +}
>> +
>> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct 
>> gh_userspace_memory_region *region)
>> +{
>> +    struct gh_vm_mem *mapping, *tmp_mapping;
>> +    struct gh_rm_mem_entry *mem_entries;
>> +    phys_addr_t curr_page, prev_page;
>> +    struct gh_rm_mem_parcel *parcel;
>> +    int i, j, pinned, ret = 0;
>> +    size_t entry_size;
>> +    u16 vmid;
>> +
>> +    if (!gh_api_has_feature(GH_API_FEATURE_MEMEXTENT))
>> +        return -EOPNOTSUPP;
> 
> Should this not be the first thing to do in the ioctl, before even entering 
> this function?
> 

I don't see why one place is better than the other, but I can move it.

>> +
>> +    if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
>> +        !PAGE_ALIGNED(region->userspace_addr) || 
>> !PAGE_ALIGNED(region->guest_phys_addr))
>> +        return -EINVAL;
>> +
>> +    ret = mutex_lock_interruptible(&ghvm->mm_lock);
>> +    if (ret)
>> +        return ret;
> new line.
> 
>> +    mapping = __gh_vm_mem_find(ghvm, region->label);
>> +    if (mapping) {
>> +        mutex_unlock(&ghvm->mm_lock);
>> +        return -EEXIST;
>> +    }
>> +
>> +    mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
>> +    if (!mapping) {
>> +        ret = -ENOMEM;
>> +        goto free_mapping;
> 
> how about,
> 
> mutex_unlock(&ghvm->mm_lock);
> return -ENOMEM;
> 
>> +    }
>> +
>> +    mapping->parcel.label = region->label;
>> +    mapping->guest_phys_addr = region->guest_phys_addr;
>> +    mapping->npages = region->memory_size >> PAGE_SHIFT;
>> +    parcel = &mapping->parcel;
>> +    parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later 
>> by mem_share/mem_lend */
>> +    parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
>> +
>> +    /* Check for overlap */
>> +    list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
>> +        if (!((mapping->guest_phys_addr + (mapping->npages << 
>> PAGE_SHIFT) <=
>> +            tmp_mapping->guest_phys_addr) ||
>> +            (mapping->guest_phys_addr >=
>> +            tmp_mapping->guest_phys_addr + (tmp_mapping->npages << 
>> PAGE_SHIFT)))) {
>> +            ret = -EEXIST;
>> +            goto free_mapping;
>> +        }
>> +    }
>> +
>> +    list_add(&mapping->list, &ghvm->memory_mappings);
>> +
>> +    mapping->pages = kcalloc(mapping->npages, 
>> sizeof(*mapping->pages), GFP_KERNEL);
>> +    if (!mapping->pages) {
>> +        ret = -ENOMEM;
>> +        mapping->npages = 0; /* update npages for reclaim */
>> +        goto reclaim;
>> +    }
>> +
>> +    pinned = pin_user_pages_fast(region->userspace_addr, 
>> mapping->npages,
>> +                    FOLL_WRITE | FOLL_LONGTERM, mapping->pages);
>> +    if (pinned < 0) {
>> +        ret = pinned;
>> +        mapping->npages = 0; /* update npages for reclaim */
>> +        goto reclaim;
>> +    } else if (pinned != mapping->npages) {
>> +        ret = -EFAULT;
>> +        mapping->npages = pinned; /* update npages for reclaim */
>> +        goto reclaim;
>> +    }
>> +
>> +    if (region->flags & GH_MEM_LENT) {
>> +        parcel->n_acl_entries = 1;
>> +        mapping->share_type = VM_MEM_LEND;
>> +    } else {
>> +        parcel->n_acl_entries = 2;
>> +        mapping->share_type = VM_MEM_SHARE;
>> +    }
>> +    parcel->acl_entries = kcalloc(parcel->n_acl_entries, 
>> sizeof(*parcel->acl_entries),
>> +                    GFP_KERNEL);
>> +    if (!parcel->acl_entries) {
>> +        ret = -ENOMEM;
>> +        goto reclaim;
>> +    }
>> +
>> +    parcel->acl_entries[0].vmid = cpu_to_le16(ghvm->vmid);
> new line
>> +    if (region->flags & GH_MEM_ALLOW_READ)
>> +        parcel->acl_entries[0].perms |= GH_RM_ACL_R;
>> +    if (region->flags & GH_MEM_ALLOW_WRITE)
>> +        parcel->acl_entries[0].perms |= GH_RM_ACL_W;
>> +    if (region->flags & GH_MEM_ALLOW_EXEC)
>> +        parcel->acl_entries[0].perms |= GH_RM_ACL_X;
>> +
>> +    if (mapping->share_type == VM_MEM_SHARE) {
>> +        ret = gh_rm_get_vmid(ghvm->rm, &vmid);
>> +        if (ret)
>> +            goto reclaim;
>> +
>> +        parcel->acl_entries[1].vmid = cpu_to_le16(vmid);
>> +        /* Host assumed to have all these permissions. Gunyah will not
>> +         * grant new permissions if host actually had less than RWX
>> +         */
>> +        parcel->acl_entries[1].perms |= GH_RM_ACL_R | GH_RM_ACL_W | 
>> GH_RM_ACL_X;
>> +    }
>> +
>> +    mem_entries = kcalloc(mapping->npages, sizeof(*mem_entries), 
>> GFP_KERNEL);
>> +    if (!mem_entries) {
>> +        ret = -ENOMEM;
>> +        goto reclaim;
>> +    }
>> +
>> +    /* reduce number of entries by combining contiguous pages into 
>> single memory entry */
>> +    prev_page = page_to_phys(mapping->pages[0]);
>> +    mem_entries[0].ipa_base = cpu_to_le64(prev_page);
>> +    entry_size = PAGE_SIZE;
> new line
>> +    for (i = 1, j = 0; i < mapping->npages; i++) {
>> +        curr_page = page_to_phys(mapping->pages[i]);
>> +        if (page_contiguous(prev_page, curr_page)) {
>> +            entry_size += PAGE_SIZE;
>> +        } else {
>> +            mem_entries[j].size = cpu_to_le64(entry_size);
>> +            j++;
>> +            mem_entries[j].ipa_base = cpu_to_le64(curr_page);
>> +            entry_size = PAGE_SIZE;
>> +        }
>> +
>> +        prev_page = curr_page;
>> +    }
>> +    mem_entries[j].size = cpu_to_le64(entry_size);
>> +
>> +    parcel->n_mem_entries = j + 1;
>> +    parcel->mem_entries = kmemdup(mem_entries, sizeof(*mem_entries) * 
>> parcel->n_mem_entries,
>> +                    GFP_KERNEL);
>> +    kfree(mem_entries);
>> +    if (!parcel->mem_entries) {
>> +        ret = -ENOMEM;
>> +        goto reclaim;
>> +    }
>> +
>> +    mutex_unlock(&ghvm->mm_lock);
>> +    return 0;
>> +reclaim:
>> +    gh_vm_mem_reclaim(ghvm, mapping);
>> +free_mapping:
>> +    kfree(mapping);
>> +    mutex_unlock(&ghvm->mm_lock);
>> +    return ret;
>> +}
>> +
>> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label)
>> +{
>> +    struct gh_vm_mem *mapping;
>> +    int ret;
>> +
>> +    ret = mutex_lock_interruptible(&ghvm->mm_lock);
>> +    if (ret)
>> +        return ret;
>> +
>> +    mapping = __gh_vm_mem_find(ghvm, label);
>> +    if (!mapping)
>> +        goto out;
>> +
>> +    gh_vm_mem_reclaim(ghvm, mapping);
>> +    kfree(mapping);
>> +out:
>> +    mutex_unlock(&ghvm->mm_lock);
>> +    return ret;
>> +}
>> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
>> index 10ba32d2b0a6..d85d12119a48 100644
>> --- a/include/uapi/linux/gunyah.h
>> +++ b/include/uapi/linux/gunyah.h
>> @@ -20,4 +20,37 @@
>>    */
>>   #define GH_CREATE_VM            _IO(GH_IOCTL_TYPE, 0x0) /* Returns a 
>> Gunyah VM fd */
>> +/*
>> + * ioctls for VM fds
>> + */
>> +
>> +/**
>> + * struct gh_userspace_memory_region - Userspace memory descripion 
>> for GH_VM_SET_USER_MEM_REGION
>> + * @label: Unique identifer to the region.
>> + * @flags: Flags for memory parcel behavior
>> + * @guest_phys_addr: Location of the memory region in guest's memory 
>> space (page-aligned)
> 
> Note about overlapping here would be useful.
> 

I'd like to reduce duplicate documentation where possible. I was 
generally following this procedure:
  - include/uapi/linux/gunyah.h docstrings have basic information to 
remind what the field is
  - Documentation/virt/gunyah/ documentation explains how to properly 
use the APIs

I think it's definitely good idea to have separate documentation beyond 
what can be described in docstrings here.

Thanks,
Elliot
  
Fuad Tabba Feb. 24, 2023, 10:19 a.m. UTC | #8
Hi,

On Tue, Feb 14, 2023 at 9:26 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
>
>
> When launching a virtual machine, Gunyah userspace allocates memory for
> the guest and informs Gunyah about these memory regions through
> SET_USER_MEMORY_REGION ioctl.

I'm working on pKVM [1], and regarding the problem of donating private
memory to a guest, we and others working on confidential computing
have faced a similar issue that this patch is trying to address. In
pKVM, we've initially taken an approach similar to the one here by
pinning the pages being donated to prevent swapping or migration [2].
However, we've encountered issues with this approach since the memory
is still mapped by the host, which could cause the system to crash on
an errant access.

Instead, we've been working on adopting an fd-based restricted memory
approach that was initially proposed for TDX [3] and is now being
considered by others in the confidential computing space as well
(e.g., Arm CCA [4]). The basic idea is that the host manages the guest
memory via a file descriptor instead of a userspace address. It cannot
map that memory (unless explicitly shared by the guest [5]),
eliminating the possibility of the host trying to access private
memory accidentally or being tricked by a malicious actor. This is
based on memfd with some restrictions. It handles swapping and
migration by disallowing them (for now [6]), and adds a new type of
memory region to KVM to accommodate having an fd representing guest
memory.
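
A rough sketch of the shape of that interface, going by [3] (the syscall
name and the fd-based region registration are from the proposal and were
still evolving on the list at the time of writing):

	int fd = memfd_restricted(0);	/* the host cannot mmap() this fd */

	/* the guest memory region is then registered as (fd, offset)
	 * instead of a host userspace address */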

Although the fd-based restricted memory isn't upstream yet, we've
ported the latest patches to arm64 and made changes and additions to
make it work with pKVM, to test it and see if the solution is feasible
for us (it is). I wanted to mention this work in case you find it
useful, and in the hopes that we can all work on confidential
computing using the same interfaces as much as possible.

Some comments inline below...

Cheers,
/fuad

[1] https://lore.kernel.org/kvmarm/20220519134204.5379-1-will@kernel.org/
[2] https://lore.kernel.org/kvmarm/20220519134204.5379-34-will@kernel.org/
[3] https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com/
[4] https://lore.kernel.org/lkml/20230127112932.38045-1-steven.price@arm.com/
[5] This is a modification we've done for the arm64 port, after
discussing it with the original authors.
[6] Nothing inherent in the proposal to stop migration and swapping.
There are some technical issues that need to be resolved.

<snip>

> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
> +{
> +       struct gh_vm_mem *mapping, *tmp_mapping;
> +       struct gh_rm_mem_entry *mem_entries;
> +       phys_addr_t curr_page, prev_page;
> +       struct gh_rm_mem_parcel *parcel;
> +       int i, j, pinned, ret = 0;
> +       size_t entry_size;
> +       u16 vmid;
> +
> +       if (!gh_api_has_feature(GH_API_FEATURE_MEMEXTENT))
> +               return -EOPNOTSUPP;
> +
> +       if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
> +               !PAGE_ALIGNED(region->userspace_addr) || !PAGE_ALIGNED(region->guest_phys_addr))
> +               return -EINVAL;
> +
> +       ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +       if (ret)
> +               return ret;
> +       mapping = __gh_vm_mem_find(ghvm, region->label);
> +       if (mapping) {
> +               mutex_unlock(&ghvm->mm_lock);
> +               return -EEXIST;
> +       }
> +
> +       mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
> +       if (!mapping) {
> +               ret = -ENOMEM;
> +               goto free_mapping;
> +       }
> +
> +       mapping->parcel.label = region->label;
> +       mapping->guest_phys_addr = region->guest_phys_addr;
> +       mapping->npages = region->memory_size >> PAGE_SHIFT;
> +       parcel = &mapping->parcel;
> +       parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
> +       parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
> +
> +       /* Check for overlap */
> +       list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
> +               if (!((mapping->guest_phys_addr + (mapping->npages << PAGE_SHIFT) <=
> +                       tmp_mapping->guest_phys_addr) ||
> +                       (mapping->guest_phys_addr >=
> +                       tmp_mapping->guest_phys_addr + (tmp_mapping->npages << PAGE_SHIFT)))) {
> +                       ret = -EEXIST;
> +                       goto free_mapping;
> +               }
> +       }
> +
> +       list_add(&mapping->list, &ghvm->memory_mappings);
> +
> +       mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL);
> +       if (!mapping->pages) {
> +               ret = -ENOMEM;
> +               mapping->npages = 0; /* update npages for reclaim */
> +               goto reclaim;
> +       }

These pages should be accounted for as locked pages, e.g.,
account_locked_vm(), which would also ensure that the process hasn't
reached its limit.
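
E.g. (sketch):

	/* fails if RLIMIT_MEMLOCK would be exceeded (absent CAP_IPC_LOCK) */
	ret = account_locked_vm(current->mm, mapping->npages, true);
	if (ret) {
		mapping->npages = 0; /* update npages for reclaim */
		goto reclaim;
	}

with a matching account_locked_vm(current->mm, mapping->npages, false) in
the reclaim path.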

> +       pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
> +                                       FOLL_WRITE | FOLL_LONGTERM, mapping->pages);

It might be good to check and avoid donating pages with pre-faulted
file mappings since it might trigger a writeback of a page after
losing access to it. Ideally, you want to only accept anonymous or
shmem pages. In pKVM, we check that the pages are SwapBacked and
reject the pinning/donation otherwise [2].
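
E.g. (sketch, after the pin has succeeded):

	for (i = 0; i < pinned; i++) {
		if (!PageSwapBacked(mapping->pages[i])) {
			ret = -EINVAL;
			goto reclaim;
		}
	}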

> +       if (pinned < 0) {
> +               ret = pinned;
> +               mapping->npages = 0; /* update npages for reclaim */
> +               goto reclaim;
> +       } else if (pinned != mapping->npages) {
> +               ret = -EFAULT;
> +               mapping->npages = pinned; /* update npages for reclaim */
> +               goto reclaim;
> +       }
> +
> +       if (region->flags & GH_MEM_LENT) {
> +               parcel->n_acl_entries = 1;
> +               mapping->share_type = VM_MEM_LEND;
> +       } else {
> +               parcel->n_acl_entries = 2;
> +               mapping->share_type = VM_MEM_SHARE;
> +       }
> +       parcel->acl_entries = kcalloc(parcel->n_acl_entries, sizeof(*parcel->acl_entries),
> +                                       GFP_KERNEL);
> +       if (!parcel->acl_entries) {
> +               ret = -ENOMEM;
> +               goto reclaim;
> +       }
> +
> +       parcel->acl_entries[0].vmid = cpu_to_le16(ghvm->vmid);
> +       if (region->flags & GH_MEM_ALLOW_READ)
> +               parcel->acl_entries[0].perms |= GH_RM_ACL_R;
> +       if (region->flags & GH_MEM_ALLOW_WRITE)
> +               parcel->acl_entries[0].perms |= GH_RM_ACL_W;
> +       if (region->flags & GH_MEM_ALLOW_EXEC)
> +               parcel->acl_entries[0].perms |= GH_RM_ACL_X;
> +
> +       if (mapping->share_type == VM_MEM_SHARE) {
> +               ret = gh_rm_get_vmid(ghvm->rm, &vmid);
> +               if (ret)
> +                       goto reclaim;
> +
> +               parcel->acl_entries[1].vmid = cpu_to_le16(vmid);
> +               /* Host assumed to have all these permissions. Gunyah will not
> +                * grant new permissions if host actually had less than RWX
> +                */
> +               parcel->acl_entries[1].perms |= GH_RM_ACL_R | GH_RM_ACL_W | GH_RM_ACL_X;
> +       }
> +
> +       mem_entries = kcalloc(mapping->npages, sizeof(*mem_entries), GFP_KERNEL);
> +       if (!mem_entries) {
> +               ret = -ENOMEM;
> +               goto reclaim;
> +       }
> +
> +       /* reduce number of entries by combining contiguous pages into single memory entry */
> +       prev_page = page_to_phys(mapping->pages[0]);
> +       mem_entries[0].ipa_base = cpu_to_le64(prev_page);
> +       entry_size = PAGE_SIZE;
> +       for (i = 1, j = 0; i < mapping->npages; i++) {
> +               curr_page = page_to_phys(mapping->pages[i]);
> +               if (page_contiguous(prev_page, curr_page)) {
> +                       entry_size += PAGE_SIZE;
> +               } else {
> +                       mem_entries[j].size = cpu_to_le64(entry_size);
> +                       j++;
> +                       mem_entries[j].ipa_base = cpu_to_le64(curr_page);
> +                       entry_size = PAGE_SIZE;
> +               }
> +
> +               prev_page = curr_page;
> +       }
> +       mem_entries[j].size = cpu_to_le64(entry_size);
> +
> +       parcel->n_mem_entries = j + 1;
> +       parcel->mem_entries = kmemdup(mem_entries, sizeof(*mem_entries) * parcel->n_mem_entries,
> +                                       GFP_KERNEL);
> +       kfree(mem_entries);
> +       if (!parcel->mem_entries) {
> +               ret = -ENOMEM;
> +               goto reclaim;
> +       }
> +
> +       mutex_unlock(&ghvm->mm_lock);
> +       return 0;
> +reclaim:
> +       gh_vm_mem_reclaim(ghvm, mapping);
> +free_mapping:
> +       kfree(mapping);
> +       mutex_unlock(&ghvm->mm_lock);
> +       return ret;
> +}
> +
> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label)
> +{
> +       struct gh_vm_mem *mapping;
> +       int ret;
> +
> +       ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +       if (ret)
> +               return ret;
> +
> +       mapping = __gh_vm_mem_find(ghvm, label);
> +       if (!mapping)
> +               goto out;
> +
> +       gh_vm_mem_reclaim(ghvm, mapping);
> +       kfree(mapping);
> +out:
> +       mutex_unlock(&ghvm->mm_lock);
> +       return ret;
> +}
> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index 10ba32d2b0a6..d85d12119a48 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -20,4 +20,37 @@
>   */
>  #define GH_CREATE_VM                   _IO(GH_IOCTL_TYPE, 0x0) /* Returns a Gunyah VM fd */
>
> +/*
> + * ioctls for VM fds
> + */
> +
> +/**
> + * struct gh_userspace_memory_region - Userspace memory descripion for GH_VM_SET_USER_MEM_REGION

nit: s/descripion/description

> + * @label: Unique identifer to the region.

nit: s/identifer/identifier




> + * @flags: Flags for memory parcel behavior
> + * @guest_phys_addr: Location of the memory region in guest's memory space (page-aligned)
> + * @memory_size: Size of the region (page-aligned)
> + * @userspace_addr: Location of the memory region in caller (userspace)'s memory
> + *
> + * See Documentation/virt/gunyah/vm-manager.rst for further details.
> + */
> +struct gh_userspace_memory_region {
> +       __u32 label;
> +#define GH_MEM_ALLOW_READ      (1UL << 0)
> +#define GH_MEM_ALLOW_WRITE     (1UL << 1)
> +#define GH_MEM_ALLOW_EXEC      (1UL << 2)
> +/*
> + * The guest will be lent the memory instead of shared.
> + * In other words, the guest has exclusive access to the memory region and the host loses access.
> + */
> +#define GH_MEM_LENT            (1UL << 3)
> +       __u32 flags;
> +       __u64 guest_phys_addr;
> +       __u64 memory_size;
> +       __u64 userspace_addr;
> +};
> +
> +#define GH_VM_SET_USER_MEM_REGION      _IOW(GH_IOCTL_TYPE, 0x1, \
> +                                               struct gh_userspace_memory_region)
> +
>  #endif
> --
> 2.39.1
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
  
Srinivas Kandagatla Feb. 24, 2023, 10:36 a.m. UTC | #9
On 24/02/2023 00:43, Elliot Berman wrote:
>>> +/*
>>> + * ioctls for VM fds
>>> + */
>>> +
>>> +/**
>>> + * struct gh_userspace_memory_region - Userspace memory descripion 
>>> for GH_VM_SET_USER_MEM_REGION
>>> + * @label: Unique identifer to the region.
>>> + * @flags: Flags for memory parcel behavior
>>> + * @guest_phys_addr: Location of the memory region in guest's memory 
>>> space (page-aligned)
>>
>> Note about overlapping here would be useful.
>>
> 
> I'd like to reduce duplicate documentation where possible. I was 
This is exactly what .rst files can provide.

If you have proper kernel-doc style documentation in header/source 
files, it can be used directly from .rst files.

The reStructuredText (.rst) files may contain directives to include 
structured documentation comments, or kernel-doc comments, from source 
files.

ex:
.. kernel-doc:: include/linux/gunyah.h
    :internal:


--srini
> generally following this procedure:
>   - include/uapi/linux/gunyah.h docstrings have basic information to 
> remind what the field is
>   - Documentation/virt/gunyah/ documentation explains how to properly 
> use the APIs
> 
> I think it's definitely good idea to have separate documentation beyond 
> what can be described in docstrings here.
> 
> Thanks,
> Elliot
  
Elliot Berman Feb. 24, 2023, 6:08 p.m. UTC | #10
On 2/24/2023 2:19 AM, Fuad Tabba wrote:
> Hi,
> 
> On Tue, Feb 14, 2023 at 9:26 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
>>
>>
>> When launching a virtual machine, Gunyah userspace allocates memory for
>> the guest and informs Gunyah about these memory regions through
>> SET_USER_MEMORY_REGION ioctl.
> 
> I'm working on pKVM [1], and regarding the problem of donating private
> memory to a guest, we and others working on confidential computing
> have faced a similar issue that this patch is trying to address. In
> pKVM, we've initially taken an approach similar to the one here by
> pinning the pages being donated to prevent swapping or migration [2].
> However, we've encountered issues with this approach since the memory
> is still mapped by the host, which could cause the system to crash on
> an errant access.
> 
> Instead, we've been working on adopting an fd-based restricted memory
> approach that was initially proposed for TDX [3] and is now being
> considered by others in the confidential computing space as well
> (e.g., Arm CCA [4]). The basic idea is that the host manages the guest
> memory via a file descriptor instead of a userspace address. It cannot
> map that memory (unless explicitly shared by the guest [5]),
> eliminating the possibility of the host trying to access private
> memory accidentally or being tricked by a malicious actor. This is
> based on memfd with some restrictions. It handles swapping and
> migration by disallowing them (for now [6]), and adds a new type of
> memory region to KVM to accommodate having an fd representing guest
> memory.
> 
> Although the fd-based restricted memory isn't upstream yet, we've
> ported the latest patches to arm64 and made changes and additions to
> make it work with pKVM, to test it and see if the solution is feasible
> for us (it is). I wanted to mention this work in case you find it
> useful, and in the hopes that we can all work on confidential
> computing using the same interfaces as much as possible.

Thanks for highlighting the memfd_restricted changes to us! We'll 
investigate how/if it can suit Gunyah use cases. It sounds like you 
might've made memfd_restricted changes as well? Are those posted on the 
mailing lists? Also, are example userspace (crosvm?) changes posted?

Thanks,
Elliot

> 
> Some comments inline below...
> 
> Cheers,
> /fuad
> 
> [1] https://lore.kernel.org/kvmarm/20220519134204.5379-1-will@kernel.org/
> [2] https://lore.kernel.org/kvmarm/20220519134204.5379-34-will@kernel.org/
> [3] https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com/
> [4] https://lore.kernel.org/lkml/20230127112932.38045-1-steven.price@arm.com/
> [5] This is a modification we've done for the arm64 port, after
> discussing it with the original authors.
> [6] Nothing inherent in the proposal to stop migration and swapping.
> There are some technical issues that need to be resolved.
> 
> <snip>
<snip, looking at comments in parallel>
  
Sean Christopherson Feb. 24, 2023, 6:58 p.m. UTC | #11
On Fri, Feb 24, 2023, Elliot Berman wrote:
> 
> 
> On 2/24/2023 2:19 AM, Fuad Tabba wrote:
> > Hi,
> > 
> > On Tue, Feb 14, 2023 at 9:26 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
> > > 
> > > 
> > > When launching a virtual machine, Gunyah userspace allocates memory for
> > > the guest and informs Gunyah about these memory regions through
> > > SET_USER_MEMORY_REGION ioctl.
> > 
> > I'm working on pKVM [1], and regarding the problem of donating private
> > memory to a guest, we and others working on confidential computing
> > have faced a similar issue that this patch is trying to address. In
> > pKVM, we've initially taken an approach similar to the one here by
> > pinning the pages being donated to prevent swapping or migration [2].
> > However, we've encountered issues with this approach since the memory
> > is still mapped by the host, which could cause the system to crash on
> > an errant access.
> > 
> > Instead, we've been working on adopting an fd-based restricted memory
> > approach that was initially proposed for TDX [3] and is now being
> > considered by others in the confidential computing space as well
> > (e.g., Arm CCA [4]). The basic idea is that the host manages the guest
> > memory via a file descriptor instead of a userspace address. It cannot
> > map that memory (unless explicitly shared by the guest [5]),
> > eliminating the possibility of the host trying to access private
> > memory accidentally or being tricked by a malicious actor. This is
> > based on memfd with some restrictions. It handles swapping and
> > migration by disallowing them (for now [6]), and adds a new type of
> > memory region to KVM to accommodate having an fd representing guest
> > memory.
> > 
> > Although the fd-based restricted memory isn't upstream yet, we've
> > ported the latest patches to arm64 and made changes and additions to
> > make it work with pKVM, to test it and see if the solution is feasible
> > for us (it is). I wanted to mention this work in case you find it
> > useful, and in the hopes that we can all work on confidential
> > computing using the same interfaces as much as possible.
> 
> Thanks for highlighting the memfd_restricted changes to us! We'll
> investigate how/if it can suit Gunyah use cases.

Can you provide Gunyah's requirements/rules and use cases as they relate to memory
management?  I agree with Fuad, this is pretty much exactly what memfd_restricted()
is intended to handle.  If Gunyah has a unique requirement or use case, it'd be
helpful to find out sooner than later.  E.g.

  1. What is the state of memory when it's accepted by a VM?  Is it undefined,
     i.e. the VM's responsibility to initialize?  If not, is it always
     zero-initialized or can memory be populated by the RM?

  2. When exclusive/private memory is reclaimed, can the VM's data be preserved,
     or is it unconditionally wiped?

  3. How frequently is memory transitioned, i.e. allocated/reclaimed?

  4. Are there assumptions and/or limitations on the size or granularity of
     memory objects?

  5. Can memory be shared by multiple VMs but _not_ be accessible from the RM?

  6. etc. :-)

Thanks!
  
Elliot Berman Feb. 25, 2023, 1:03 a.m. UTC | #12
On 2/23/2023 4:34 PM, Alex Elder wrote:
> On 2/14/23 3:24 PM, Elliot Berman wrote:
>>
>> When launching a virtual machine, Gunyah userspace allocates memory for
>> the guest and informs Gunyah about these memory regions through
>> SET_USER_MEMORY_REGION ioctl.
>>
>> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>>   drivers/virt/gunyah/Makefile    |   2 +-
>>   drivers/virt/gunyah/vm_mgr.c    |  44 ++++++
>>   drivers/virt/gunyah/vm_mgr.h    |  25 ++++
>>   drivers/virt/gunyah/vm_mgr_mm.c | 235 ++++++++++++++++++++++++++++++++
>>   include/uapi/linux/gunyah.h     |  33 +++++
>>   5 files changed, 338 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/virt/gunyah/vm_mgr_mm.c
>>
>> diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
>> index 03951cf82023..ff8bc4925392 100644
>> --- a/drivers/virt/gunyah/Makefile
>> +++ b/drivers/virt/gunyah/Makefile
>> @@ -2,5 +2,5 @@
>>   obj-$(CONFIG_GUNYAH) += gunyah.o
>> -gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
>> +gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
>>   obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
>> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
>> index fd890a57172e..84102bac03cc 100644
>> --- a/drivers/virt/gunyah/vm_mgr.c
>> +++ b/drivers/virt/gunyah/vm_mgr.c
>> @@ -18,8 +18,16 @@
>>   static void gh_vm_free(struct work_struct *work)
>>   {
>>       struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>> +    struct gh_vm_mem *mapping, *tmp;
>>       int ret;
>> +    mutex_lock(&ghvm->mm_lock);
>> +    list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, 
>> list) {
>> +        gh_vm_mem_reclaim(ghvm, mapping);
>> +        kfree(mapping);
>> +    }
>> +    mutex_unlock(&ghvm->mm_lock);
>> +
>>       ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>>       if (ret)
>>           pr_warn("Failed to deallocate vmid: %d\n", ret);
>> @@ -48,11 +56,46 @@ static __must_check struct gh_vm 
>> *gh_vm_alloc(struct gh_rm *rm)
>>       ghvm->vmid = vmid;
>>       ghvm->rm = rm;
>> +    mutex_init(&ghvm->mm_lock);
>> +    INIT_LIST_HEAD(&ghvm->memory_mappings);
>>       INIT_WORK(&ghvm->free_work, gh_vm_free);
>>       return ghvm;
>>   }
>> +static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned 
>> long arg)
>> +{
>> +    struct gh_vm *ghvm = filp->private_data;
>> +    void __user *argp = (void __user *)arg;
>> +    long r;
>> +
>> +    switch (cmd) {
>> +    case GH_VM_SET_USER_MEM_REGION: {
>> +        struct gh_userspace_memory_region region;
>> +
>> +        if (copy_from_user(&region, argp, sizeof(region)))
>> +            return -EFAULT;
>> +
>> +        /* All other flag bits are reserved for future use */
>> +        if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | 
>> GH_MEM_ALLOW_EXEC |
>> +            GH_MEM_LENT))
>> +            return -EINVAL;
>> +
>> +
>> +        if (region.memory_size)
> 
> Would there be any value in allowing a zero-size memory
> region to be created?  Maybe that doesn't make sense, but
> I guess I'm questioning whether having a zero memory region size
> carry special meaning in this interface is a good thing to
> do.  You could sensibly have a separate REMOVE_USER_MEM_REGION
> request, and still permit 0 to be a valid size.
> 

I don't think a zero-size memory region makes sense. At best, it only 
registers an empty region with the guest and adds memory overhead for 
bookkeeping.
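
To make the convention concrete, a rough userspace sketch (error
handling omitted; vm_fd is assumed to come from GH_CREATE_VM):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/gunyah.h>

static int set_region(int vm_fd, __u32 label, __u64 gpa, __u64 size,
		      __u64 uaddr)
{
	struct gh_userspace_memory_region region;

	memset(&region, 0, sizeof(region));
	region.label = label;
	region.flags = GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE;
	region.guest_phys_addr = gpa;
	/* size == 0 means: free the previously-registered region with
	 * this label instead of creating one.
	 */
	region.memory_size = size;
	region.userspace_addr = uaddr;

	return ioctl(vm_fd, GH_VM_SET_USER_MEM_REGION, &region);
}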

>> +            r = gh_vm_mem_alloc(ghvm, &region);
>> +        else
>> +            r = gh_vm_mem_free(ghvm, region.label);
>> +        break;
>> +    }
>> +    default:
>> +        r = -ENOTTY;
>> +        break;
>> +    }
>> +
>> +    return r;
>> +}
>> +
>>   static int gh_vm_release(struct inode *inode, struct file *filp)
>>   {
>>       struct gh_vm *ghvm = filp->private_data;
>> @@ -65,6 +108,7 @@ static int gh_vm_release(struct inode *inode, 
>> struct file *filp)
>>   }
>>   static const struct file_operations gh_vm_fops = {
>> +    .unlocked_ioctl = gh_vm_ioctl,
>>       .release = gh_vm_release,
>>       .compat_ioctl    = compat_ptr_ioctl,
>>       .llseek = noop_llseek,
>> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
>> index 76954da706e9..97bc00c34878 100644
>> --- a/drivers/virt/gunyah/vm_mgr.h
>> +++ b/drivers/virt/gunyah/vm_mgr.h
>> @@ -7,16 +7,41 @@
>>   #define _GH_PRIV_VM_MGR_H
>>   #include <linux/gunyah_rsc_mgr.h>
>> +#include <linux/list.h>
>> +#include <linux/miscdevice.h>
>> +#include <linux/mutex.h>
>>   #include <uapi/linux/gunyah.h>
>>   long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, 
>> unsigned long arg);
>> +enum gh_vm_mem_share_type {
>> +    VM_MEM_SHARE,
>> +    VM_MEM_LEND,
> 
> Are there any other share types anticipated?  Even if
> there were, for now you could use a Boolean to distinguish
> between shared or lent (at least until a third option
> materializes).
> 

There is VM_MEM_DONATE. I can add the type, but it's only used by special 
VMs (there's nothing really stopping a generic unauth VM from using it, but 
I don't think anyone will want to).
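
For context, the extension would presumably look something like this
(sketch only, not part of this series):

enum gh_vm_mem_share_type {
	VM_MEM_SHARE,	/* host keeps access alongside the guest */
	VM_MEM_LEND,	/* host loses access while the guest holds it */
	VM_MEM_DONATE,	/* host gives the pages up permanently */
};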

>> +};
>> +
>> +struct gh_vm_mem {
>> +    struct list_head list;
>> +    enum gh_vm_mem_share_type share_type;
>> +    struct gh_rm_mem_parcel parcel;
>> +
>> +    __u64 guest_phys_addr;
>> +    struct page **pages;
>> +    unsigned long npages;
>> +};
>> +
>>   struct gh_vm {
>>       u16 vmid;
>>       struct gh_rm *rm;
>>       struct work_struct free_work;
>> +    struct mutex mm_lock;
>> +    struct list_head memory_mappings;
>>   };
>> +int gh_vm_mem_alloc(struct gh_vm *ghvm, struct 
>> gh_userspace_memory_region *region);
>> +void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
>> +int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
>> +struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
>> +
>>   #endif
>> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c 
>> b/drivers/virt/gunyah/vm_mgr_mm.c
>> new file mode 100644
>> index 000000000000..03e71a36ea3b
>> --- /dev/null
>> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
>> @@ -0,0 +1,235 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All 
>> rights reserved.
>> + */
>> +
>> +#define pr_fmt(fmt) "gh_vm_mgr: " fmt
>> +
>> +#include <linux/gunyah_rsc_mgr.h>
>> +#include <linux/mm.h>
>> +
>> +#include <uapi/linux/gunyah.h>
>> +
>> +#include "vm_mgr.h"
>> +
>> +static inline bool page_contiguous(phys_addr_t p, phys_addr_t t)
> 
> Is there not some existing function that captures this?
> In any case, it's used in one place and I think it would
> be clearer to just put the logic there rather than hiding
> it behind this function.
> 

Done.

>> +{
>> +    return t - p == PAGE_SIZE;
>> +}
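
Presumably the check just gets folded into the coalescing loop in
gh_vm_mem_alloc(), along these lines (a sketch based on the posted loop,
not taken from a posted revision):

	curr_page = page_to_phys(mapping->pages[i]);
	if (curr_page - prev_page == PAGE_SIZE) {
		/* physically contiguous: extend the current entry */
		entry_size += PAGE_SIZE;
	} else {
		/* gap: close the current entry and start a new one */
		mem_entries[j].size = cpu_to_le64(entry_size);
		j++;
		mem_entries[j].ipa_base = cpu_to_le64(curr_page);
		entry_size = PAGE_SIZE;
	}
	prev_page = curr_page;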
>> +
>> +static struct gh_vm_mem *__gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
>> +    __must_hold(&ghvm->mm_lock)
>> +{
>> +    struct gh_vm_mem *mapping;
>> +
>> +    list_for_each_entry(mapping, &ghvm->memory_mappings, list)
>> +        if (mapping->parcel.label == label)
>> +            return mapping;
>> +
>> +    return NULL;
>> +}
>> +
> 
> . . .
> 
>> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
>> index 10ba32d2b0a6..d85d12119a48 100644
>> --- a/include/uapi/linux/gunyah.h
>> +++ b/include/uapi/linux/gunyah.h
>> @@ -20,4 +20,37 @@
>>    */
>>   #define GH_CREATE_VM            _IO(GH_IOCTL_TYPE, 0x0) /* Returns a 
>> Gunyah VM fd */
>> +/*
>> + * ioctls for VM fds
>> + */
>> +
>> +/**
>> + * struct gh_userspace_memory_region - Userspace memory descripion 
>> for GH_VM_SET_USER_MEM_REGION
>> + * @label: Unique identifer to the region.
> 
> Maybe this is described somewhere, but what is the purpose
> of the label?  Who uses it?  Is it meant to be a value
> only the current owner of a resource understands?  Or does
> the resource manager use it internally, or what?
> 

The label is used by the kernel, userspace, and Gunyah. Userspace decides 
all the labels and there are no special labels.

  - Userspace can delete memory parcels by label (the kernel looks up the 
parcel by label)
  - The VM's DTB configuration describes where Gunyah should map memory 
parcels into the guest's memory. The VM DTB uses the memory parcel's label 
as the reference.

Thanks,
Elliot

>> + * @flags: Flags for memory parcel behavior
>> + * @guest_phys_addr: Location of the memory region in guest's memory 
>> space (page-aligned)
>> + * @memory_size: Size of the region (page-aligned)
>> + * @userspace_addr: Location of the memory region in caller 
>> (userspace)'s memory
>> + *
>> + * See Documentation/virt/gunyah/vm-manager.rst for further details.
>> + */
>> +struct gh_userspace_memory_region {
>> +    __u32 label;
> 
> Define the possible permission values separately from
> the structure.
> 
>                      -Alex
> 
>> +#define GH_MEM_ALLOW_READ    (1UL << 0)
>> +#define GH_MEM_ALLOW_WRITE    (1UL << 1)
>> +#define GH_MEM_ALLOW_EXEC    (1UL << 2)
>> +/*
>> + * The guest will be lent the memory instead of shared.
>> + * In other words, the guest has exclusive access to the memory 
>> region and the host loses access.
>> + */
>> +#define GH_MEM_LENT        (1UL << 3)
>> +    __u32 flags;
>> +    __u64 guest_phys_addr;
>> +    __u64 memory_size;
>> +    __u64 userspace_addr;
>> +};
>> +
>> +#define GH_VM_SET_USER_MEM_REGION    _IOW(GH_IOCTL_TYPE, 0x1, \
>> +                        struct gh_userspace_memory_region)
>> +
>>   #endif
>
  
Fuad Tabba Feb. 27, 2023, 9:55 a.m. UTC | #13
Hi,

On Fri, Feb 24, 2023 at 6:08 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
>
>
>
> On 2/24/2023 2:19 AM, Fuad Tabba wrote:
> > Hi,
> >
> > On Tue, Feb 14, 2023 at 9:26 PM Elliot Berman <quic_eberman@quicinc.com> wrote:
> >>
> >>
> >> When launching a virtual machine, Gunyah userspace allocates memory for
> >> the guest and informs Gunyah about these memory regions through
> >> SET_USER_MEMORY_REGION ioctl.
> >
> > I'm working on pKVM [1], and regarding the problem of donating private
> > memory to a guest, we and others working on confidential computing
> > have faced a similar issue that this patch is trying to address. In
> > pKVM, we've initially taken an approach similar to the one here by
> > pinning the pages being donated to prevent swapping or migration [2].
> > However, we've encountered issues with this approach since the memory
> > is still mapped by the host, which could cause the system to crash on
> > an errant access.
> >
> > Instead, we've been working on adopting an fd-based restricted memory
> > approach that was initially proposed for TDX [3] and is now being
> > considered by others in the confidential computing space as well
> > (e.g., Arm CCA [4]). The basic idea is that the host manages the guest
> > memory via a file descriptor instead of a userspace address. It cannot
> > map that memory (unless explicitly shared by the guest [5]),
> > eliminating the possibility of the host trying to access private
> > memory accidentally or being tricked by a malicious actor. This is
> > based on memfd with some restrictions. It handles swapping and
> > migration by disallowing them (for now [6]), and adds a new type of
> > memory region to KVM to accommodate having an fd representing guest
> > memory.
> >
> > Although the fd-based restricted memory isn't upstream yet, we've
> > ported the latest patches to arm64 and made changes and additions to
> > make it work with pKVM, to test it and see if the solution is feasible
> > for us (it is). I wanted to mention this work in case you find it
> > useful, and in the hopes that we can all work on confidential
> > computing using the same interfaces as much as possible.
>
> Thanks for highlighting the memfd_restricted changes to us! We'll
> investigate how/if it can suit Gunyah use cases. It sounds like you
> might've made memfd_restricted changes as well? Are those posted on the
> mailing lists? Also, are example userspace (crosvm?) changes posted?

I have posted kvmtool changes to make it work with memfd_restricted
and pKVM as an RFC [1] (git [2]). I haven't posted the arm64 port, but
it's in a git repo [3]. Chao has a repository with qemu support (TDX)
as well [4].

Eventually, we're likely to have crosvm support as well. If you're
interested, I can keep you CCed on anything we post upstream.

Cheers,
/fuad

[1] https://lore.kernel.org/all/20221202174417.1310826-1-tabba@google.com/
[2] https://android-kvm.googlesource.com/kvmtool/+/refs/heads/tabba/fdmem-v10-core
[3] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/fdmem-v10-core
[4] https://github.com/chao-p/qemu/tree/privmem-v10

>
> Thanks,
> Elliot
>
> >
> > Some comments inline below...
> >
> > Cheers,
> > /fuad
> >
> > [1] https://lore.kernel.org/kvmarm/20220519134204.5379-1-will@kernel.org/
> > [2] https://lore.kernel.org/kvmarm/20220519134204.5379-34-will@kernel.org/
> > [3] https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com/
> > [4] https://lore.kernel.org/lkml/20230127112932.38045-1-steven.price@arm.com/
> > [5] This is a modification we've done for the arm64 port, after
> > discussing it with the original authors.
> > [6] Nothing inherent in the proposal to stop migration and swapping.
> > There are some technical issues that need to be resolved.
> >
> > <snip>
> <snip, looking at comments in parallel>
  

Patch

diff --git a/drivers/virt/gunyah/Makefile b/drivers/virt/gunyah/Makefile
index 03951cf82023..ff8bc4925392 100644
--- a/drivers/virt/gunyah/Makefile
+++ b/drivers/virt/gunyah/Makefile
@@ -2,5 +2,5 @@ 
 
 obj-$(CONFIG_GUNYAH) += gunyah.o
 
-gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o
+gunyah_rsc_mgr-y += rsc_mgr.o rsc_mgr_rpc.o vm_mgr.o vm_mgr_mm.o
 obj-$(CONFIG_GUNYAH) += gunyah_rsc_mgr.o
diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
index fd890a57172e..84102bac03cc 100644
--- a/drivers/virt/gunyah/vm_mgr.c
+++ b/drivers/virt/gunyah/vm_mgr.c
@@ -18,8 +18,16 @@ 
 static void gh_vm_free(struct work_struct *work)
 {
 	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
+	struct gh_vm_mem *mapping, *tmp;
 	int ret;
 
+	mutex_lock(&ghvm->mm_lock);
+	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
+		gh_vm_mem_reclaim(ghvm, mapping);
+		kfree(mapping);
+	}
+	mutex_unlock(&ghvm->mm_lock);
+
 	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
 	if (ret)
 		pr_warn("Failed to deallocate vmid: %d\n", ret);
@@ -48,11 +56,46 @@  static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
 	ghvm->vmid = vmid;
 	ghvm->rm = rm;
 
+	mutex_init(&ghvm->mm_lock);
+	INIT_LIST_HEAD(&ghvm->memory_mappings);
 	INIT_WORK(&ghvm->free_work, gh_vm_free);
 
 	return ghvm;
 }
 
+static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct gh_vm *ghvm = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
+	switch (cmd) {
+	case GH_VM_SET_USER_MEM_REGION: {
+		struct gh_userspace_memory_region region;
+
+		if (copy_from_user(&region, argp, sizeof(region)))
+			return -EFAULT;
+
+		/* All other flag bits are reserved for future use */
+		if (region.flags & ~(GH_MEM_ALLOW_READ | GH_MEM_ALLOW_WRITE | GH_MEM_ALLOW_EXEC |
+			GH_MEM_LENT))
+			return -EINVAL;
+
+
+		if (region.memory_size)
+			r = gh_vm_mem_alloc(ghvm, &region);
+		else
+			r = gh_vm_mem_free(ghvm, region.label);
+		break;
+	}
+	default:
+		r = -ENOTTY;
+		break;
+	}
+
+	return r;
+}
+
 static int gh_vm_release(struct inode *inode, struct file *filp)
 {
 	struct gh_vm *ghvm = filp->private_data;
@@ -65,6 +108,7 @@  static int gh_vm_release(struct inode *inode, struct file *filp)
 }
 
 static const struct file_operations gh_vm_fops = {
+	.unlocked_ioctl = gh_vm_ioctl,
 	.release = gh_vm_release,
 	.compat_ioctl	= compat_ptr_ioctl,
 	.llseek = noop_llseek,
diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
index 76954da706e9..97bc00c34878 100644
--- a/drivers/virt/gunyah/vm_mgr.h
+++ b/drivers/virt/gunyah/vm_mgr.h
@@ -7,16 +7,41 @@ 
 #define _GH_PRIV_VM_MGR_H
 
 #include <linux/gunyah_rsc_mgr.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
 
 #include <uapi/linux/gunyah.h>
 
 long gh_dev_vm_mgr_ioctl(struct gh_rm *rm, unsigned int cmd, unsigned long arg);
 
+enum gh_vm_mem_share_type {
+	VM_MEM_SHARE,
+	VM_MEM_LEND,
+};
+
+struct gh_vm_mem {
+	struct list_head list;
+	enum gh_vm_mem_share_type share_type;
+	struct gh_rm_mem_parcel parcel;
+
+	__u64 guest_phys_addr;
+	struct page **pages;
+	unsigned long npages;
+};
+
 struct gh_vm {
 	u16 vmid;
 	struct gh_rm *rm;
 
 	struct work_struct free_work;
+	struct mutex mm_lock;
+	struct list_head memory_mappings;
 };
 
+int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region);
+void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
+int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
+struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
+
 #endif
diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
new file mode 100644
index 000000000000..03e71a36ea3b
--- /dev/null
+++ b/drivers/virt/gunyah/vm_mgr_mm.c
@@ -0,0 +1,235 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#define pr_fmt(fmt) "gh_vm_mgr: " fmt
+
+#include <linux/gunyah_rsc_mgr.h>
+#include <linux/mm.h>
+
+#include <uapi/linux/gunyah.h>
+
+#include "vm_mgr.h"
+
+static inline bool page_contiguous(phys_addr_t p, phys_addr_t t)
+{
+	return t - p == PAGE_SIZE;
+}
+
+static struct gh_vm_mem *__gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
+	__must_hold(&ghvm->mm_lock)
+{
+	struct gh_vm_mem *mapping;
+
+	list_for_each_entry(mapping, &ghvm->memory_mappings, list)
+		if (mapping->parcel.label == label)
+			return mapping;
+
+	return NULL;
+}
+
+void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
+	__must_hold(&ghvm->mm_lock)
+{
+	int i, ret = 0;
+
+	if (mapping->parcel.mem_handle != GH_MEM_HANDLE_INVAL) {
+		ret = gh_rm_mem_reclaim(ghvm->rm, &mapping->parcel);
+		if (ret)
+			pr_warn("Failed to reclaim memory parcel for label %d: %d\n",
+				mapping->parcel.label, ret);
+	}
+
+	if (!ret)
+		for (i = 0; i < mapping->npages; i++)
+			unpin_user_page(mapping->pages[i]);
+
+	kfree(mapping->pages);
+	kfree(mapping->parcel.acl_entries);
+	kfree(mapping->parcel.mem_entries);
+
+	list_del(&mapping->list);
+}
+
+struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
+{
+	struct gh_vm_mem *mapping;
+	int ret;
+
+	ret = mutex_lock_interruptible(&ghvm->mm_lock);
+	if (ret)
+		return ERR_PTR(ret);
+	mapping = __gh_vm_mem_find(ghvm, label);
+	mutex_unlock(&ghvm->mm_lock);
+	return mapping ? : ERR_PTR(-ENODEV);
+}
+
+int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *region)
+{
+	struct gh_vm_mem *mapping, *tmp_mapping;
+	struct gh_rm_mem_entry *mem_entries;
+	phys_addr_t curr_page, prev_page;
+	struct gh_rm_mem_parcel *parcel;
+	int i, j, pinned, ret = 0;
+	size_t entry_size;
+	u16 vmid;
+
+	if (!gh_api_has_feature(GH_API_FEATURE_MEMEXTENT))
+		return -EOPNOTSUPP;
+
+	if (!region->memory_size || !PAGE_ALIGNED(region->memory_size) ||
+		!PAGE_ALIGNED(region->userspace_addr) || !PAGE_ALIGNED(region->guest_phys_addr))
+		return -EINVAL;
+
+	ret = mutex_lock_interruptible(&ghvm->mm_lock);
+	if (ret)
+		return ret;
+	mapping = __gh_vm_mem_find(ghvm, region->label);
+	if (mapping) {
+		mutex_unlock(&ghvm->mm_lock);
+		return -EEXIST;
+	}
+
+	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+	if (!mapping) {
+		ret = -ENOMEM;
+		goto free_mapping;
+	}
+
+	mapping->parcel.label = region->label;
+	mapping->guest_phys_addr = region->guest_phys_addr;
+	mapping->npages = region->memory_size >> PAGE_SHIFT;
+	parcel = &mapping->parcel;
+	parcel->mem_handle = GH_MEM_HANDLE_INVAL; /* to be filled later by mem_share/mem_lend */
+	parcel->mem_type = GH_RM_MEM_TYPE_NORMAL;
+
+	/* Check for overlap */
+	list_for_each_entry(tmp_mapping, &ghvm->memory_mappings, list) {
+		if (!((mapping->guest_phys_addr + (mapping->npages << PAGE_SHIFT) <=
+			tmp_mapping->guest_phys_addr) ||
+			(mapping->guest_phys_addr >=
+			tmp_mapping->guest_phys_addr + (tmp_mapping->npages << PAGE_SHIFT)))) {
+			ret = -EEXIST;
+			goto free_mapping;
+		}
+	}
+
+	list_add(&mapping->list, &ghvm->memory_mappings);
+
+	mapping->pages = kcalloc(mapping->npages, sizeof(*mapping->pages), GFP_KERNEL);
+	if (!mapping->pages) {
+		ret = -ENOMEM;
+		mapping->npages = 0; /* update npages for reclaim */
+		goto reclaim;
+	}
+
+	pinned = pin_user_pages_fast(region->userspace_addr, mapping->npages,
+					FOLL_WRITE | FOLL_LONGTERM, mapping->pages);
+	if (pinned < 0) {
+		ret = pinned;
+		mapping->npages = 0; /* update npages for reclaim */
+		goto reclaim;
+	} else if (pinned != mapping->npages) {
+		ret = -EFAULT;
+		mapping->npages = pinned; /* update npages for reclaim */
+		goto reclaim;
+	}
+
+	if (region->flags & GH_MEM_LENT) {
+		parcel->n_acl_entries = 1;
+		mapping->share_type = VM_MEM_LEND;
+	} else {
+		parcel->n_acl_entries = 2;
+		mapping->share_type = VM_MEM_SHARE;
+	}
+	parcel->acl_entries = kcalloc(parcel->n_acl_entries, sizeof(*parcel->acl_entries),
+					GFP_KERNEL);
+	if (!parcel->acl_entries) {
+		ret = -ENOMEM;
+		goto reclaim;
+	}
+
+	parcel->acl_entries[0].vmid = cpu_to_le16(ghvm->vmid);
+	if (region->flags & GH_MEM_ALLOW_READ)
+		parcel->acl_entries[0].perms |= GH_RM_ACL_R;
+	if (region->flags & GH_MEM_ALLOW_WRITE)
+		parcel->acl_entries[0].perms |= GH_RM_ACL_W;
+	if (region->flags & GH_MEM_ALLOW_EXEC)
+		parcel->acl_entries[0].perms |= GH_RM_ACL_X;
+
+	if (mapping->share_type == VM_MEM_SHARE) {
+		ret = gh_rm_get_vmid(ghvm->rm, &vmid);
+		if (ret)
+			goto reclaim;
+
+		parcel->acl_entries[1].vmid = cpu_to_le16(vmid);
+		/* Host assumed to have all these permissions. Gunyah will not
+		 * grant new permissions if host actually had less than RWX
+		 */
+		parcel->acl_entries[1].perms |= GH_RM_ACL_R | GH_RM_ACL_W | GH_RM_ACL_X;
+	}
+
+	mem_entries = kcalloc(mapping->npages, sizeof(*mem_entries), GFP_KERNEL);
+	if (!mem_entries) {
+		ret = -ENOMEM;
+		goto reclaim;
+	}
+
+	/* reduce number of entries by combining contiguous pages into single memory entry */
+	prev_page = page_to_phys(mapping->pages[0]);
+	mem_entries[0].ipa_base = cpu_to_le64(prev_page);
+	entry_size = PAGE_SIZE;
+	for (i = 1, j = 0; i < mapping->npages; i++) {
+		curr_page = page_to_phys(mapping->pages[i]);
+		if (page_contiguous(prev_page, curr_page)) {
+			entry_size += PAGE_SIZE;
+		} else {
+			mem_entries[j].size = cpu_to_le64(entry_size);
+			j++;
+			mem_entries[j].ipa_base = cpu_to_le64(curr_page);
+			entry_size = PAGE_SIZE;
+		}
+
+		prev_page = curr_page;
+	}
+	mem_entries[j].size = cpu_to_le64(entry_size);
+
+	parcel->n_mem_entries = j + 1;
+	parcel->mem_entries = kmemdup(mem_entries, sizeof(*mem_entries) * parcel->n_mem_entries,
+					GFP_KERNEL);
+	kfree(mem_entries);
+	if (!parcel->mem_entries) {
+		ret = -ENOMEM;
+		goto reclaim;
+	}
+
+	mutex_unlock(&ghvm->mm_lock);
+	return 0;
+reclaim:
+	gh_vm_mem_reclaim(ghvm, mapping);
+free_mapping:
+	kfree(mapping);
+	mutex_unlock(&ghvm->mm_lock);
+	return ret;
+}
+
+int gh_vm_mem_free(struct gh_vm *ghvm, u32 label)
+{
+	struct gh_vm_mem *mapping;
+	int ret;
+
+	ret = mutex_lock_interruptible(&ghvm->mm_lock);
+	if (ret)
+		return ret;
+
+	mapping = __gh_vm_mem_find(ghvm, label);
+	if (!mapping)
+		goto out;
+
+	gh_vm_mem_reclaim(ghvm, mapping);
+	kfree(mapping);
+out:
+	mutex_unlock(&ghvm->mm_lock);
+	return ret;
+}
diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
index 10ba32d2b0a6..d85d12119a48 100644
--- a/include/uapi/linux/gunyah.h
+++ b/include/uapi/linux/gunyah.h
@@ -20,4 +20,37 @@ 
  */
 #define GH_CREATE_VM			_IO(GH_IOCTL_TYPE, 0x0) /* Returns a Gunyah VM fd */
 
+/*
+ * ioctls for VM fds
+ */
+
+/**
+ * struct gh_userspace_memory_region - Userspace memory descripion for GH_VM_SET_USER_MEM_REGION
+ * @label: Unique identifer to the region.
+ * @flags: Flags for memory parcel behavior
+ * @guest_phys_addr: Location of the memory region in guest's memory space (page-aligned)
+ * @memory_size: Size of the region (page-aligned)
+ * @userspace_addr: Location of the memory region in caller (userspace)'s memory
+ *
+ * See Documentation/virt/gunyah/vm-manager.rst for further details.
+ */
+struct gh_userspace_memory_region {
+	__u32 label;
+#define GH_MEM_ALLOW_READ	(1UL << 0)
+#define GH_MEM_ALLOW_WRITE	(1UL << 1)
+#define GH_MEM_ALLOW_EXEC	(1UL << 2)
+/*
+ * The guest will be lent the memory instead of shared.
+ * In other words, the guest has exclusive access to the memory region and the host loses access.
+ */
+#define GH_MEM_LENT		(1UL << 3)
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size;
+	__u64 userspace_addr;
+};
+
+#define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
+						struct gh_userspace_memory_region)
+
 #endif