[v11,13/26] gunyah: vm_mgr: Add ioctls to support basic non-proxy VM boot

Message ID 20230304010632.2127470-14-quic_eberman@quicinc.com
State New
Headers
Series Drivers for gunyah hypervisor |

Commit Message

Elliot Berman March 4, 2023, 1:06 a.m. UTC
  Add remaining ioctls to support non-proxy VM boot:

 - Gunyah Resource Manager uses the VM's devicetree to configure the
   virtual machine. The location of the devicetree in the guest's
   virtual memory can be declared via the SET_DTB_CONFIG ioctl.
 - Trigger start of the virtual machine with VM_START ioctl.

Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 drivers/virt/gunyah/vm_mgr.c    | 243 ++++++++++++++++++++++++++++++--
 drivers/virt/gunyah/vm_mgr.h    |  10 ++
 drivers/virt/gunyah/vm_mgr_mm.c |  23 +++
 include/linux/gunyah_rsc_mgr.h  |   6 +
 include/uapi/linux/gunyah.h     |  13 ++
 5 files changed, 282 insertions(+), 13 deletions(-)
  

Comments

Srinivas Kandagatla March 21, 2023, 2:24 p.m. UTC | #1
On 04/03/2023 01:06, Elliot Berman wrote:
> Add remaining ioctls to support non-proxy VM boot:
> 
>   - Gunyah Resource Manager uses the VM's devicetree to configure the
>     virtual machine. The location of the devicetree in the guest's
>     virtual memory can be declared via the SET_DTB_CONFIG ioctl.
>   - Trigger start of the virtual machine with VM_START ioctl.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>   drivers/virt/gunyah/vm_mgr.c    | 243 ++++++++++++++++++++++++++++++--
>   drivers/virt/gunyah/vm_mgr.h    |  10 ++
>   drivers/virt/gunyah/vm_mgr_mm.c |  23 +++
>   include/linux/gunyah_rsc_mgr.h  |   6 +
>   include/uapi/linux/gunyah.h     |  13 ++
>   5 files changed, 282 insertions(+), 13 deletions(-)
> 

...

> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index a19207e3e065..d6abd8605a2e 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -49,4 +49,17 @@ struct gh_userspace_memory_region {
>   #define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
>   						struct gh_userspace_memory_region)
>   
> +/**
> + * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
> + * @guest_phys_addr: Address of the VM's devicetree in guest memory.
> + * @size: Maximum size of the devicetree.
> + */
> +struct gh_vm_dtb_config {
> +	__u64 guest_phys_addr;
> +	__u64 size;
> +};
> +#define GH_VM_SET_DTB_CONFIG	_IOW(GH_IOCTL_TYPE, 0x2, struct gh_vm_dtb_config)
> +
> +#define GH_VM_START		_IO(GH_IOCTL_TYPE, 0x3)
A comment here that this is going to *ONLY* start an un-authenticated VM 
would be useful to the users.

with that fixed,

Reviewed-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>


--srini


> +
>   #endif
  
Alex Elder March 31, 2023, 2:26 p.m. UTC | #2
On 3/3/23 7:06 PM, Elliot Berman wrote:
> Add remaining ioctls to support non-proxy VM boot:
> 
>   - Gunyah Resource Manager uses the VM's devicetree to configure the
>     virtual machine. The location of the devicetree in the guest's
>     virtual memory can be declared via the SET_DTB_CONFIG ioctl.
>   - Trigger start of the virtual machine with VM_START ioctl.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>

I identify one bug here, possibly another.  And I have a few
suggestions about things that could improve code readability.

					-Alex

> ---
>   drivers/virt/gunyah/vm_mgr.c    | 243 ++++++++++++++++++++++++++++++--
>   drivers/virt/gunyah/vm_mgr.h    |  10 ++
>   drivers/virt/gunyah/vm_mgr_mm.c |  23 +++
>   include/linux/gunyah_rsc_mgr.h  |   6 +
>   include/uapi/linux/gunyah.h     |  13 ++
>   5 files changed, 282 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
> index e950274c6a53..299b9bb81edc 100644
> --- a/drivers/virt/gunyah/vm_mgr.c
> +++ b/drivers/virt/gunyah/vm_mgr.c
> @@ -9,37 +9,118 @@
>   #include <linux/file.h>
>   #include <linux/gunyah_rsc_mgr.h>
>   #include <linux/miscdevice.h>
> +#include <linux/mm.h>
>   #include <linux/module.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
>   #include "vm_mgr.h"
>   
> +static int gh_vm_rm_notification_status(struct gh_vm *ghvm, void *data)
> +{
> +	struct gh_rm_vm_status_payload *payload = data;
> +
> +	if (payload->vmid != ghvm->vmid)
> +		return NOTIFY_OK;
> +
> +	/* All other state transitions are synchronous to a corresponding RM call */
> +	if (payload->vm_status == GH_RM_VM_STATUS_RESET) {
> +		down_write(&ghvm->status_lock);
> +		ghvm->vm_status = payload->vm_status;
> +		up_write(&ghvm->status_lock);
> +		wake_up(&ghvm->vm_status_wait);
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static int gh_vm_rm_notification_exited(struct gh_vm *ghvm, void *data)
> +{
> +	struct gh_rm_vm_exited_payload *payload = data;
> +
> +	if (payload->vmid != ghvm->vmid)
> +		return NOTIFY_OK;
> +
> +	down_write(&ghvm->status_lock);
> +	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
> +	up_write(&ghvm->status_lock);
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static int gh_vm_rm_notification(struct notifier_block *nb, unsigned long action, void *data)
> +{
> +	struct gh_vm *ghvm = container_of(nb, struct gh_vm, nb);
> +
> +	switch (action) {
> +	case GH_RM_NOTIFICATION_VM_STATUS:
> +		return gh_vm_rm_notification_status(ghvm, data);
> +	case GH_RM_NOTIFICATION_VM_EXITED:
> +		return gh_vm_rm_notification_exited(ghvm, data);
> +	default:
> +		return NOTIFY_OK;
> +	}
> +}
> +
> +static void gh_vm_stop(struct gh_vm *ghvm)
> +{
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status == GH_RM_VM_STATUS_RUNNING) {
> +		ret = gh_rm_vm_stop(ghvm->rm, ghvm->vmid);
> +		if (ret)
> +			dev_warn(ghvm->parent, "Failed to stop VM: %d\n", ret);
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
> +	up_write(&ghvm->status_lock);
> +}
> +
>   static void gh_vm_free(struct work_struct *work)
>   {
>   	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>   	struct gh_vm_mem *mapping, *tmp;
>   	int ret;
>   
> -	mutex_lock(&ghvm->mm_lock);
> -	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> -		gh_vm_mem_reclaim(ghvm, mapping);
> -		kfree(mapping);
> -	}
> -	mutex_unlock(&ghvm->mm_lock);
> -
> -	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> -	if (ret)
> -		pr_warn("Failed to deallocate vmid: %d\n", ret);
> +	switch (ghvm->vm_status) {
> +	case GH_RM_VM_STATUS_RUNNING:
> +		gh_vm_stop(ghvm);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_INIT_FAILED:
> +	case GH_RM_VM_STATUS_LOAD:
> +	case GH_RM_VM_STATUS_EXITED:
> +		mutex_lock(&ghvm->mm_lock);
> +		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> +			gh_vm_mem_reclaim(ghvm, mapping);
> +			kfree(mapping);
> +		}
> +		mutex_unlock(&ghvm->mm_lock);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_NO_STATE:
> +		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> +		if (ret)
> +			dev_warn(ghvm->parent, "Failed to deallocate vmid: %d\n", ret);
> +
> +		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);

I think you should unregister the notifier before you
deallocate the VMID.  I think the notifier might be able
to use the VMID.

> +		gh_rm_put(ghvm->rm);
> +		kfree(ghvm);
> +		break;
> +	default:
> +		dev_err(ghvm->parent, "VM is unknown state: %d. VM will not be cleaned up.\n",
> +			ghvm->vm_status);
>   
> -	put_gh_rm(ghvm->rm);
> -	kfree(ghvm);
> +		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
> +		gh_rm_put(ghvm->rm);
> +		kfree(ghvm);
> +		break;
> +	}
>   }
>   
>   static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   {
>   	struct gh_vm *ghvm;
> -	int vmid;
> +	int vmid, ret;
>   
>   	vmid = gh_rm_alloc_vmid(rm, 0);
>   	if (vmid < 0)
> @@ -55,13 +136,130 @@ static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   	ghvm->vmid = vmid;
>   	ghvm->rm = rm;
>   
> +	init_waitqueue_head(&ghvm->vm_status_wait);
> +	ghvm->nb.notifier_call = gh_vm_rm_notification;
> +	ret = gh_rm_notifier_register(rm, &ghvm->nb);
> +	if (ret) {
> +		gh_rm_put(rm);
> +		gh_rm_dealloc_vmid(rm, vmid);
> +		kfree(ghvm);
> +		return ERR_PTR(ret);
> +	}
> +
>   	mutex_init(&ghvm->mm_lock);
>   	INIT_LIST_HEAD(&ghvm->memory_mappings);
> +	init_rwsem(&ghvm->status_lock);
>   	INIT_WORK(&ghvm->free_work, gh_vm_free);
> +	ghvm->vm_status = GH_RM_VM_STATUS_LOAD;
>   
>   	return ghvm;
>   }
>   
> +static int gh_vm_start(struct gh_vm *ghvm)
> +{
> +	struct gh_vm_mem *mapping;
> +	u64 dtb_offset;
> +	u32 mem_handle;
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
> +		up_write(&ghvm->status_lock);
> +		return 0;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
> +
> +	mutex_lock(&ghvm->mm_lock);
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
> +		switch (mapping->share_type) {
> +		case VM_MEM_LEND:
> +			ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
> +			break;
> +		case VM_MEM_SHARE:
> +			ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
> +			break;
> +		}
> +		if (ret) {
> +			dev_warn(ghvm->parent, "Failed to %s parcel %d: %d\n",
> +				mapping->share_type == VM_MEM_LEND ? "lend" : "share",
> +				mapping->parcel.label,
> +				ret);
> +			goto err;
> +		}
> +	}
> +	mutex_unlock(&ghvm->mm_lock);
> +
> +	mapping = gh_vm_mem_find_by_addr(ghvm, ghvm->dtb_config.guest_phys_addr,
> +					ghvm->dtb_config.size);
> +	if (!mapping) {
> +		dev_warn(ghvm->parent, "Failed to find the memory_handle for DTB\n");
> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	mem_handle = mapping->parcel.mem_handle;
> +	dtb_offset = ghvm->dtb_config.guest_phys_addr - mapping->guest_phys_addr;
> +
> +	ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, mem_handle,
> +				0, 0, dtb_offset, ghvm->dtb_config.size);
> +	if (ret) {
> +		dev_warn(ghvm->parent, "Failed to configure VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ret = gh_rm_vm_init(ghvm->rm, ghvm->vmid);
> +	if (ret) {
> +		dev_warn(ghvm->parent, "Failed to initialize VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ret = gh_rm_vm_start(ghvm->rm, ghvm->vmid);
> +	if (ret) {
> +		dev_warn(ghvm->parent, "Failed to start VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RUNNING;
> +	up_write(&ghvm->status_lock);
> +	return ret;
> +err:
> +	ghvm->vm_status = GH_RM_VM_STATUS_INIT_FAILED;
> +	/* gh_vm_free will handle releasing resources and reclaiming memory */
> +	up_write(&ghvm->status_lock);
> +	return ret;
> +}
> +
> +static int gh_vm_ensure_started(struct gh_vm *ghvm)
> +{
> +	int ret;
> +
> +	ret = down_read_interruptible(&ghvm->status_lock);
> +	if (ret)
> +		return ret;
> +
> +	/* Unlikely because VM is typically started */
> +	if (unlikely(ghvm->vm_status == GH_RM_VM_STATUS_LOAD)) {
> +		up_read(&ghvm->status_lock);
> +		ret = gh_vm_start(ghvm);
> +		if (ret)
> +			goto out;

You have already released the status lock at this point.
So going to "out" will do it again.  This is a BUG.

I think you just want to return ret, and you don't need
the "out" error path.


> +		/** gh_vm_start() is guaranteed to bring status out of
> +		 * GH_RM_VM_STATUS_LOAD, thus inifitely recursive call is not
> +		 * possible
> +		 */
> +		return gh_vm_ensure_started(ghvm);
> +	}
> +
> +	/* Unlikely because VM is typically running */
> +	if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_RUNNING))
> +		ret = -ENODEV;
> +
> +out:
> +	up_read(&ghvm->status_lock);
> +	return ret;
> +}
> +
>   static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	struct gh_vm *ghvm = filp->private_data;
> @@ -85,6 +283,25 @@ static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   		r = gh_vm_mem_alloc(ghvm, &region);
>   		break;
>   	}
> +	case GH_VM_SET_DTB_CONFIG: {
> +		struct gh_vm_dtb_config dtb_config;
> +
> +		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
> +			return -EFAULT;
> +

It's clear that the base of the DTB does not need to be
page aligned.  But why do you round up the size to a
page boundary?  (It might be the "extra for overlay"
I comment on elsewhere, but even if so, it's worth
mentioning this.)

> +		dtb_config.size = PAGE_ALIGN(dtb_config.size);
> +		if (dtb_config.guest_phys_addr + dtb_config.size < dtb_config.guest_phys_addr)
> +			return -EOVERFLOW;
> +
> +		ghvm->dtb_config = dtb_config;
> +
> +		r = 0;
> +		break;
> +	}
> +	case GH_VM_START: {
> +		r = gh_vm_ensure_started(ghvm);
> +		break;
> +	}
>   	default:
>   		r = -ENOTTY;
>   		break;
> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
> index c9f6fa5478ed..26bcc2ae4478 100644
> --- a/drivers/virt/gunyah/vm_mgr.h
> +++ b/drivers/virt/gunyah/vm_mgr.h
> @@ -10,6 +10,8 @@
>   #include <linux/list.h>
>   #include <linux/miscdevice.h>
>   #include <linux/mutex.h>
> +#include <linux/rwsem.h>
> +#include <linux/wait.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
> @@ -34,6 +36,13 @@ struct gh_vm {
>   	u16 vmid;
>   	struct gh_rm *rm;
>   	struct device *parent;
> +	enum gh_rm_vm_auth_mechanism auth;
> +	struct gh_vm_dtb_config dtb_config;
> +
> +	struct notifier_block nb;
> +	enum gh_rm_vm_status vm_status;
> +	wait_queue_head_t vm_status_wait;
> +	struct rw_semaphore status_lock;
>   
>   	struct work_struct free_work;
>   	struct mutex mm_lock;
> @@ -44,5 +53,6 @@ int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *regio
>   void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
>   int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
>   struct gh_vm_mem *gh_vm_mem_find_by_label(struct gh_vm *ghvm, u32 label);
> +struct gh_vm_mem *gh_vm_mem_find_by_addr(struct gh_vm *ghvm, u64 guest_phys_addr, u32 size);
>   
>   #endif
> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
> index db6f55cef37f..6e1d2e8bddb7 100644
> --- a/drivers/virt/gunyah/vm_mgr_mm.c
> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
> @@ -47,6 +47,29 @@ void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
>   	list_del(&mapping->list);
>   }
>   

I think you should call this gh_vm_mem_find_mapping().  You are
finding the mapping that contains the given range.

> +struct gh_vm_mem *gh_vm_mem_find_by_addr(struct gh_vm *ghvm, u64 guest_phys_addr, u32 size)
> +{
> +	struct gh_vm_mem *mapping = NULL;
> +	int ret;
> +
> +	ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +	if (ret)
> +		return ERR_PTR(ret);
> +

I think you could slightly modify this, and replace this loop and
the one in gh_vm_mem_alloc() with a call to a helper function.

What I suggest is that you define a function like
__gh_vm_mem_overlap(ghvm, offset, size).  It would return a
gh_vm_mem pointer if it found any mapping that overlapped
the range, or null if none is found.

Here, you could then check the returned result to ensure
the entire range fits within it.

And in gh_vm_mem_alloc() you could simply use the result
to indicate that you can't allocate the range (because
at least one existing range covers all or part of the new
one).

> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
> +		if (guest_phys_addr >= mapping->guest_phys_addr &&
> +			(guest_phys_addr + size <= mapping->guest_phys_addr +
> +			(mapping->npages << PAGE_SHIFT))) {
> +			goto unlock;
> +		}
> +	}
> +
> +	mapping = NULL;
> +unlock:
> +	mutex_unlock(&ghvm->mm_lock);
> +	return mapping;
> +}
> +
>   struct gh_vm_mem *gh_vm_mem_find_by_label(struct gh_vm *ghvm, u32 label)
>   {
>   	struct gh_vm_mem *mapping;
> diff --git a/include/linux/gunyah_rsc_mgr.h b/include/linux/gunyah_rsc_mgr.h
> index 88a429dad09e..8b0b46f28e39 100644
> --- a/include/linux/gunyah_rsc_mgr.h
> +++ b/include/linux/gunyah_rsc_mgr.h
> @@ -29,6 +29,12 @@ struct gh_rm_vm_exited_payload {
>   #define GH_RM_NOTIFICATION_VM_EXITED		 0x56100001
>   
>   enum gh_rm_vm_status {
> +	/**
> +	 * RM doesn't have a state where load partially failed because
> +	 * only Linux

I have no idea what the comment above means...  Please fix.

Several of the values below are never explicitly assigned,
and some are used but not assigned.  The others apparently
might come back from the resource manager?  Why, for
example, are the PAUSED, AUTH, and RESETTING statuses
defined if we don't use them?

> +	 */
> +	GH_RM_VM_STATUS_LOAD_FAILED	= -1,
> +
>   	GH_RM_VM_STATUS_NO_STATE	= 0,
>   	GH_RM_VM_STATUS_INIT		= 1,
>   	GH_RM_VM_STATUS_READY		= 2,
> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index a19207e3e065..d6abd8605a2e 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -49,4 +49,17 @@ struct gh_userspace_memory_region {
>   #define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
>   						struct gh_userspace_memory_region)
>   
> +/**
> + * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
> + * @guest_phys_addr: Address of the VM's devicetree in guest memory.
> + * @size: Maximum size of the devicetree.
> + */
> +struct gh_vm_dtb_config {
> +	__u64 guest_phys_addr;
> +	__u64 size;
> +};
> +#define GH_VM_SET_DTB_CONFIG	_IOW(GH_IOCTL_TYPE, 0x2, struct gh_vm_dtb_config)
> +
> +#define GH_VM_START		_IO(GH_IOCTL_TYPE, 0x3)
> +
>   #endif
  
Elliot Berman April 11, 2023, 9:07 p.m. UTC | #3
On 3/21/2023 7:24 AM, Srinivas Kandagatla wrote:

> On 04/03/2023 01:06, Elliot Berman wrote:
>> +
>> +#define GH_VM_START        _IO(GH_IOCTL_TYPE, 0x3)
> A comment here that this is going to *ONLY* start an un-authenticated VM 
> would be useful to the users.
> 

There is only support for unauthenticated VM in the UAPI being proposed 
and I'd like to re-use GH_VM_START ioctl for other VM types as well. Is 
the comment really useful? I can easily see forgetting to remove the 
comment and then being more confusing once the other VM types get added.

Thanks,
Elliot
  
Alex Elder April 11, 2023, 9:09 p.m. UTC | #4
On 4/11/23 4:07 PM, Elliot Berman wrote:
> 
> 
> On 3/21/2023 7:24 AM, Srinivas Kandagatla wrote:
> 
>> On 04/03/2023 01:06, Elliot Berman wrote:
>>> +
>>> +#define GH_VM_START        _IO(GH_IOCTL_TYPE, 0x3)
>> A comment here that this is going to *ONLY* start an un-authenticated 
>> VM would be useful to the users.
>>
> 
> There is only support for unauthenticated VM in the UAPI being proposed 
> and I'd like to re-use GH_VM_START ioctl for other VM types as well. Is 
> the comment really useful? I can easily see forgetting to remove the 
> comment and then being more confusing once the other VM types get added.

It's up to you.  And in general, I think your responses to my
comments have been fine--even when you just explain why you
don't plan to implement my suggestion.  Thank you.

					-Alex

> 
> Thanks,
> Elliot
  
Elliot Berman April 11, 2023, 9:16 p.m. UTC | #5
On 3/31/2023 7:26 AM, Alex Elder wrote:
> On 3/3/23 7:06 PM, Elliot Berman wrote:
>> diff --git a/include/linux/gunyah_rsc_mgr.h 
>> b/include/linux/gunyah_rsc_mgr.h
>> index 88a429dad09e..8b0b46f28e39 100644
>> --- a/include/linux/gunyah_rsc_mgr.h
>> +++ b/include/linux/gunyah_rsc_mgr.h
>> @@ -29,6 +29,12 @@ struct gh_rm_vm_exited_payload {
>>   #define GH_RM_NOTIFICATION_VM_EXITED         0x56100001
>>   enum gh_rm_vm_status {
>> +    /**
>> +     * RM doesn't have a state where load partially failed because
>> +     * only Linux
> 
> I have no idea what the comment above means...  Please fix.
> 
> Several of the values below are never explicitly assigned,
> and some are used but not assigned.  The others apparently
> might come back from the resource manager?  Why, for
> example, are the PAUSED, AUTH, and RESETTING statuses
> defined if we don't use them?
> 

I ended up no longer needing VM_STATUS_LOAD_FAILED.

The other status values are defined by Gunyah resource manager. RM will 
notify us about the state transitions.

Some of the state transitions can be inferred by Linux directly. For 
instance, gh_rm_vm_init() will transition the VM from 
GH_RM_VM_STATUS_INIT to GH_RM_VM_STATUS_READY iff it returns 
successfully. There is one instance where we wait for VM to exit during 
the VM teardown as well.

Thanks,
Elliot

>> +     */
>> +    GH_RM_VM_STATUS_LOAD_FAILED    = -1,
>> +
>>       GH_RM_VM_STATUS_NO_STATE    = 0,
>>       GH_RM_VM_STATUS_INIT        = 1,
>>       GH_RM_VM_STATUS_READY        = 2,
>> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
>> index a19207e3e065..d6abd8605a2e 100644
>> --- a/include/uapi/linux/gunyah.h
>> +++ b/include/uapi/linux/gunyah.h
>> @@ -49,4 +49,17 @@ struct gh_userspace_memory_region {
>>   #define GH_VM_SET_USER_MEM_REGION    _IOW(GH_IOCTL_TYPE, 0x1, \
>>                           struct gh_userspace_memory_region)
>> +/**
>> + * struct gh_vm_dtb_config - Set the location of the VM's devicetree 
>> blob
>> + * @guest_phys_addr: Address of the VM's devicetree in guest memory.
>> + * @size: Maximum size of the devicetree.
>> + */
>> +struct gh_vm_dtb_config {
>> +    __u64 guest_phys_addr;
>> +    __u64 size;
>> +};
>> +#define GH_VM_SET_DTB_CONFIG    _IOW(GH_IOCTL_TYPE, 0x2, struct 
>> gh_vm_dtb_config)
>> +
>> +#define GH_VM_START        _IO(GH_IOCTL_TYPE, 0x3)
>> +
>>   #endif
>
  

Patch

diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
index e950274c6a53..299b9bb81edc 100644
--- a/drivers/virt/gunyah/vm_mgr.c
+++ b/drivers/virt/gunyah/vm_mgr.c
@@ -9,37 +9,118 @@ 
 #include <linux/file.h>
 #include <linux/gunyah_rsc_mgr.h>
 #include <linux/miscdevice.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 
 #include <uapi/linux/gunyah.h>
 
 #include "vm_mgr.h"
 
+static int gh_vm_rm_notification_status(struct gh_vm *ghvm, void *data)
+{
+	struct gh_rm_vm_status_payload *payload = data;
+
+	if (payload->vmid != ghvm->vmid)
+		return NOTIFY_OK;
+
+	/* All other state transitions are synchronous to a corresponding RM call */
+	if (payload->vm_status == GH_RM_VM_STATUS_RESET) {
+		down_write(&ghvm->status_lock);
+		ghvm->vm_status = payload->vm_status;
+		up_write(&ghvm->status_lock);
+		wake_up(&ghvm->vm_status_wait);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int gh_vm_rm_notification_exited(struct gh_vm *ghvm, void *data)
+{
+	struct gh_rm_vm_exited_payload *payload = data;
+
+	if (payload->vmid != ghvm->vmid)
+		return NOTIFY_OK;
+
+	down_write(&ghvm->status_lock);
+	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
+	up_write(&ghvm->status_lock);
+
+	return NOTIFY_DONE;
+}
+
+static int gh_vm_rm_notification(struct notifier_block *nb, unsigned long action, void *data)
+{
+	struct gh_vm *ghvm = container_of(nb, struct gh_vm, nb);
+
+	switch (action) {
+	case GH_RM_NOTIFICATION_VM_STATUS:
+		return gh_vm_rm_notification_status(ghvm, data);
+	case GH_RM_NOTIFICATION_VM_EXITED:
+		return gh_vm_rm_notification_exited(ghvm, data);
+	default:
+		return NOTIFY_OK;
+	}
+}
+
+static void gh_vm_stop(struct gh_vm *ghvm)
+{
+	int ret;
+
+	down_write(&ghvm->status_lock);
+	if (ghvm->vm_status == GH_RM_VM_STATUS_RUNNING) {
+		ret = gh_rm_vm_stop(ghvm->rm, ghvm->vmid);
+		if (ret)
+			dev_warn(ghvm->parent, "Failed to stop VM: %d\n", ret);
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
+	up_write(&ghvm->status_lock);
+}
+
 static void gh_vm_free(struct work_struct *work)
 {
 	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
 	struct gh_vm_mem *mapping, *tmp;
 	int ret;
 
-	mutex_lock(&ghvm->mm_lock);
-	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
-		gh_vm_mem_reclaim(ghvm, mapping);
-		kfree(mapping);
-	}
-	mutex_unlock(&ghvm->mm_lock);
-
-	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
-	if (ret)
-		pr_warn("Failed to deallocate vmid: %d\n", ret);
+	switch (ghvm->vm_status) {
+	case GH_RM_VM_STATUS_RUNNING:
+		gh_vm_stop(ghvm);
+		fallthrough;
+	case GH_RM_VM_STATUS_INIT_FAILED:
+	case GH_RM_VM_STATUS_LOAD:
+	case GH_RM_VM_STATUS_EXITED:
+		mutex_lock(&ghvm->mm_lock);
+		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
+			gh_vm_mem_reclaim(ghvm, mapping);
+			kfree(mapping);
+		}
+		mutex_unlock(&ghvm->mm_lock);
+		fallthrough;
+	case GH_RM_VM_STATUS_NO_STATE:
+		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
+		if (ret)
+			dev_warn(ghvm->parent, "Failed to deallocate vmid: %d\n", ret);
+
+		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
+		gh_rm_put(ghvm->rm);
+		kfree(ghvm);
+		break;
+	default:
+		dev_err(ghvm->parent, "VM is unknown state: %d. VM will not be cleaned up.\n",
+			ghvm->vm_status);
 
-	put_gh_rm(ghvm->rm);
-	kfree(ghvm);
+		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
+		gh_rm_put(ghvm->rm);
+		kfree(ghvm);
+		break;
+	}
 }
 
 static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
 {
 	struct gh_vm *ghvm;
-	int vmid;
+	int vmid, ret;
 
 	vmid = gh_rm_alloc_vmid(rm, 0);
 	if (vmid < 0)
@@ -55,13 +136,130 @@  static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
 	ghvm->vmid = vmid;
 	ghvm->rm = rm;
 
+	init_waitqueue_head(&ghvm->vm_status_wait);
+	ghvm->nb.notifier_call = gh_vm_rm_notification;
+	ret = gh_rm_notifier_register(rm, &ghvm->nb);
+	if (ret) {
+		gh_rm_put(rm);
+		gh_rm_dealloc_vmid(rm, vmid);
+		kfree(ghvm);
+		return ERR_PTR(ret);
+	}
+
 	mutex_init(&ghvm->mm_lock);
 	INIT_LIST_HEAD(&ghvm->memory_mappings);
+	init_rwsem(&ghvm->status_lock);
 	INIT_WORK(&ghvm->free_work, gh_vm_free);
+	ghvm->vm_status = GH_RM_VM_STATUS_LOAD;
 
 	return ghvm;
 }
 
+static int gh_vm_start(struct gh_vm *ghvm)
+{
+	struct gh_vm_mem *mapping;
+	u64 dtb_offset;
+	u32 mem_handle;
+	int ret;
+
+	down_write(&ghvm->status_lock);
+	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
+		up_write(&ghvm->status_lock);
+		return 0;
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
+
+	mutex_lock(&ghvm->mm_lock);
+	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
+		switch (mapping->share_type) {
+		case VM_MEM_LEND:
+			ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
+			break;
+		case VM_MEM_SHARE:
+			ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
+			break;
+		}
+		if (ret) {
+			dev_warn(ghvm->parent, "Failed to %s parcel %d: %d\n",
+				mapping->share_type == VM_MEM_LEND ? "lend" : "share",
+				mapping->parcel.label,
+				ret);
+			goto err;
+		}
+	}
+	mutex_unlock(&ghvm->mm_lock);
+
+	mapping = gh_vm_mem_find_by_addr(ghvm, ghvm->dtb_config.guest_phys_addr,
+					ghvm->dtb_config.size);
+	if (!mapping) {
+		dev_warn(ghvm->parent, "Failed to find the memory_handle for DTB\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	mem_handle = mapping->parcel.mem_handle;
+	dtb_offset = ghvm->dtb_config.guest_phys_addr - mapping->guest_phys_addr;
+
+	ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, mem_handle,
+				0, 0, dtb_offset, ghvm->dtb_config.size);
+	if (ret) {
+		dev_warn(ghvm->parent, "Failed to configure VM: %d\n", ret);
+		goto err;
+	}
+
+	ret = gh_rm_vm_init(ghvm->rm, ghvm->vmid);
+	if (ret) {
+		dev_warn(ghvm->parent, "Failed to initialize VM: %d\n", ret);
+		goto err;
+	}
+
+	ret = gh_rm_vm_start(ghvm->rm, ghvm->vmid);
+	if (ret) {
+		dev_warn(ghvm->parent, "Failed to start VM: %d\n", ret);
+		goto err;
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_RUNNING;
+	up_write(&ghvm->status_lock);
+	return ret;
+err:
+	ghvm->vm_status = GH_RM_VM_STATUS_INIT_FAILED;
+	/* gh_vm_free will handle releasing resources and reclaiming memory */
+	up_write(&ghvm->status_lock);
+	return ret;
+}
+
+static int gh_vm_ensure_started(struct gh_vm *ghvm)
+{
+	int ret;
+
+	ret = down_read_interruptible(&ghvm->status_lock);
+	if (ret)
+		return ret;
+
+	/* Unlikely because VM is typically started */
+	if (unlikely(ghvm->vm_status == GH_RM_VM_STATUS_LOAD)) {
+		up_read(&ghvm->status_lock);
+		ret = gh_vm_start(ghvm);
+		if (ret)
+			goto out;
+		/** gh_vm_start() is guaranteed to bring status out of
+		 * GH_RM_VM_STATUS_LOAD, thus inifitely recursive call is not
+		 * possible
+		 */
+		return gh_vm_ensure_started(ghvm);
+	}
+
+	/* Unlikely because VM is typically running */
+	if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_RUNNING))
+		ret = -ENODEV;
+
+out:
+	up_read(&ghvm->status_lock);
+	return ret;
+}
+
 static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gh_vm *ghvm = filp->private_data;
@@ -85,6 +283,25 @@  static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		r = gh_vm_mem_alloc(ghvm, &region);
 		break;
 	}
+	case GH_VM_SET_DTB_CONFIG: {
+		struct gh_vm_dtb_config dtb_config;
+
+		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
+			return -EFAULT;
+
+		dtb_config.size = PAGE_ALIGN(dtb_config.size);
+		if (dtb_config.guest_phys_addr + dtb_config.size < dtb_config.guest_phys_addr)
+			return -EOVERFLOW;
+
+		ghvm->dtb_config = dtb_config;
+
+		r = 0;
+		break;
+	}
+	case GH_VM_START: {
+		r = gh_vm_ensure_started(ghvm);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 		break;
diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
index c9f6fa5478ed..26bcc2ae4478 100644
--- a/drivers/virt/gunyah/vm_mgr.h
+++ b/drivers/virt/gunyah/vm_mgr.h
@@ -10,6 +10,8 @@ 
 #include <linux/list.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/wait.h>
 
 #include <uapi/linux/gunyah.h>
 
@@ -34,6 +36,13 @@  struct gh_vm {
 	u16 vmid;
 	struct gh_rm *rm;
 	struct device *parent;
+	enum gh_rm_vm_auth_mechanism auth;
+	struct gh_vm_dtb_config dtb_config;
+
+	struct notifier_block nb;
+	enum gh_rm_vm_status vm_status;
+	wait_queue_head_t vm_status_wait;
+	struct rw_semaphore status_lock;
 
 	struct work_struct free_work;
 	struct mutex mm_lock;
@@ -44,5 +53,6 @@  int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *regio
 void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
 int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
 struct gh_vm_mem *gh_vm_mem_find_by_label(struct gh_vm *ghvm, u32 label);
+struct gh_vm_mem *gh_vm_mem_find_by_addr(struct gh_vm *ghvm, u64 guest_phys_addr, u32 size);
 
 #endif
diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
index db6f55cef37f..6e1d2e8bddb7 100644
--- a/drivers/virt/gunyah/vm_mgr_mm.c
+++ b/drivers/virt/gunyah/vm_mgr_mm.c
@@ -47,6 +47,29 @@  void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
 	list_del(&mapping->list);
 }
 
+struct gh_vm_mem *gh_vm_mem_find_by_addr(struct gh_vm *ghvm, u64 guest_phys_addr, u32 size)
+{
+	struct gh_vm_mem *mapping = NULL;
+	int ret;
+
+	ret = mutex_lock_interruptible(&ghvm->mm_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
+		if (guest_phys_addr >= mapping->guest_phys_addr &&
+			(guest_phys_addr + size <= mapping->guest_phys_addr +
+			(mapping->npages << PAGE_SHIFT))) {
+			goto unlock;
+		}
+	}
+
+	mapping = NULL;
+unlock:
+	mutex_unlock(&ghvm->mm_lock);
+	return mapping;
+}
+
 struct gh_vm_mem *gh_vm_mem_find_by_label(struct gh_vm *ghvm, u32 label)
 {
 	struct gh_vm_mem *mapping;
diff --git a/include/linux/gunyah_rsc_mgr.h b/include/linux/gunyah_rsc_mgr.h
index 88a429dad09e..8b0b46f28e39 100644
--- a/include/linux/gunyah_rsc_mgr.h
+++ b/include/linux/gunyah_rsc_mgr.h
@@ -29,6 +29,12 @@  struct gh_rm_vm_exited_payload {
 #define GH_RM_NOTIFICATION_VM_EXITED		 0x56100001
 
 enum gh_rm_vm_status {
+	/**
+	 * RM doesn't have a state where load partially failed because
+	 * only Linux
+	 */
+	GH_RM_VM_STATUS_LOAD_FAILED	= -1,
+
 	GH_RM_VM_STATUS_NO_STATE	= 0,
 	GH_RM_VM_STATUS_INIT		= 1,
 	GH_RM_VM_STATUS_READY		= 2,
diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
index a19207e3e065..d6abd8605a2e 100644
--- a/include/uapi/linux/gunyah.h
+++ b/include/uapi/linux/gunyah.h
@@ -49,4 +49,17 @@  struct gh_userspace_memory_region {
 #define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
 						struct gh_userspace_memory_region)
 
+/**
+ * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
+ * @guest_phys_addr: Address of the VM's devicetree in guest memory.
+ * @size: Maximum size of the devicetree.
+ */
+struct gh_vm_dtb_config {
+	__u64 guest_phys_addr;
+	__u64 size;
+};
+#define GH_VM_SET_DTB_CONFIG	_IOW(GH_IOCTL_TYPE, 0x2, struct gh_vm_dtb_config)
+
+#define GH_VM_START		_IO(GH_IOCTL_TYPE, 0x3)
+
 #endif