[v10,13/26] gunyah: vm_mgr: Add ioctls to support basic non-proxy VM boot

Message ID 20230214212427.3316544-1-quic_eberman@quicinc.com
State New
Headers
Series: Drivers for Gunyah hypervisor

Commit Message

Elliot Berman Feb. 14, 2023, 9:24 p.m. UTC
  Add remaining ioctls to support non-proxy VM boot:

 - Gunyah Resource Manager uses the VM's devicetree to configure the
   virtual machine. The location of the devicetree in the guest's
   virtual memory can be declared via the SET_DTB_CONFIG ioctl.
 - Trigger start of the virtual machine with VM_START ioctl.

Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
---
 drivers/virt/gunyah/vm_mgr.c    | 229 ++++++++++++++++++++++++++++++--
 drivers/virt/gunyah/vm_mgr.h    |  10 ++
 drivers/virt/gunyah/vm_mgr_mm.c |  23 ++++
 include/linux/gunyah_rsc_mgr.h  |   6 +
 include/uapi/linux/gunyah.h     |  13 ++
 5 files changed, 268 insertions(+), 13 deletions(-)
  

Comments

Greg KH Feb. 16, 2023, 6:35 a.m. UTC | #1
On Tue, Feb 14, 2023 at 01:24:26PM -0800, Elliot Berman wrote:
> +	case GH_VM_SET_DTB_CONFIG: {
> +		struct gh_vm_dtb_config dtb_config;
> +
> +		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
> +			return -EFAULT;
> +
> +		dtb_config.size = PAGE_ALIGN(dtb_config.size);
> +		ghvm->dtb_config = dtb_config;

Do you really mean to copy this tiny structure twice (once from
userspace and the second time off of the stack)?  If so, why?

And where are the values of the structure checked for validity?  Can any
64bit value work for size and "gpa"?

thanks,

greg k-h
  
Elliot Berman Feb. 16, 2023, 5:20 p.m. UTC | #2
On 2/15/2023 10:35 PM, Greg Kroah-Hartman wrote:
> On Tue, Feb 14, 2023 at 01:24:26PM -0800, Elliot Berman wrote:
>> +	case GH_VM_SET_DTB_CONFIG: {
>> +		struct gh_vm_dtb_config dtb_config;
>> +
>> +		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
>> +			return -EFAULT;
>> +
>> +		dtb_config.size = PAGE_ALIGN(dtb_config.size);
>> +		ghvm->dtb_config = dtb_config;
> 
> Do you really mean to copy this tiny structure twice (once from
> userspace and the second time off of the stack)?  If so, why?

Ah, yes this can be optimized to copy directly.
> 
> And where are the values of the structure checked for validity?  Can any
> 64bit value work for size and "gpa"?
> 

The values get checked when starting the VM

static int gh_vm_start(struct gh_vm *ghvm)
	...
	mapping = gh_vm_mem_find_mapping(ghvm, ghvm->dtb_config.gpa, 
ghvm->dtb_config.size);
	if (!mapping) {
		pr_warn("Failed to find the memory_handle for DTB\n");
		ret = -EINVAL;
		goto err;
	}

If user passes an address that they've not set up, then 
gh_vm_mem_find_mapping returns NULL and GH_VM_START ioctl fails.

I've not done the check from the GH_VM_SET_DTB_CONFIG ioctl itself 
because I didn't want to require userspace to share the memory first. 
We'd need to check again anyway since user could SET_USER_MEMORY, 
SET_DTB_CONFIG, SET_USER_MEMORY (remove), VM_START.

Thanks,
Elliot
  
Srivatsa Vaddagiri Feb. 20, 2023, 9:15 a.m. UTC | #3
* Elliot Berman <quic_eberman@quicinc.com> [2023-02-14 13:24:26]:

>  static void gh_vm_free(struct work_struct *work)
>  {
>  	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>  	struct gh_vm_mem *mapping, *tmp;
>  	int ret;
>  
> -	mutex_lock(&ghvm->mm_lock);
> -	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> -		gh_vm_mem_reclaim(ghvm, mapping);
> -		kfree(mapping);
> +	switch (ghvm->vm_status) {
> +unknown_state:
> +	case GH_RM_VM_STATUS_RUNNING:
> +		gh_vm_stop(ghvm);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_INIT_FAILED:
> +	case GH_RM_VM_STATUS_LOAD:
> +	case GH_RM_VM_STATUS_LOAD_FAILED:
> +		mutex_lock(&ghvm->mm_lock);
> +		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> +			gh_vm_mem_reclaim(ghvm, mapping);
> +			kfree(mapping);
> +		}
> +		mutex_unlock(&ghvm->mm_lock);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_NO_STATE:
> +		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> +		if (ret)
> +			pr_warn("Failed to deallocate vmid: %d\n", ret);
> +
> +		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
> +		put_gh_rm(ghvm->rm);
> +		kfree(ghvm);
> +		break;
> +	default:
> +		pr_err("VM is unknown state: %d, assuming it's running.\n", ghvm->vm_status);
> +		goto unknown_state;

'goto unknown_state' here leads to a infinite loop AFAICS. For example consider
the case  where VM_START failed (due to mem_lend operation) causing VM state to
be GH_RM_VM_STATUS_RESET. A subsequent close(vmfd) can lead to that forever
loop.

//snip


> +static int gh_vm_start(struct gh_vm *ghvm)
> +{
> +	struct gh_vm_mem *mapping;
> +	u64 dtb_offset;
> +	u32 mem_handle;
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
> +		up_write(&ghvm->status_lock);
> +		return 0;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
> +
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {

We don't seem to have the right lock here while walking the list.
  
Srivatsa Vaddagiri Feb. 20, 2023, 9:54 a.m. UTC | #4
* Srivatsa Vaddagiri <quic_svaddagi@quicinc.com> [2023-02-20 14:45:55]:

> * Elliot Berman <quic_eberman@quicinc.com> [2023-02-14 13:24:26]:
> 
> >  static void gh_vm_free(struct work_struct *work)
> >  {
> >  	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
> >  	struct gh_vm_mem *mapping, *tmp;
> >  	int ret;
> >  
> > -	mutex_lock(&ghvm->mm_lock);
> > -	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> > -		gh_vm_mem_reclaim(ghvm, mapping);
> > -		kfree(mapping);
> > +	switch (ghvm->vm_status) {
> > +unknown_state:
> > +	case GH_RM_VM_STATUS_RUNNING:
> > +		gh_vm_stop(ghvm);
> > +		fallthrough;
> > +	case GH_RM_VM_STATUS_INIT_FAILED:
> > +	case GH_RM_VM_STATUS_LOAD:
> > +	case GH_RM_VM_STATUS_LOAD_FAILED:
> > +		mutex_lock(&ghvm->mm_lock);
> > +		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> > +			gh_vm_mem_reclaim(ghvm, mapping);
> > +			kfree(mapping);
> > +		}
> > +		mutex_unlock(&ghvm->mm_lock);
> > +		fallthrough;
> > +	case GH_RM_VM_STATUS_NO_STATE:
> > +		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> > +		if (ret)
> > +			pr_warn("Failed to deallocate vmid: %d\n", ret);
> > +
> > +		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
> > +		put_gh_rm(ghvm->rm);
> > +		kfree(ghvm);
> > +		break;
> > +	default:
> > +		pr_err("VM is unknown state: %d, assuming it's running.\n", ghvm->vm_status);
> > +		goto unknown_state;
> 
> 'goto unknown_state' here leads to a infinite loop AFAICS. For example consider
> the case  where VM_START failed (due to mem_lend operation) causing VM state to
> be GH_RM_VM_STATUS_RESET. A subsequent close(vmfd) can leads to that forever
> loop.

Hmm ..that's not a good example perhaps (VM state is set to
GH_RM_VM_STATUS_INIT_FAILED in failed case). Nevertheless I think we should
avoid the goto in case of unknown state.


- vatsa
  
Srivatsa Vaddagiri Feb. 21, 2023, 1:06 p.m. UTC | #5
* Elliot Berman <quic_eberman@quicinc.com> [2023-02-14 13:24:26]:

> +static int gh_vm_start(struct gh_vm *ghvm)
> +{
> +	struct gh_vm_mem *mapping;
> +	u64 dtb_offset;
> +	u32 mem_handle;
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
> +		up_write(&ghvm->status_lock);
> +		return 0;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
> +
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
> +		switch (mapping->share_type) {
> +		case VM_MEM_LEND:
> +			ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
> +			break;
> +		case VM_MEM_SHARE:
> +			ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
> +			break;
> +		}
> +		if (ret) {
> +			pr_warn("Failed to %s parcel %d: %d\n",
> +				mapping->share_type == VM_MEM_LEND ? "lend" : "share",
> +				mapping->parcel.label,
> +				ret);
> +			goto err;
> +		}
> +	}
> +
> +	mapping = gh_vm_mem_find_mapping(ghvm, ghvm->dtb_config.gpa, ghvm->dtb_config.size);

It may be some optimization to derive DTB 'mapping' in the first loop you have
above (that lends/shares all mappings)


> +	if (!mapping) {
> +		pr_warn("Failed to find the memory_handle for DTB\n");
> +		ret = -EINVAL;
> +		goto err;
> +	}
  
Srinivas Kandagatla Feb. 21, 2023, 2:17 p.m. UTC | #6
On 14/02/2023 21:24, Elliot Berman wrote:
> 
> Add remaining ioctls to support non-proxy VM boot:
> 
>   - Gunyah Resource Manager uses the VM's devicetree to configure the
>     virtual machine. The location of the devicetree in the guest's
>     virtual memory can be declared via the SET_DTB_CONFIG ioctl.
>   - Trigger start of the virtual machine with VM_START ioctl.
> 
> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
> ---
>   drivers/virt/gunyah/vm_mgr.c    | 229 ++++++++++++++++++++++++++++++--
>   drivers/virt/gunyah/vm_mgr.h    |  10 ++
>   drivers/virt/gunyah/vm_mgr_mm.c |  23 ++++
>   include/linux/gunyah_rsc_mgr.h  |   6 +
>   include/uapi/linux/gunyah.h     |  13 ++
>   5 files changed, 268 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
> index 84102bac03cc..fa324385ade5 100644
> --- a/drivers/virt/gunyah/vm_mgr.c
> +++ b/drivers/virt/gunyah/vm_mgr.c
> @@ -9,37 +9,114 @@
>   #include <linux/file.h>
>   #include <linux/gunyah_rsc_mgr.h>
>   #include <linux/miscdevice.h>
> +#include <linux/mm.h>
>   #include <linux/module.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
>   #include "vm_mgr.h"
>   
> +static int gh_vm_rm_notification_status(struct gh_vm *ghvm, void *data)
> +{
> +	struct gh_rm_vm_status_payload *payload = data;
> +
> +	if (payload->vmid != ghvm->vmid)
> +		return NOTIFY_OK;
Is this even possible? If yes, then this is a bug somewhere, we should 
not be getting notifications for something that does not belong to this vm.
What is the typical case for such behavior? comment would be useful.


> +
> +	/* All other state transitions are synchronous to a corresponding RM call */
> +	if (payload->vm_status == GH_RM_VM_STATUS_RESET) {
> +		down_write(&ghvm->status_lock);
> +		ghvm->vm_status = payload->vm_status;
> +		up_write(&ghvm->status_lock);
> +		wake_up(&ghvm->vm_status_wait);
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static int gh_vm_rm_notification_exited(struct gh_vm *ghvm, void *data)
> +{
> +	struct gh_rm_vm_exited_payload *payload = data;
> +
> +	if (payload->vmid != ghvm->vmid)
> +		return NOTIFY_OK;
same

> +
> +	down_write(&ghvm->status_lock);
> +	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
> +	up_write(&ghvm->status_lock);
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static int gh_vm_rm_notification(struct notifier_block *nb, unsigned long action, void *data)
> +{
> +	struct gh_vm *ghvm = container_of(nb, struct gh_vm, nb);
> +
> +	switch (action) {
> +	case GH_RM_NOTIFICATION_VM_STATUS:
> +		return gh_vm_rm_notification_status(ghvm, data);
> +	case GH_RM_NOTIFICATION_VM_EXITED:
> +		return gh_vm_rm_notification_exited(ghvm, data);
> +	default:
> +		return NOTIFY_OK;
> +	}
> +}
> +
> +static void gh_vm_stop(struct gh_vm *ghvm)
> +{
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status == GH_RM_VM_STATUS_RUNNING) {
> +		ret = gh_rm_vm_stop(ghvm->rm, ghvm->vmid);
> +		if (ret)
> +			pr_warn("Failed to stop VM: %d\n", ret);
Should we not bail out from this fail path?


> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
> +	up_write(&ghvm->status_lock);
> +}
> +
>   static void gh_vm_free(struct work_struct *work)
>   {
>   	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>   	struct gh_vm_mem *mapping, *tmp;
>   	int ret;
>   
> -	mutex_lock(&ghvm->mm_lock);
> -	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> -		gh_vm_mem_reclaim(ghvm, mapping);
> -		kfree(mapping);
> +	switch (ghvm->vm_status) {
> +unknown_state:

Never seen this style of using goto from switch to a new label in 
switch case. I'm sure this is some kind of trick but it's not helping readers.

Can we rewrite this using a normal semantics.

may be a do while could help.


> +	case GH_RM_VM_STATUS_RUNNING:
> +		gh_vm_stop(ghvm);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_INIT_FAILED:
> +	case GH_RM_VM_STATUS_LOAD:
> +	case GH_RM_VM_STATUS_LOAD_FAILED:
> +		mutex_lock(&ghvm->mm_lock);
> +		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
> +			gh_vm_mem_reclaim(ghvm, mapping);
> +			kfree(mapping);
> +		}
> +		mutex_unlock(&ghvm->mm_lock);
> +		fallthrough;
> +	case GH_RM_VM_STATUS_NO_STATE:
> +		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> +		if (ret)
> +			pr_warn("Failed to deallocate vmid: %d\n", ret);
> +
> +		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
> +		put_gh_rm(ghvm->rm);
> +		kfree(ghvm);
> +		break;
> +	default:
> +		pr_err("VM is unknown state: %d, assuming it's running.\n", ghvm->vm_status);
vm_status did not change; do we not end up here again?

> +		goto unknown_state;
>   	}
> -	mutex_unlock(&ghvm->mm_lock);
> -
> -	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
> -	if (ret)
> -		pr_warn("Failed to deallocate vmid: %d\n", ret);
> -
> -	put_gh_rm(ghvm->rm);
> -	kfree(ghvm);
>   }
>   
>   static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   {
>   	struct gh_vm *ghvm;
> -	int vmid;
> +	int vmid, ret;
>   
>   	vmid = gh_rm_alloc_vmid(rm, 0);
>   	if (vmid < 0)
> @@ -56,13 +133,123 @@ static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>   	ghvm->vmid = vmid;
>   	ghvm->rm = rm;
>   
> +	init_waitqueue_head(&ghvm->vm_status_wait);
> +	ghvm->nb.notifier_call = gh_vm_rm_notification;
> +	ret = gh_rm_notifier_register(rm, &ghvm->nb);
> +	if (ret) {
> +		put_gh_rm(rm);
> +		gh_rm_dealloc_vmid(rm, vmid);
> +		kfree(ghvm);
> +		return ERR_PTR(ret);
> +	}
> +
>   	mutex_init(&ghvm->mm_lock);
>   	INIT_LIST_HEAD(&ghvm->memory_mappings);
> +	init_rwsem(&ghvm->status_lock);
>   	INIT_WORK(&ghvm->free_work, gh_vm_free);
> +	ghvm->vm_status = GH_RM_VM_STATUS_LOAD;
>   
>   	return ghvm;
>   }
>   
> +static int gh_vm_start(struct gh_vm *ghvm)
> +{
> +	struct gh_vm_mem *mapping;
> +	u64 dtb_offset;
> +	u32 mem_handle;
> +	int ret;
> +
> +	down_write(&ghvm->status_lock);
> +	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
> +		up_write(&ghvm->status_lock);
> +		return 0;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
> +

<------
should we not take ghvm->mm_lock here to make sure that list is 
consistent while processing.
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
> +		switch (mapping->share_type) {
> +		case VM_MEM_LEND:
> +			ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
> +			break;
> +		case VM_MEM_SHARE:
> +			ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
> +			break;
> +		}
> +		if (ret) {
> +			pr_warn("Failed to %s parcel %d: %d\n",
> +				mapping->share_type == VM_MEM_LEND ? "lend" : "share",
> +				mapping->parcel.label,
> +				ret);
> +			goto err;
> +		}
> +	}
--->

> +
> +	mapping = gh_vm_mem_find_mapping(ghvm, ghvm->dtb_config.gpa, ghvm->dtb_config.size);
> +	if (!mapping) {
> +		pr_warn("Failed to find the memory_handle for DTB\n");

What wil happen to the mappings that are lend or shared?

> +		ret = -EINVAL;
> +		goto err;
> +	}
> +
> +	mem_handle = mapping->parcel.mem_handle;
> +	dtb_offset = ghvm->dtb_config.gpa - mapping->guest_phys_addr;
> +
> +	ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, mem_handle,

where is the authentication mechanism (auth) coming from? Who is supposed 
to set this value?

Should it come from userspace? if so I do not see any UAPI facility to 
do that via VM_START ioctl.


> +				0, 0, dtb_offset, ghvm->dtb_config.size);
> +	if (ret) {
> +		pr_warn("Failed to configure VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ret = gh_rm_vm_init(ghvm->rm, ghvm->vmid);
> +	if (ret) {
> +		pr_warn("Failed to initialize VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ret = gh_rm_vm_start(ghvm->rm, ghvm->vmid);
> +	if (ret) {
> +		pr_warn("Failed to start VM: %d\n", ret);
> +		goto err;
> +	}
> +
> +	ghvm->vm_status = GH_RM_VM_STATUS_RUNNING;
> +	up_write(&ghvm->status_lock);
> +	return ret;
> +err:
> +	ghvm->vm_status = GH_RM_VM_STATUS_INIT_FAILED;
> +	up_write(&ghvm->status_lock);

Am really not sure if we are doing right thing in the error path, there 
are multiple cases that seems to be not handled or if it was not 
required no comments to clarify this are documented.
ex: if vm start fails then what happes with memory mapping or do we need 
to un-configure vm or un-init vm from hypervisor side?

If none of this is required, it's useful to add some clear comments.

> +	return ret;
> +}
> +
> +static int gh_vm_ensure_started(struct gh_vm *ghvm)
> +{
> +	int ret;
> +
> +retry:
> +	ret = down_read_interruptible(&ghvm->status_lock);
> +	if (ret)
> +		return ret;
> +
> +	/* Unlikely because VM is typically started */
> +	if (unlikely(ghvm->vm_status == GH_RM_VM_STATUS_LOAD)) {
> +		up_read(&ghvm->status_lock);
> +		ret = gh_vm_start(ghvm);
> +		if (ret)
> +			goto out;
> +		goto retry;
> +	}

do while will do better job here w.r.t to readablity.

> +
> +	/* Unlikely because VM is typically running */
> +	if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_RUNNING))
> +		ret = -ENODEV;
> +
> +out:
> +	up_read(&ghvm->status_lock);
> +	return ret;
> +}
> +
>   static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   {
>   	struct gh_vm *ghvm = filp->private_data;
> @@ -88,6 +275,22 @@ static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>   			r = gh_vm_mem_free(ghvm, region.label);
>   		break;
>   	}
> +	case GH_VM_SET_DTB_CONFIG: {
> +		struct gh_vm_dtb_config dtb_config;
> +
> +		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
> +			return -EFAULT;
> +
> +		dtb_config.size = PAGE_ALIGN(dtb_config.size);
> +		ghvm->dtb_config = dtb_config;
> +
> +		r = 0;
> +		break;
> +	}
> +	case GH_VM_START: {
> +		r = gh_vm_ensure_started(ghvm);
> +		break;
> +	}
>   	default:
>   		r = -ENOTTY;
>   		break;
> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
> index 97bc00c34878..e9cf56647cc2 100644
> --- a/drivers/virt/gunyah/vm_mgr.h
> +++ b/drivers/virt/gunyah/vm_mgr.h
> @@ -10,6 +10,8 @@
>   #include <linux/list.h>
>   #include <linux/miscdevice.h>
>   #include <linux/mutex.h>
> +#include <linux/rwsem.h>
> +#include <linux/wait.h>
>   
>   #include <uapi/linux/gunyah.h>
>   
> @@ -33,6 +35,13 @@ struct gh_vm_mem {
>   struct gh_vm {
>   	u16 vmid;
>   	struct gh_rm *rm;
> +	enum gh_rm_vm_auth_mechanism auth;
> +	struct gh_vm_dtb_config dtb_config;
> +
> +	struct notifier_block nb;
> +	enum gh_rm_vm_status vm_status;
> +	wait_queue_head_t vm_status_wait;
> +	struct rw_semaphore status_lock;
>   
>   	struct work_struct free_work;
>   	struct mutex mm_lock;
> @@ -43,5 +52,6 @@ int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *regio
>   void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
>   int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
>   struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
> +struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, u32 size);
>   
>   #endif
> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
> index 03e71a36ea3b..128b90da555a 100644
> --- a/drivers/virt/gunyah/vm_mgr_mm.c
> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
> @@ -52,6 +52,29 @@ void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
>   	list_del(&mapping->list);
>   }
>   
> +struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, u32 size)
naming is a bit misleading; we already have
gh_vm_mem_find/__gh_vm_mem_find which is returning mapping based on label
now with gh_vm_mem_find_mapping() is doing same thing but with address.

Can we rename them clearly
gh_vm_mem_find_mapping_by_label()
gh_vm_mem_find_mapping_by_addr()

> +{

> +	struct gh_vm_mem *mapping = NULL;
> +	int ret;
> +
> +	ret = mutex_lock_interruptible(&ghvm->mm_lock);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
> +		if (gpa >= mapping->guest_phys_addr &&
> +			(gpa + size <= mapping->guest_phys_addr +
> +			(mapping->npages << PAGE_SHIFT))) {
> +			goto unlock;
> +		}
> +	}
> +
> +	mapping = NULL;
> +unlock:
> +	mutex_unlock(&ghvm->mm_lock);
> +	return mapping;
> +}
> +
>   struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
>   {
>   	struct gh_vm_mem *mapping;
> diff --git a/include/linux/gunyah_rsc_mgr.h b/include/linux/gunyah_rsc_mgr.h
> index 2d8b8b6cc394..9cffee6f9b4e 100644
> --- a/include/linux/gunyah_rsc_mgr.h
> +++ b/include/linux/gunyah_rsc_mgr.h
> @@ -32,6 +32,12 @@ struct gh_rm_vm_exited_payload {
>   #define GH_RM_NOTIFICATION_VM_EXITED		 0x56100001
>   
>   enum gh_rm_vm_status {
> +	/**
> +	 * RM doesn't have a state where load partially failed because
> +	 * only Linux
> +	 */
> +	GH_RM_VM_STATUS_LOAD_FAILED	= -1,
> +
>   	GH_RM_VM_STATUS_NO_STATE	= 0,
>   	GH_RM_VM_STATUS_INIT		= 1,
>   	GH_RM_VM_STATUS_READY		= 2,
> diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
> index d85d12119a48..d899bba6a4c6 100644
> --- a/include/uapi/linux/gunyah.h
> +++ b/include/uapi/linux/gunyah.h
> @@ -53,4 +53,17 @@ struct gh_userspace_memory_region {
>   #define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
>   						struct gh_userspace_memory_region)
>   
> +/**
> + * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
> + * @gpa: Address of the VM's devicetree in guest memory.
> + * @size: Maximum size of the devicetree.
> + */
> +struct gh_vm_dtb_config {
> +	__u64 gpa;
> +	__u64 size;
> +};
> +#define GH_VM_SET_DTB_CONFIG	_IOW(GH_IOCTL_TYPE, 0x2, struct gh_vm_dtb_config)
> +
> +#define GH_VM_START		_IO(GH_IOCTL_TYPE, 0x3)
> +
>   #endif
  
Elliot Berman Feb. 23, 2023, 12:50 a.m. UTC | #7
On 2/21/2023 6:17 AM, Srinivas Kandagatla wrote:
> 
> 
> On 14/02/2023 21:24, Elliot Berman wrote:
>>
>> Add remaining ioctls to support non-proxy VM boot:
>>
>>   - Gunyah Resource Manager uses the VM's devicetree to configure the
>>     virtual machine. The location of the devicetree in the guest's
>>     virtual memory can be declared via the SET_DTB_CONFIG ioctl.
>>   - Trigger start of the virtual machine with VM_START ioctl.
>>
>> Co-developed-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Prakruthi Deepak Heragu <quic_pheragu@quicinc.com>
>> Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
>> ---
>>   drivers/virt/gunyah/vm_mgr.c    | 229 ++++++++++++++++++++++++++++++--
>>   drivers/virt/gunyah/vm_mgr.h    |  10 ++
>>   drivers/virt/gunyah/vm_mgr_mm.c |  23 ++++
>>   include/linux/gunyah_rsc_mgr.h  |   6 +
>>   include/uapi/linux/gunyah.h     |  13 ++
>>   5 files changed, 268 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
>> index 84102bac03cc..fa324385ade5 100644
>> --- a/drivers/virt/gunyah/vm_mgr.c
>> +++ b/drivers/virt/gunyah/vm_mgr.c
>> @@ -9,37 +9,114 @@
>>   #include <linux/file.h>
>>   #include <linux/gunyah_rsc_mgr.h>
>>   #include <linux/miscdevice.h>
>> +#include <linux/mm.h>
>>   #include <linux/module.h>
>>   #include <uapi/linux/gunyah.h>
>>   #include "vm_mgr.h"
>> +static int gh_vm_rm_notification_status(struct gh_vm *ghvm, void *data)
>> +{
>> +    struct gh_rm_vm_status_payload *payload = data;
>> +
>> +    if (payload->vmid != ghvm->vmid)
>> +        return NOTIFY_OK;
> Is this even possible? If yes, then this is a bug somewhere, we should 
> not be getting notifications for something that does not belong to this vm.
> What is the typical case for such behavior? comment would be useful.
> 

VM manager has reigstered to receive all notifications. If there are 
multiple VMs running, then the notifier callback receives notifications 
about all VMs. I've not yet implemented any filtering at resource 
manager level because it added lot of processing code in the resource 
manager that is easily done in the notifier callback.

> 
>> +
>> +    /* All other state transitions are synchronous to a corresponding 
>> RM call */
>> +    if (payload->vm_status == GH_RM_VM_STATUS_RESET) {
>> +        down_write(&ghvm->status_lock);
>> +        ghvm->vm_status = payload->vm_status;
>> +        up_write(&ghvm->status_lock);
>> +        wake_up(&ghvm->vm_status_wait);
>> +    }
>> +
>> +    return NOTIFY_DONE;
>> +}
>> +
>> +static int gh_vm_rm_notification_exited(struct gh_vm *ghvm, void *data)
>> +{
>> +    struct gh_rm_vm_exited_payload *payload = data;
>> +
>> +    if (payload->vmid != ghvm->vmid)
>> +        return NOTIFY_OK;
> same
> 
>> +
>> +    down_write(&ghvm->status_lock);
>> +    ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
>> +    up_write(&ghvm->status_lock);
>> +
>> +    return NOTIFY_DONE;
>> +}
>> +
>> +static int gh_vm_rm_notification(struct notifier_block *nb, unsigned 
>> long action, void *data)
>> +{
>> +    struct gh_vm *ghvm = container_of(nb, struct gh_vm, nb);
>> +
>> +    switch (action) {
>> +    case GH_RM_NOTIFICATION_VM_STATUS:
>> +        return gh_vm_rm_notification_status(ghvm, data);
>> +    case GH_RM_NOTIFICATION_VM_EXITED:
>> +        return gh_vm_rm_notification_exited(ghvm, data);
>> +    default:
>> +        return NOTIFY_OK;
>> +    }
>> +}
>> +
>> +static void gh_vm_stop(struct gh_vm *ghvm)
>> +{
>> +    int ret;
>> +
>> +    down_write(&ghvm->status_lock);
>> +    if (ghvm->vm_status == GH_RM_VM_STATUS_RUNNING) {
>> +        ret = gh_rm_vm_stop(ghvm->rm, ghvm->vmid);
>> +        if (ret)
>> +            pr_warn("Failed to stop VM: %d\n", ret);
> Should we not bail out from this fail path?
> 

This is called in the gh_vm_free path and we have some options here when 
we get some error while stopping a VM. So far, my strategy has been to 
ignore error as best we can and continue. We might get further errors, 
but we can also continue to clean up some more resources.

If there's an error, I'm not sure if there is a proper strategy to get 
someone to retry later: userspace is closing all its references to the 
VM and we need to stop the VM and clean up all our resources. Nitro 
Enclaves and ACRN suffer similar issues.

> 
>> +    }
>> +
>> +    ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
>> +    up_write(&ghvm->status_lock);
>> +}
>> +
>>   static void gh_vm_free(struct work_struct *work)
>>   {
>>       struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
>>       struct gh_vm_mem *mapping, *tmp;
>>       int ret;
>> -    mutex_lock(&ghvm->mm_lock);
>> -    list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, 
>> list) {
>> -        gh_vm_mem_reclaim(ghvm, mapping);
>> -        kfree(mapping);
>> +    switch (ghvm->vm_status) {
>> +unknown_state:
> 
> Never seen this style of using goto from switch to a new label in switch 
> case. Am sure this is some kinda trick but its not helping readers.
> 
> Can we rewrite this using a normal semantics.
> 
> may be a do while could help.
> 

Srivatsa suggested dropping the goto, I can do that.
> 
>> +    case GH_RM_VM_STATUS_RUNNING:
>> +        gh_vm_stop(ghvm);
>> +        fallthrough;
>> +    case GH_RM_VM_STATUS_INIT_FAILED:
>> +    case GH_RM_VM_STATUS_LOAD:
>> +    case GH_RM_VM_STATUS_LOAD_FAILED:
>> +        mutex_lock(&ghvm->mm_lock);
>> +        list_for_each_entry_safe(mapping, tmp, 
>> &ghvm->memory_mappings, list) {
>> +            gh_vm_mem_reclaim(ghvm, mapping);
>> +            kfree(mapping);
>> +        }
>> +        mutex_unlock(&ghvm->mm_lock);
>> +        fallthrough;
>> +    case GH_RM_VM_STATUS_NO_STATE:
>> +        ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>> +        if (ret)
>> +            pr_warn("Failed to deallocate vmid: %d\n", ret);
>> +
>> +        gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
>> +        put_gh_rm(ghvm->rm);
>> +        kfree(ghvm);
>> +        break;
>> +    default:
>> +        pr_err("VM is unknown state: %d, assuming it's running.\n", 
>> ghvm->vm_status);
> vm_status did not change do we not endup here again?
> 
>> +        goto unknown_state;
>>       }
>> -    mutex_unlock(&ghvm->mm_lock);
>> -
>> -    ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
>> -    if (ret)
>> -        pr_warn("Failed to deallocate vmid: %d\n", ret);
>> -
>> -    put_gh_rm(ghvm->rm);
>> -    kfree(ghvm);
>>   }
>>   static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
>>   {
>>       struct gh_vm *ghvm;
>> -    int vmid;
>> +    int vmid, ret;
>>       vmid = gh_rm_alloc_vmid(rm, 0);
>>       if (vmid < 0)
>> @@ -56,13 +133,123 @@ static __must_check struct gh_vm 
>> *gh_vm_alloc(struct gh_rm *rm)
>>       ghvm->vmid = vmid;
>>       ghvm->rm = rm;
>> +    init_waitqueue_head(&ghvm->vm_status_wait);
>> +    ghvm->nb.notifier_call = gh_vm_rm_notification;
>> +    ret = gh_rm_notifier_register(rm, &ghvm->nb);
>> +    if (ret) {
>> +        put_gh_rm(rm);
>> +        gh_rm_dealloc_vmid(rm, vmid);
>> +        kfree(ghvm);
>> +        return ERR_PTR(ret);
>> +    }
>> +
>>       mutex_init(&ghvm->mm_lock);
>>       INIT_LIST_HEAD(&ghvm->memory_mappings);
>> +    init_rwsem(&ghvm->status_lock);
>>       INIT_WORK(&ghvm->free_work, gh_vm_free);
>> +    ghvm->vm_status = GH_RM_VM_STATUS_LOAD;
>>       return ghvm;
>>   }
>> +static int gh_vm_start(struct gh_vm *ghvm)
>> +{
>> +    struct gh_vm_mem *mapping;
>> +    u64 dtb_offset;
>> +    u32 mem_handle;
>> +    int ret;
>> +
>> +    down_write(&ghvm->status_lock);
>> +    if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
>> +        up_write(&ghvm->status_lock);
>> +        return 0;
>> +    }
>> +
>> +    ghvm->vm_status = GH_RM_VM_STATUS_RESET;
>> +
> 
> <------
> should we not take ghvm->mm_lock here to make sure that list is 
> consistent while processing.

Done.

>> +    list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
>> +        switch (mapping->share_type) {
>> +        case VM_MEM_LEND:
>> +            ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
>> +            break;
>> +        case VM_MEM_SHARE:
>> +            ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
>> +            break;
>> +        }
>> +        if (ret) {
>> +            pr_warn("Failed to %s parcel %d: %d\n",
>> +                mapping->share_type == VM_MEM_LEND ? "lend" : "share",
>> +                mapping->parcel.label,
>> +                ret);
>> +            goto err;
>> +        }
>> +    }
> --->
> 
>> +
>> +    mapping = gh_vm_mem_find_mapping(ghvm, ghvm->dtb_config.gpa, 
>> ghvm->dtb_config.size);
>> +    if (!mapping) {
>> +        pr_warn("Failed to find the memory_handle for DTB\n");
> 
> What wil happen to the mappings that are lend or shared?
> 

When the VM is cleaned up (on final destruction), the mappings are 
reclaimed.

>> +        ret = -EINVAL;
>> +        goto err;
>> +    }
>> +
>> +    mem_handle = mapping->parcel.mem_handle;
>> +    dtb_offset = ghvm->dtb_config.gpa - mapping->guest_phys_addr;
>> +
>> +    ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, 
>> mem_handle,
> 
> where is authentication mechanism (auth) comming from? Who is supposed 
> to set this value?
> 
> Should it come from userspace? if so I do not see any UAPI facility to 
> do that via VM_START ioctl.
> 

Right, we are only adding the support for unauthenticated VMs for now. 
There would be further UAPI facilities to set the authentication type.

> 
>> +                0, 0, dtb_offset, ghvm->dtb_config.size);
>> +    if (ret) {
>> +        pr_warn("Failed to configure VM: %d\n", ret);
>> +        goto err;
>> +    }
>> +
>> +    ret = gh_rm_vm_init(ghvm->rm, ghvm->vmid);
>> +    if (ret) {
>> +        pr_warn("Failed to initialize VM: %d\n", ret);
>> +        goto err;
>> +    }
>> +
>> +    ret = gh_rm_vm_start(ghvm->rm, ghvm->vmid);
>> +    if (ret) {
>> +        pr_warn("Failed to start VM: %d\n", ret);
>> +        goto err;
>> +    }
>> +
>> +    ghvm->vm_status = GH_RM_VM_STATUS_RUNNING;
>> +    up_write(&ghvm->status_lock);
>> +    return ret;
>> +err:
>> +    ghvm->vm_status = GH_RM_VM_STATUS_INIT_FAILED;
>> +    up_write(&ghvm->status_lock);
> 
> I am really not sure we are doing the right thing in the error path; there
> are multiple cases that seem to be unhandled, or, if handling is not
> required, no comments documenting this are present.
> ex: if vm start fails, then what happens with the memory mappings, or do we
> need to un-configure or un-init the vm from the hypervisor side?
> 
> If none of this is required, it's useful to add some clear comments.
> 

It is required and done in the VM cleanup path. I'll add comment with 
this info.

>> +    return ret;
>> +}
>> +
>> +static int gh_vm_ensure_started(struct gh_vm *ghvm)
>> +{
>> +    int ret;
>> +
>> +retry:
>> +    ret = down_read_interruptible(&ghvm->status_lock);
>> +    if (ret)
>> +        return ret;
>> +
>> +    /* Unlikely because VM is typically started */
>> +    if (unlikely(ghvm->vm_status == GH_RM_VM_STATUS_LOAD)) {
>> +        up_read(&ghvm->status_lock);
>> +        ret = gh_vm_start(ghvm);
>> +        if (ret)
>> +            goto out;
>> +        goto retry;
>> +    }
> 
> A do-while loop would do a better job here w.r.t. readability.
> 

I think do while and my current "goto retry" imply a long loop is 
possible. The "goto retry" or while loop is guaranteed to run only once 
because gh_vm_start will always bring VM out of GH_RM_VM_STATUS_LOAD.

How about this?

-               goto retry;
+               /** gh_vm_start() is guaranteed to bring status out of
+                * GH_RM_VM_STATUS_LOAD, thus an infinitely recursive
+                * call is not possible
+                */
+               return gh_vm_ensure_started(ghvm);



>> +
>> +    /* Unlikely because VM is typically running */
>> +    if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_RUNNING))
>> +        ret = -ENODEV;
>> +
>> +out:
>> +    up_read(&ghvm->status_lock);
>> +    return ret;
>> +}
>> +
>>   static long gh_vm_ioctl(struct file *filp, unsigned int cmd, 
>> unsigned long arg)
>>   {
>>       struct gh_vm *ghvm = filp->private_data;
>> @@ -88,6 +275,22 @@ static long gh_vm_ioctl(struct file *filp, 
>> unsigned int cmd, unsigned long arg)
>>               r = gh_vm_mem_free(ghvm, region.label);
>>           break;
>>       }
>> +    case GH_VM_SET_DTB_CONFIG: {
>> +        struct gh_vm_dtb_config dtb_config;
>> +
>> +        if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
>> +            return -EFAULT;
>> +
>> +        dtb_config.size = PAGE_ALIGN(dtb_config.size);
>> +        ghvm->dtb_config = dtb_config;
>> +
>> +        r = 0;
>> +        break;
>> +    }
>> +    case GH_VM_START: {
>> +        r = gh_vm_ensure_started(ghvm);
>> +        break;
>> +    }
>>       default:
>>           r = -ENOTTY;
>>           break;
>> diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
>> index 97bc00c34878..e9cf56647cc2 100644
>> --- a/drivers/virt/gunyah/vm_mgr.h
>> +++ b/drivers/virt/gunyah/vm_mgr.h
>> @@ -10,6 +10,8 @@
>>   #include <linux/list.h>
>>   #include <linux/miscdevice.h>
>>   #include <linux/mutex.h>
>> +#include <linux/rwsem.h>
>> +#include <linux/wait.h>
>>   #include <uapi/linux/gunyah.h>
>> @@ -33,6 +35,13 @@ struct gh_vm_mem {
>>   struct gh_vm {
>>       u16 vmid;
>>       struct gh_rm *rm;
>> +    enum gh_rm_vm_auth_mechanism auth;
>> +    struct gh_vm_dtb_config dtb_config;
>> +
>> +    struct notifier_block nb;
>> +    enum gh_rm_vm_status vm_status;
>> +    wait_queue_head_t vm_status_wait;
>> +    struct rw_semaphore status_lock;
>>       struct work_struct free_work;
>>       struct mutex mm_lock;
>> @@ -43,5 +52,6 @@ int gh_vm_mem_alloc(struct gh_vm *ghvm, struct 
>> gh_userspace_memory_region *regio
>>   void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
>>   int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
>>   struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
>> +struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, 
>> u32 size);
>>   #endif
>> diff --git a/drivers/virt/gunyah/vm_mgr_mm.c 
>> b/drivers/virt/gunyah/vm_mgr_mm.c
>> index 03e71a36ea3b..128b90da555a 100644
>> --- a/drivers/virt/gunyah/vm_mgr_mm.c
>> +++ b/drivers/virt/gunyah/vm_mgr_mm.c
>> @@ -52,6 +52,29 @@ void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct 
>> gh_vm_mem *mapping)
>>       list_del(&mapping->list);
>>   }
>> +struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, 
>> u32 size)
> The naming is a bit misleading: we already have
> gh_vm_mem_find/__gh_vm_mem_find, which return a mapping based on its label;
> now gh_vm_mem_find_mapping() does the same thing, but with an address.
> 
> Can we rename them clearly
> gh_vm_mem_find_mapping_by_label()
> gh_vm_mem_find_mapping_by_addr()
> 

Done.

- Elliot
  
Srinivas Kandagatla Feb. 23, 2023, 9:21 a.m. UTC | #8
On 23/02/2023 00:50, Elliot Berman wrote:
>>>
>>> +
>>> +    mem_handle = mapping->parcel.mem_handle;
>>> +    dtb_offset = ghvm->dtb_config.gpa - mapping->guest_phys_addr;
>>> +
>>> +    ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, 
>>> mem_handle,
>>
>> where is authentication mechanism (auth) comming from? Who is supposed 
>> to set this value?
>>
>> Should it come from userspace? if so I do not see any UAPI facility to 
>> do that via VM_START ioctl.
>>
> 
> Right, we are only adding the support for unauthenticated VMs for now. 
> There would be further UAPI facilities to set the authentication type.
We have to be careful; please note that you cannot change an existing 
UAPI to accommodate new features.

There are two ways to do this properly:

1. Design the UAPI to accommodate features that will be added very soon 
or in the future. This way the UAPI is stable and does not change 
over time when we add support for those features in the driver.

In this particular case, the vm authentication type is one that needs to 
come from the user, rather than the kernel assuming it, so this definitely 
needs to be properly addressed by passing this info from userspace.
Or rename this IOCTL to something like VM_START_UNAUTH_VM to make this 
more explicit.


2. For each feature, add a new UAPI as and when it's required, which is 
really the only option when we have failed to design the UAPI correctly in 
the first place.

--srini


> 
>>
>>> +                0, 0, dtb_offset, ghvm->dtb_config.size);
>>> +    if (ret) {
  

Patch

diff --git a/drivers/virt/gunyah/vm_mgr.c b/drivers/virt/gunyah/vm_mgr.c
index 84102bac03cc..fa324385ade5 100644
--- a/drivers/virt/gunyah/vm_mgr.c
+++ b/drivers/virt/gunyah/vm_mgr.c
@@ -9,37 +9,114 @@ 
 #include <linux/file.h>
 #include <linux/gunyah_rsc_mgr.h>
 #include <linux/miscdevice.h>
+#include <linux/mm.h>
 #include <linux/module.h>
 
 #include <uapi/linux/gunyah.h>
 
 #include "vm_mgr.h"
 
+static int gh_vm_rm_notification_status(struct gh_vm *ghvm, void *data)
+{
+	struct gh_rm_vm_status_payload *payload = data;
+
+	if (payload->vmid != ghvm->vmid)
+		return NOTIFY_OK;
+
+	/* All other state transitions are synchronous to a corresponding RM call */
+	if (payload->vm_status == GH_RM_VM_STATUS_RESET) {
+		down_write(&ghvm->status_lock);
+		ghvm->vm_status = payload->vm_status;
+		up_write(&ghvm->status_lock);
+		wake_up(&ghvm->vm_status_wait);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int gh_vm_rm_notification_exited(struct gh_vm *ghvm, void *data)
+{
+	struct gh_rm_vm_exited_payload *payload = data;
+
+	if (payload->vmid != ghvm->vmid)
+		return NOTIFY_OK;
+
+	down_write(&ghvm->status_lock);
+	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
+	up_write(&ghvm->status_lock);
+
+	return NOTIFY_DONE;
+}
+
+static int gh_vm_rm_notification(struct notifier_block *nb, unsigned long action, void *data)
+{
+	struct gh_vm *ghvm = container_of(nb, struct gh_vm, nb);
+
+	switch (action) {
+	case GH_RM_NOTIFICATION_VM_STATUS:
+		return gh_vm_rm_notification_status(ghvm, data);
+	case GH_RM_NOTIFICATION_VM_EXITED:
+		return gh_vm_rm_notification_exited(ghvm, data);
+	default:
+		return NOTIFY_OK;
+	}
+}
+
+static void gh_vm_stop(struct gh_vm *ghvm)
+{
+	int ret;
+
+	down_write(&ghvm->status_lock);
+	if (ghvm->vm_status == GH_RM_VM_STATUS_RUNNING) {
+		ret = gh_rm_vm_stop(ghvm->rm, ghvm->vmid);
+		if (ret)
+			pr_warn("Failed to stop VM: %d\n", ret);
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_EXITED;
+	up_write(&ghvm->status_lock);
+}
+
 static void gh_vm_free(struct work_struct *work)
 {
 	struct gh_vm *ghvm = container_of(work, struct gh_vm, free_work);
 	struct gh_vm_mem *mapping, *tmp;
 	int ret;
 
-	mutex_lock(&ghvm->mm_lock);
-	list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
-		gh_vm_mem_reclaim(ghvm, mapping);
-		kfree(mapping);
+	switch (ghvm->vm_status) {
+unknown_state:
+	case GH_RM_VM_STATUS_RUNNING:
+		gh_vm_stop(ghvm);
+		fallthrough;
+	case GH_RM_VM_STATUS_INIT_FAILED:
+	case GH_RM_VM_STATUS_LOAD:
+	case GH_RM_VM_STATUS_LOAD_FAILED:
+		mutex_lock(&ghvm->mm_lock);
+		list_for_each_entry_safe(mapping, tmp, &ghvm->memory_mappings, list) {
+			gh_vm_mem_reclaim(ghvm, mapping);
+			kfree(mapping);
+		}
+		mutex_unlock(&ghvm->mm_lock);
+		fallthrough;
+	case GH_RM_VM_STATUS_NO_STATE:
+		ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
+		if (ret)
+			pr_warn("Failed to deallocate vmid: %d\n", ret);
+
+		gh_rm_notifier_unregister(ghvm->rm, &ghvm->nb);
+		put_gh_rm(ghvm->rm);
+		kfree(ghvm);
+		break;
+	default:
+		pr_err("VM is unknown state: %d, assuming it's running.\n", ghvm->vm_status);
+		goto unknown_state;
 	}
-	mutex_unlock(&ghvm->mm_lock);
-
-	ret = gh_rm_dealloc_vmid(ghvm->rm, ghvm->vmid);
-	if (ret)
-		pr_warn("Failed to deallocate vmid: %d\n", ret);
-
-	put_gh_rm(ghvm->rm);
-	kfree(ghvm);
 }
 
 static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
 {
 	struct gh_vm *ghvm;
-	int vmid;
+	int vmid, ret;
 
 	vmid = gh_rm_alloc_vmid(rm, 0);
 	if (vmid < 0)
@@ -56,13 +133,123 @@  static __must_check struct gh_vm *gh_vm_alloc(struct gh_rm *rm)
 	ghvm->vmid = vmid;
 	ghvm->rm = rm;
 
+	init_waitqueue_head(&ghvm->vm_status_wait);
+	ghvm->nb.notifier_call = gh_vm_rm_notification;
+	ret = gh_rm_notifier_register(rm, &ghvm->nb);
+	if (ret) {
+		put_gh_rm(rm);
+		gh_rm_dealloc_vmid(rm, vmid);
+		kfree(ghvm);
+		return ERR_PTR(ret);
+	}
+
 	mutex_init(&ghvm->mm_lock);
 	INIT_LIST_HEAD(&ghvm->memory_mappings);
+	init_rwsem(&ghvm->status_lock);
 	INIT_WORK(&ghvm->free_work, gh_vm_free);
+	ghvm->vm_status = GH_RM_VM_STATUS_LOAD;
 
 	return ghvm;
 }
 
+static int gh_vm_start(struct gh_vm *ghvm)
+{
+	struct gh_vm_mem *mapping;
+	u64 dtb_offset;
+	u32 mem_handle;
+	int ret;
+
+	down_write(&ghvm->status_lock);
+	if (ghvm->vm_status != GH_RM_VM_STATUS_LOAD) {
+		up_write(&ghvm->status_lock);
+		return 0;
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_RESET;
+
+	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
+		switch (mapping->share_type) {
+		case VM_MEM_LEND:
+			ret = gh_rm_mem_lend(ghvm->rm, &mapping->parcel);
+			break;
+		case VM_MEM_SHARE:
+			ret = gh_rm_mem_share(ghvm->rm, &mapping->parcel);
+			break;
+		}
+		if (ret) {
+			pr_warn("Failed to %s parcel %d: %d\n",
+				mapping->share_type == VM_MEM_LEND ? "lend" : "share",
+				mapping->parcel.label,
+				ret);
+			goto err;
+		}
+	}
+
+	mapping = gh_vm_mem_find_mapping(ghvm, ghvm->dtb_config.gpa, ghvm->dtb_config.size);
+	if (!mapping) {
+		pr_warn("Failed to find the memory_handle for DTB\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	mem_handle = mapping->parcel.mem_handle;
+	dtb_offset = ghvm->dtb_config.gpa - mapping->guest_phys_addr;
+
+	ret = gh_rm_vm_configure(ghvm->rm, ghvm->vmid, ghvm->auth, mem_handle,
+				0, 0, dtb_offset, ghvm->dtb_config.size);
+	if (ret) {
+		pr_warn("Failed to configure VM: %d\n", ret);
+		goto err;
+	}
+
+	ret = gh_rm_vm_init(ghvm->rm, ghvm->vmid);
+	if (ret) {
+		pr_warn("Failed to initialize VM: %d\n", ret);
+		goto err;
+	}
+
+	ret = gh_rm_vm_start(ghvm->rm, ghvm->vmid);
+	if (ret) {
+		pr_warn("Failed to start VM: %d\n", ret);
+		goto err;
+	}
+
+	ghvm->vm_status = GH_RM_VM_STATUS_RUNNING;
+	up_write(&ghvm->status_lock);
+	return ret;
+err:
+	ghvm->vm_status = GH_RM_VM_STATUS_INIT_FAILED;
+	up_write(&ghvm->status_lock);
+	return ret;
+}
+
+static int gh_vm_ensure_started(struct gh_vm *ghvm)
+{
+	int ret;
+
+retry:
+	ret = down_read_interruptible(&ghvm->status_lock);
+	if (ret)
+		return ret;
+
+	/* Unlikely because VM is typically started */
+	if (unlikely(ghvm->vm_status == GH_RM_VM_STATUS_LOAD)) {
+		up_read(&ghvm->status_lock);
+		ret = gh_vm_start(ghvm);
+		if (ret)
+			goto out;
+		goto retry;
+	}
+
+	/* Unlikely because VM is typically running */
+	if (unlikely(ghvm->vm_status != GH_RM_VM_STATUS_RUNNING))
+		ret = -ENODEV;
+
+out:
+	up_read(&ghvm->status_lock);
+	return ret;
+}
+
 static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct gh_vm *ghvm = filp->private_data;
@@ -88,6 +275,22 @@  static long gh_vm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			r = gh_vm_mem_free(ghvm, region.label);
 		break;
 	}
+	case GH_VM_SET_DTB_CONFIG: {
+		struct gh_vm_dtb_config dtb_config;
+
+		if (copy_from_user(&dtb_config, argp, sizeof(dtb_config)))
+			return -EFAULT;
+
+		dtb_config.size = PAGE_ALIGN(dtb_config.size);
+		ghvm->dtb_config = dtb_config;
+
+		r = 0;
+		break;
+	}
+	case GH_VM_START: {
+		r = gh_vm_ensure_started(ghvm);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 		break;
diff --git a/drivers/virt/gunyah/vm_mgr.h b/drivers/virt/gunyah/vm_mgr.h
index 97bc00c34878..e9cf56647cc2 100644
--- a/drivers/virt/gunyah/vm_mgr.h
+++ b/drivers/virt/gunyah/vm_mgr.h
@@ -10,6 +10,8 @@ 
 #include <linux/list.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/wait.h>
 
 #include <uapi/linux/gunyah.h>
 
@@ -33,6 +35,13 @@  struct gh_vm_mem {
 struct gh_vm {
 	u16 vmid;
 	struct gh_rm *rm;
+	enum gh_rm_vm_auth_mechanism auth;
+	struct gh_vm_dtb_config dtb_config;
+
+	struct notifier_block nb;
+	enum gh_rm_vm_status vm_status;
+	wait_queue_head_t vm_status_wait;
+	struct rw_semaphore status_lock;
 
 	struct work_struct free_work;
 	struct mutex mm_lock;
@@ -43,5 +52,6 @@  int gh_vm_mem_alloc(struct gh_vm *ghvm, struct gh_userspace_memory_region *regio
 void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping);
 int gh_vm_mem_free(struct gh_vm *ghvm, u32 label);
 struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label);
+struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, u32 size);
 
 #endif
diff --git a/drivers/virt/gunyah/vm_mgr_mm.c b/drivers/virt/gunyah/vm_mgr_mm.c
index 03e71a36ea3b..128b90da555a 100644
--- a/drivers/virt/gunyah/vm_mgr_mm.c
+++ b/drivers/virt/gunyah/vm_mgr_mm.c
@@ -52,6 +52,29 @@  void gh_vm_mem_reclaim(struct gh_vm *ghvm, struct gh_vm_mem *mapping)
 	list_del(&mapping->list);
 }
 
+struct gh_vm_mem *gh_vm_mem_find_mapping(struct gh_vm *ghvm, u64 gpa, u32 size)
+{
+	struct gh_vm_mem *mapping = NULL;
+	int ret;
+
+	ret = mutex_lock_interruptible(&ghvm->mm_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	list_for_each_entry(mapping, &ghvm->memory_mappings, list) {
+		if (gpa >= mapping->guest_phys_addr &&
+			(gpa + size <= mapping->guest_phys_addr +
+			(mapping->npages << PAGE_SHIFT))) {
+			goto unlock;
+		}
+	}
+
+	mapping = NULL;
+unlock:
+	mutex_unlock(&ghvm->mm_lock);
+	return mapping;
+}
+
 struct gh_vm_mem *gh_vm_mem_find(struct gh_vm *ghvm, u32 label)
 {
 	struct gh_vm_mem *mapping;
diff --git a/include/linux/gunyah_rsc_mgr.h b/include/linux/gunyah_rsc_mgr.h
index 2d8b8b6cc394..9cffee6f9b4e 100644
--- a/include/linux/gunyah_rsc_mgr.h
+++ b/include/linux/gunyah_rsc_mgr.h
@@ -32,6 +32,12 @@  struct gh_rm_vm_exited_payload {
 #define GH_RM_NOTIFICATION_VM_EXITED		 0x56100001
 
 enum gh_rm_vm_status {
+	/**
+	 * RM doesn't have a state where the load partially failed;
+	 * this status is used only by Linux.
+	 */
+	GH_RM_VM_STATUS_LOAD_FAILED	= -1,
+
 	GH_RM_VM_STATUS_NO_STATE	= 0,
 	GH_RM_VM_STATUS_INIT		= 1,
 	GH_RM_VM_STATUS_READY		= 2,
diff --git a/include/uapi/linux/gunyah.h b/include/uapi/linux/gunyah.h
index d85d12119a48..d899bba6a4c6 100644
--- a/include/uapi/linux/gunyah.h
+++ b/include/uapi/linux/gunyah.h
@@ -53,4 +53,17 @@  struct gh_userspace_memory_region {
 #define GH_VM_SET_USER_MEM_REGION	_IOW(GH_IOCTL_TYPE, 0x1, \
 						struct gh_userspace_memory_region)
 
+/**
+ * struct gh_vm_dtb_config - Set the location of the VM's devicetree blob
+ * @gpa: Address of the VM's devicetree in guest memory.
+ * @size: Maximum size of the devicetree.
+ */
+struct gh_vm_dtb_config {
+	__u64 gpa;
+	__u64 size;
+};
+#define GH_VM_SET_DTB_CONFIG	_IOW(GH_IOCTL_TYPE, 0x2, struct gh_vm_dtb_config)
+
+#define GH_VM_START		_IO(GH_IOCTL_TYPE, 0x3)
+
 #endif