[v3] vdpa/mlx5: should not activate virtq object when suspended

Message ID 1676424640-11673-1-git-send-email-si-wei.liu@oracle.com
State New
Headers
Series [v3] vdpa/mlx5: should not activate virtq object when suspended |

Commit Message

Si-Wei Liu Feb. 15, 2023, 1:30 a.m. UTC
  Otherwise the virtqueue object to instate could point to an invalid address
that was unmapped from the MTT:

  mlx5_core 0000:41:04.2: mlx5_cmd_out_err:782:(pid 8321):
  CREATE_GENERAL_OBJECT(0xa00) op_mod(0xd) failed, status
  bad parameter(0x3), syndrome (0x5fa1c), err(-22)

Fixes: cae15c2ed8e6 ("vdpa/mlx5: Implement susupend virtqueue callback")
Cc: Eli Cohen <elic@nvidia.com>
Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>

---
v3: move suspended to struct mlx5_vdpa_dev
v2: removed the change for improving warning message
---
 drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)
  

Comments

Jason Wang Feb. 16, 2023, 4:48 a.m. UTC | #1
On Wed, Feb 15, 2023 at 9:31 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
> Otherwise the virtqueue object to instate could point to invalid address
> that was unmapped from the MTT:
>
>   mlx5_core 0000:41:04.2: mlx5_cmd_out_err:782:(pid 8321):
>   CREATE_GENERAL_OBJECT(0xa00) op_mod(0xd) failed, status
>   bad parameter(0x3), syndrome (0x5fa1c), err(-22)
>
> Fixes: cae15c2ed8e6 ("vdpa/mlx5: Implement susupend virtqueue callback")
> Cc: Eli Cohen <elic@nvidia.com>
> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> Reviewed-by: Eli Cohen <elic@nvidia.com>
>
> ---
> v3: move suspended to struct mlx5_vdpa_dev
> v2: removed the change for improving warning message
> ---
>  drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
>  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 6 +++++-
>  2 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> index 058fbe2..25fc412 100644
> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> @@ -96,6 +96,7 @@ struct mlx5_vdpa_dev {
>         struct mlx5_control_vq cvq;
>         struct workqueue_struct *wq;
>         unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
> +       bool suspended;
>  };
>
>  int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> index 3a6dbbc6..daac3ab 100644
> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> @@ -2411,7 +2411,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
>         if (err)
>                 goto err_mr;
>
> -       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
> +       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)

One more thought,

Does this mean set_map() is forbidden during suspending? I'm not sure
this is correct, or at least we need to restrict it in the vDPA core.

Thanks

>                 goto err_mr;
>
>         restore_channels_info(ndev);
> @@ -2579,6 +2579,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev)
>         clear_vqs_ready(ndev);
>         mlx5_vdpa_destroy_mr(&ndev->mvdev);
>         ndev->mvdev.status = 0;
> +       ndev->mvdev.suspended = false;
>         ndev->cur_num_vqs = 0;
>         ndev->mvdev.cvq.received_desc = 0;
>         ndev->mvdev.cvq.completed_desc = 0;
> @@ -2815,6 +2816,8 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
>         struct mlx5_vdpa_virtqueue *mvq;
>         int i;
>
> +       mlx5_vdpa_info(mvdev, "suspending device\n");
> +
>         down_write(&ndev->reslock);
>         ndev->nb_registered = false;
>         mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
> @@ -2824,6 +2827,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
>                 suspend_vq(ndev, mvq);
>         }
>         mlx5_vdpa_cvq_suspend(mvdev);
> +       mvdev->suspended = true;
>         up_write(&ndev->reslock);
>         return 0;
>  }
> --
> 1.8.3.1
>
  
Si-Wei Liu Feb. 16, 2023, 5:03 a.m. UTC | #2
On 2/15/2023 8:48 PM, Jason Wang wrote:
> On Wed, Feb 15, 2023 at 9:31 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>> Otherwise the virtqueue object to instate could point to invalid address
>> that was unmapped from the MTT:
>>
>>    mlx5_core 0000:41:04.2: mlx5_cmd_out_err:782:(pid 8321):
>>    CREATE_GENERAL_OBJECT(0xa00) op_mod(0xd) failed, status
>>    bad parameter(0x3), syndrome (0x5fa1c), err(-22)
>>
>> Fixes: cae15c2ed8e6 ("vdpa/mlx5: Implement susupend virtqueue callback")
>> Cc: Eli Cohen <elic@nvidia.com>
>> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
>> Reviewed-by: Eli Cohen <elic@nvidia.com>
>>
>> ---
>> v3: move suspended to struct mlx5_vdpa_dev
>> v2: removed the change for improving warning message
>> ---
>>   drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
>>   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 6 +++++-
>>   2 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> index 058fbe2..25fc412 100644
>> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>> @@ -96,6 +96,7 @@ struct mlx5_vdpa_dev {
>>          struct mlx5_control_vq cvq;
>>          struct workqueue_struct *wq;
>>          unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
>> +       bool suspended;
>>   };
>>
>>   int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
>> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> index 3a6dbbc6..daac3ab 100644
>> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
>> @@ -2411,7 +2411,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
>>          if (err)
>>                  goto err_mr;
>>
>> -       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
>> +       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
> One more thought,
>
> Does this mean set_map() is forbidden during suspending?
No, it will not. Instead, it now allows set_map() to proceed even if the
mapping is shrinking while the device is suspended. The "goto err_mr" below
actually returns 0 when it leaves mlx5_vdpa_change_map().

-Siwei

>   I'm not sure
> this is correct or at least we need restrict in in the vDPA core.
>
> Thanks
>
>>                  goto err_mr;
>>
>>          restore_channels_info(ndev);
>> @@ -2579,6 +2579,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev)
>>          clear_vqs_ready(ndev);
>>          mlx5_vdpa_destroy_mr(&ndev->mvdev);
>>          ndev->mvdev.status = 0;
>> +       ndev->mvdev.suspended = false;
>>          ndev->cur_num_vqs = 0;
>>          ndev->mvdev.cvq.received_desc = 0;
>>          ndev->mvdev.cvq.completed_desc = 0;
>> @@ -2815,6 +2816,8 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
>>          struct mlx5_vdpa_virtqueue *mvq;
>>          int i;
>>
>> +       mlx5_vdpa_info(mvdev, "suspending device\n");
>> +
>>          down_write(&ndev->reslock);
>>          ndev->nb_registered = false;
>>          mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
>> @@ -2824,6 +2827,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
>>                  suspend_vq(ndev, mvq);
>>          }
>>          mlx5_vdpa_cvq_suspend(mvdev);
>> +       mvdev->suspended = true;
>>          up_write(&ndev->reslock);
>>          return 0;
>>   }
>> --
>> 1.8.3.1
>>
  
Jason Wang Feb. 16, 2023, 5:10 a.m. UTC | #3
On Thu, Feb 16, 2023 at 1:03 PM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
>
>
>
> On 2/15/2023 8:48 PM, Jason Wang wrote:
> > On Wed, Feb 15, 2023 at 9:31 AM Si-Wei Liu <si-wei.liu@oracle.com> wrote:
> >> Otherwise the virtqueue object to instate could point to invalid address
> >> that was unmapped from the MTT:
> >>
> >>    mlx5_core 0000:41:04.2: mlx5_cmd_out_err:782:(pid 8321):
> >>    CREATE_GENERAL_OBJECT(0xa00) op_mod(0xd) failed, status
> >>    bad parameter(0x3), syndrome (0x5fa1c), err(-22)
> >>
> >> Fixes: cae15c2ed8e6 ("vdpa/mlx5: Implement susupend virtqueue callback")
> >> Cc: Eli Cohen <elic@nvidia.com>
> >> Signed-off-by: Si-Wei Liu <si-wei.liu@oracle.com>
> >> Reviewed-by: Eli Cohen <elic@nvidia.com>
> >>
> >> ---
> >> v3: move suspended to struct mlx5_vdpa_dev
> >> v2: removed the change for improving warning message
> >> ---
> >>   drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
> >>   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 6 +++++-
> >>   2 files changed, 6 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> >> index 058fbe2..25fc412 100644
> >> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> >> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
> >> @@ -96,6 +96,7 @@ struct mlx5_vdpa_dev {
> >>          struct mlx5_control_vq cvq;
> >>          struct workqueue_struct *wq;
> >>          unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
> >> +       bool suspended;
> >>   };
> >>
> >>   int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
> >> diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >> index 3a6dbbc6..daac3ab 100644
> >> --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >> +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
> >> @@ -2411,7 +2411,7 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
> >>          if (err)
> >>                  goto err_mr;
> >>
> >> -       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
> >> +       if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
> > One more thought,
> >
> > Does this mean set_map() is forbidden during suspending?
> No, it will not. Instead it now allows set_map() to proceed even if
> mapping is shrinking while device is suspended. The "goto err_mr" below
> actually returns 0 when it leaves mlx5_vdpa_change_map().

You're right.

So

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks

>
> -Siwei
>
> >   I'm not sure
> > this is correct or at least we need restrict in in the vDPA core.
> >
> > Thanks
> >
> >>                  goto err_mr;
> >>
> >>          restore_channels_info(ndev);
> >> @@ -2579,6 +2579,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev)
> >>          clear_vqs_ready(ndev);
> >>          mlx5_vdpa_destroy_mr(&ndev->mvdev);
> >>          ndev->mvdev.status = 0;
> >> +       ndev->mvdev.suspended = false;
> >>          ndev->cur_num_vqs = 0;
> >>          ndev->mvdev.cvq.received_desc = 0;
> >>          ndev->mvdev.cvq.completed_desc = 0;
> >> @@ -2815,6 +2816,8 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
> >>          struct mlx5_vdpa_virtqueue *mvq;
> >>          int i;
> >>
> >> +       mlx5_vdpa_info(mvdev, "suspending device\n");
> >> +
> >>          down_write(&ndev->reslock);
> >>          ndev->nb_registered = false;
> >>          mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
> >> @@ -2824,6 +2827,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
> >>                  suspend_vq(ndev, mvq);
> >>          }
> >>          mlx5_vdpa_cvq_suspend(mvdev);
> >> +       mvdev->suspended = true;
> >>          up_write(&ndev->reslock);
> >>          return 0;
> >>   }
> >> --
> >> 1.8.3.1
> >>
>
  

Patch

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 058fbe2..25fc412 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -96,6 +96,7 @@  struct mlx5_vdpa_dev {
 	struct mlx5_control_vq cvq;
 	struct workqueue_struct *wq;
 	unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
+	bool suspended;
 };
 
 int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 3a6dbbc6..daac3ab 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2411,7 +2411,7 @@  static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
 	if (err)
 		goto err_mr;
 
-	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
 		goto err_mr;
 
 	restore_channels_info(ndev);
@@ -2579,6 +2579,7 @@  static int mlx5_vdpa_reset(struct vdpa_device *vdev)
 	clear_vqs_ready(ndev);
 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
 	ndev->mvdev.status = 0;
+	ndev->mvdev.suspended = false;
 	ndev->cur_num_vqs = 0;
 	ndev->mvdev.cvq.received_desc = 0;
 	ndev->mvdev.cvq.completed_desc = 0;
@@ -2815,6 +2816,8 @@  static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 	struct mlx5_vdpa_virtqueue *mvq;
 	int i;
 
+	mlx5_vdpa_info(mvdev, "suspending device\n");
+
 	down_write(&ndev->reslock);
 	ndev->nb_registered = false;
 	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
@@ -2824,6 +2827,7 @@  static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
 		suspend_vq(ndev, mvq);
 	}
 	mlx5_vdpa_cvq_suspend(mvdev);
+	mvdev->suspended = true;
 	up_write(&ndev->reslock);
 	return 0;
 }