[v4,03/11] virtio-vdpa: Support interrupt affinity spreading mechanism

Message ID 20230323053043.35-4-xieyongji@bytedance.com
State New
Series VDUSE: Improve performance

Commit Message

Yongji Xie March 23, 2023, 5:30 a.m. UTC
To support the interrupt affinity spreading mechanism,
this patch makes use of group_cpus_evenly() to create
an IRQ callback affinity mask for each virtqueue of the
vdpa device. Then the set_vq_affinity callback is used
to pass the affinity to the vdpa device driver.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 drivers/virtio/virtio_vdpa.c | 68 ++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
  

Comments

Jason Wang March 24, 2023, 6:27 a.m. UTC | #1
On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> To support the interrupt affinity spreading mechanism,
> this patch makes use of group_cpus_evenly() to create
> an IRQ callback affinity mask for each virtqueue of the
> vdpa device. Then the set_vq_affinity callback is used
> to pass the affinity to the vdpa device driver.
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>

Thinking hard about all the logic, I think I've found something interesting.

Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
pass irq_affinity to the transport-specific find_vqs(). This seems to be
a layering violation since the driver has no knowledge of

1) whether or not the callback is based on an IRQ
2) whether or not the device is a PCI device (the details are hidden
by the transport driver)
3) how many vectors could be used by a device

This means the driver can't actually pass real affinity masks, so the
commit in fact passes a zeroed irq_affinity structure as a hint, letting
the PCI layer build a default affinity that groups CPUs evenly based on
the number of MSI-X vectors (the core logic is group_cpus_evenly()).
I think we should fix this by replacing the irq_affinity structure with

1) a boolean like auto_cb_spreading

or

2) a queue-to-CPU mapping

So each transport can do its own logic based on that. Then virtio-vDPA
can pass that policy to VDUSE, where we only need group_cpus_evenly()
and avoid duplicating irq_create_affinity_masks()?
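
For illustration, the two options might look roughly like the sketch
below; all names here (auto_cb_spreading, vq_cpu_map) are hypothetical,
invented for this sketch rather than taken from any posted patch:

        /* 1) A per-vq boolean: the transport spreads the callback
         *    affinity automatically for every vq whose entry is true.
         */
        int (*find_vqs)(struct virtio_device *vdev, unsigned int nvqs,
                        struct virtqueue *vqs[], vq_callback_t *callbacks[],
                        const char * const names[],
                        const bool *auto_cb_spreading);

        /* 2) An explicit queue-to-CPU mapping decided by the driver;
         *    a NULL entry could mean "no preference".
         */
        int (*find_vqs)(struct virtio_device *vdev, unsigned int nvqs,
                        struct virtqueue *vqs[], vq_callback_t *callbacks[],
                        const char * const names[],
                        const struct cpumask *vq_cpu_map[]);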

Thanks

> [...]
  
Michael S. Tsirkin March 24, 2023, 9:12 a.m. UTC | #2
On Fri, Mar 24, 2023 at 02:27:52PM +0800, Jason Wang wrote:
> On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> >
> > [...]
> 
> Thinking hard about all the logic, I think I've found something interesting.
>
> Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
> pass irq_affinity to the transport-specific find_vqs(). This seems to be
> a layering violation since the driver has no knowledge of
>
> 1) whether or not the callback is based on an IRQ
> 2) whether or not the device is a PCI device (the details are hidden
> by the transport driver)
> 3) how many vectors could be used by a device
>
> This means the driver can't actually pass real affinity masks, so the
> commit in fact passes a zeroed irq_affinity structure as a hint, letting
> the PCI layer build a default affinity that groups CPUs evenly based on
> the number of MSI-X vectors (the core logic is group_cpus_evenly()).
> I think we should fix this by replacing the irq_affinity structure with
>
> 1) a boolean like auto_cb_spreading
>
> or
>
> 2) a queue-to-CPU mapping
>
> So each transport can do its own logic based on that. Then virtio-vDPA
> can pass that policy to VDUSE, where we only need group_cpus_evenly()
> and avoid duplicating irq_create_affinity_masks()?
> 
> Thanks

I don't really understand what you propose. Care to post a patch?
Also, does it have to block this patchset, or can it be done on top?

> > [...]
  
Yongji Xie March 28, 2023, 3:03 a.m. UTC | #3
On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> >
> > [...]
>
> Thinking hard about all the logic, I think I've found something interesting.
>
> Commit ad71473d9c437 ("virtio_blk: use virtio IRQ affinity") tries to
> pass irq_affinity to the transport-specific find_vqs(). This seems to be
> a layering violation since the driver has no knowledge of
>
> 1) whether or not the callback is based on an IRQ
> 2) whether or not the device is a PCI device (the details are hidden
> by the transport driver)
> 3) how many vectors could be used by a device
>
> This means the driver can't actually pass real affinity masks, so the
> commit in fact passes a zeroed irq_affinity structure as a hint, letting
> the PCI layer build a default affinity that groups CPUs evenly based on
> the number of MSI-X vectors (the core logic is group_cpus_evenly()).
> I think we should fix this by replacing the irq_affinity structure with
>
> 1) a boolean like auto_cb_spreading
>
> or
>
> 2) a queue-to-CPU mapping
>

But only the driver knows which queues are used in the control path
and thus don't need the automatic IRQ affinity assignment. So I think the
irq_affinity structure can only be created by device drivers and
passed to the virtio-pci/virtio-vdpa driver.

> So each transport can do its own logic based on that. Then virtio-vDPA
> can pass that policy to VDUSE, where we only need group_cpus_evenly()
> and avoid duplicating irq_create_affinity_masks()?
>

I don't get why we would have duplicated irq_create_affinity_masks().

Thanks,
Yongji
  
Jason Wang March 28, 2023, 3:14 a.m. UTC | #4
On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > [...]
> > [...]
>
> But only the driver knows which queues are used in the control path
> and thus don't need the automatic IRQ affinity assignment.

Is this knowledge provided by the transport driver now?

E.g. virtio-blk uses:

        struct irq_affinity desc = { 0, };

At least we can tell the transport driver which vq requires automatic
irq affinity.

> So I think the
> irq_affinity structure can only be created by device drivers and
> passed to the virtio-pci/virtio-vdpa driver.

This may not be easy since the driver doesn't even know how many
interrupts will be used by the transport driver, so it can't build the
actual affinity structure.

>
> > [...]
> >
>
> I don't get why we would have duplicated irq_create_affinity_masks().

I meant that the create_affinity_masks() in patch 3 seems to be a
duplication of irq_create_affinity_masks().

Thanks

>
> Thanks,
> Yongji
>
  
Yongji Xie March 28, 2023, 3:33 a.m. UTC | #5
On Tue, Mar 28, 2023 at 11:14 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> >
> > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > >
> > > > [...]
> > > [...]
> >
> > But only the driver knows which queues are used in the control path
> > and thus don't need the automatic IRQ affinity assignment.
>
> Is this knowledge provided by the transport driver now?
>

This knowledge is provided by the device driver rather than the transport driver.

E.g. virtio-scsi uses:

    struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control queue, vq1 is event queue
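
As a worked illustration of how this hint flows through
create_affinity_masks() in this patch (assuming a flat 8-CPU system,
nvqs = 6 and no post_vectors; the exact grouping depends on the CPU
topology seen by group_cpus_evenly()):

        /*
         * nvecs = 6, pre_vectors = 2  =>  affvecs = 4
         *
         * masks[0] = cpus 0-7   control vq: cpumask_setall(), no spreading
         * masks[1] = cpus 0-7   event vq:   cpumask_setall(), no spreading
         * masks[2] = cpus 0-1   \
         * masks[3] = cpus 2-3    > group_cpus_evenly(4) spreads the four
         * masks[4] = cpus 4-5    > request vqs evenly across the 8 CPUs
         * masks[5] = cpus 6-7   /
         */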

> E.g. virtio-blk uses:
>
>         struct irq_affinity desc = { 0, };
>
> At least we can tell the transport driver which vq requires automatic
> irq affinity.
>

I think that is what the current implementation does.

> > So I think the
> > irq_affinity structure can only be created by device drivers and
> > passed to the virtio-pci/virtio-vdpa driver.
>
> This may not be easy since the driver doesn't even know how many
> interrupts will be used by the transport driver, so it can't build the
> actual affinity structure.
>

The actual affinity mask is built by the transport driver; the device
driver only passes a hint about which queues don't need the automatic
IRQ affinity assignment.

Thanks,
Yongji
  
Jason Wang March 28, 2023, 3:44 a.m. UTC | #6
On Tue, Mar 28, 2023 at 11:33 AM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Tue, Mar 28, 2023 at 11:14 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > [...]
> > > > [...]
> > >
> > > But only the driver knows which queues are used in the control path
> > > and thus don't need the automatic IRQ affinity assignment.
> >
> > Is this knowledge provided by the transport driver now?
> >
>
> This knowledge is provided by the device driver rather than the transport driver.
>
> E.g. virtio-scsi uses:
>
>     struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control queue, vq1 is event queue

Ok, but it only works as a hint; it's not a real affinity. As replied,
we can pass an array of booleans in this case, then the transport driver
knows it doesn't need to use automatic affinity for the first two
queues.

>
> > E.g. virtio-blk uses:
> >
> >         struct irq_affinity desc = { 0, };
> >
> > At least we can tell the transport driver which vq requires automatic
> > irq affinity.
> >
>
> I think that is what the current implementation does.
>
> > > So I think the
> > > irq_affinity structure can only be created by device drivers and
> > > passed to the virtio-pci/virtio-vdpa driver.
> >
> > This may not be easy since the driver doesn't even know how many
> > interrupts will be used by the transport driver, so it can't build the
> > actual affinity structure.
> >
>
> The actual affinity mask is built by the transport driver;

For PCI, yes, it talks directly to the IRQ subsystem.

> the device
> driver only passes a hint about which queues don't need the automatic
> IRQ affinity assignment.

But not for virtio-vDPA, since the IRQ needs to be dealt with by the
parent driver. In our case, it's VDUSE, which doesn't need an IRQ
at all; a queue-to-CPU mapping is sufficient.

Thanks

>
> Thanks,
> Yongji
>
  
Yongji Xie March 28, 2023, 4:04 a.m. UTC | #7
On Tue, Mar 28, 2023 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Tue, Mar 28, 2023 at 11:33 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> >
> > On Tue, Mar 28, 2023 at 11:14 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > >
> > > > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > > >
> > > > > > [...]
> > > > > [...]
> > > >
> > > > But only the driver knows which queues are used in the control path
> > > > and thus don't need the automatic IRQ affinity assignment.
> > >
> > > Is this knowledge provided by the transport driver now?
> > >
> >
> > This knowledge is provided by the device driver rather than the transport driver.
> >
> > E.g. virtio-scsi uses:
> >
> >     struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control queue, vq1 is event queue
>
> Ok, but it only works as a hint; it's not a real affinity. As replied,
> we can pass an array of booleans in this case, then the transport driver
> knows it doesn't need to use automatic affinity for the first two
> queues.
>

But we don't know whether we will use other fields of the irq_affinity
structure in the future. So passing the full structure should be better?

> >
> > > E.g. virtio-blk uses:
> > >
> > >         struct irq_affinity desc = { 0, };
> > >
> > > At least we can tell the transport driver which vq requires automatic
> > > irq affinity.
> > >
> >
> > I think that is what the current implementation does.
> >
> > > > So I think the
> > > > irq_affinity structure can only be created by device drivers and
> > > > passed to the virtio-pci/virtio-vdpa driver.
> > >
> > > This may not be easy since the driver doesn't even know how many
> > > interrupts will be used by the transport driver, so it can't build the
> > > actual affinity structure.
> > >
> >
> > The actual affinity mask is built by the transport driver;
>
> For PCI, yes, it talks directly to the IRQ subsystem.
>
> > the device
> > driver only passes a hint about which queues don't need the automatic
> > IRQ affinity assignment.
>
> But not for virtio-vDPA, since the IRQ needs to be dealt with by the
> parent driver. In our case, it's VDUSE, which doesn't need an IRQ
> at all; a queue-to-CPU mapping is sufficient.
>

The device driver doesn't know whether it is bound to virtio-pci or
virtio-vdpa. So it should pass the full set needed by the automatic IRQ
affinity assignment instead of a subset. Then virtio-vdpa can choose
to pass a queue-to-CPU mapping to VDUSE, which is what we do now (via
set_vq_affinity()).
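
For example, the parent driver's side of this can be as simple as
recording the mask that virtio-vdpa computed. A minimal sketch, where
my_vdpa_dev and vdpa_to_my_dev() are hypothetical names (the callback
signature follows the ops->set_vq_affinity() call in this patch):

        static int my_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
                                           const struct cpumask *cpu_mask)
        {
                struct my_vdpa_dev *dev = vdpa_to_my_dev(vdpa);

                /* Remember which CPUs should service this vq's callback. */
                cpumask_copy(&dev->vq_affinity[idx], cpu_mask);
                return 0;
        }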

Thanks,
Yongji
  
Jason Wang March 28, 2023, 6:07 a.m. UTC | #8
On Tue, Mar 28, 2023 at 12:05 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Tue, Mar 28, 2023 at 11:44 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Mar 28, 2023 at 11:33 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Tue, Mar 28, 2023 at 11:14 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Tue, Mar 28, 2023 at 11:03 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > On Fri, Mar 24, 2023 at 2:28 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > > > >
> > > > > > > [...]
> > > > > > [...]
> > > > >
> > > > > But only the driver knows which queues are used in the control path
> > > > > and thus don't need the automatic IRQ affinity assignment.
> > > >
> > > > Is this knowledge provided by the transport driver now?
> > > >
> > >
> > > This knowledge is provided by the device driver rather than the transport driver.
> > >
> > > E.g. virtio-scsi uses:
> > >
> > >     struct irq_affinity desc = { .pre_vectors = 2 }; // vq0 is control queue, vq1 is event queue
> >
> > Ok, but it only works as a hint; it's not a real affinity. As replied,
> > we can pass an array of booleans in this case, then the transport driver
> > knows it doesn't need to use automatic affinity for the first two
> > queues.
> >
>
> But we don't know whether we will use other fields of the irq_affinity
> structure in the future. So passing the full structure should be better?

Good point. So the issue is calc_sets(), and we would probably need
that if there were a virtio driver that needs more than one set of
vectors to be spread. Technically, we could have a virtio-level
abstraction for this, but I agree it's probably not worth bothering
with now.
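
For reference, calc_sets() is the hook such a driver would use to ask
for more than one independently spread set (a sketch with hypothetical
names; this is the mechanism NVMe uses to split its read/write queue
vectors, and no virtio driver needs it today):

        static void my_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
        {
                /* Split the spreadable vectors into two sets that are
                 * each spread evenly across all CPUs, e.g. rx vs. tx.
                 */
                affd->nr_sets = 2;
                affd->set_size[0] = affvecs / 2;
                affd->set_size[1] = affvecs - affvecs / 2;
        }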

>
> > >
> > > > E.g. virtio-blk uses:
> > > >
> > > >         struct irq_affinity desc = { 0, };
> > > >
> > > > At least we can tell the transport driver which vq requires automatic
> > > > irq affinity.
> > > >
> > >
> > > I think that is what the current implementation does.
> > >
> > > > > So I think the
> > > > > irq_affinity structure can only be created by device drivers and
> > > > > passed to the virtio-pci/virtio-vdpa driver.
> > > >
> > > > This may not be easy since the driver doesn't even know how many
> > > > interrupts will be used by the transport driver, so it can't build the
> > > > actual affinity structure.
> > > >
> > >
> > > The actual affinity mask is built by the transport driver;
> >
> > For PCI, yes, it talks directly to the IRQ subsystem.
> >
> > > the device
> > > driver only passes a hint about which queues don't need the automatic
> > > IRQ affinity assignment.
> >
> > But not for virtio-vDPA, since the IRQ needs to be dealt with by the
> > parent driver. In our case, it's VDUSE, which doesn't need an IRQ
> > at all; a queue-to-CPU mapping is sufficient.
> >
>
> The device driver doesn't know whether it is bound to virtio-pci or
> virtio-vdpa. So it should pass the full set needed by the automatic IRQ
> affinity assignment instead of a subset. Then virtio-vdpa can choose
> to pass a queue-to-CPU mapping to VDUSE, which is what we do now (via
> set_vq_affinity()).

Yes, so there are basically two ways:

1) automatic IRQ management: pass affd to find_vqs(), and the affinity
is determined by the transport (e.g. vDPA);
2) affinity under the control of the driver: it needs to use
set_vq_affinity() but has to deal with CPU hotplug itself.
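
From the device driver's point of view, the two ways boil down to
something like this (a sketch only; error handling omitted):

        /* 1) Automatic: pass a hint and let the transport spread the
         *    callbacks (this is what virtio-scsi does today).
         */
        struct irq_affinity desc = { .pre_vectors = 2 };

        err = virtio_find_vqs(vdev, nvqs, vqs, callbacks, names, &desc);

        /* 2) Manual: the driver picks the mapping itself and must redo
         *    it on CPU hotplug (e.g. from cpuhp callbacks).
         */
        virtqueue_set_affinity(vqs[i], cpumask_of(cpu));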

Thanks

>
> Thanks,
> Yongji
>
  
Jason Wang March 28, 2023, 6:12 a.m. UTC | #9
On 2023/3/24 17:12, Michael S. Tsirkin wrote:
> On Fri, Mar 24, 2023 at 02:27:52PM +0800, Jason Wang wrote:
>> On Thu, Mar 23, 2023 at 1:31 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>>> [...]
>> [...]
> I don't really understand what you propose. Care to post a patch?


I meant to avoid passing the irq_affinity structure to find_vqs() and
instead pass an array of booleans telling us whether or not each vq
requires automatic spreading of callbacks. But that seems less flexible.


> Also, does it have to block this patchset, or can it be done on top?


We can leave it for the future.

So

Acked-by: Jason Wang <jasowang@redhat.com>

Thanks


>
>>> [...]
  

Patch

diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index f72696b4c1c2..f3826f42b704 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -13,6 +13,7 @@ 
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/uuid.h>
+#include <linux/group_cpus.h>
 #include <linux/virtio.h>
 #include <linux/vdpa.h>
 #include <linux/virtio_config.h>
@@ -272,6 +273,66 @@  static void virtio_vdpa_del_vqs(struct virtio_device *vdev)
 		virtio_vdpa_del_vq(vq);
 }
 
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+	affd->nr_sets = 1;
+	affd->set_size[0] = affvecs;
+}
+
+static struct cpumask *
+create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+{
+	unsigned int affvecs = 0, curvec, usedvecs, i;
+	struct cpumask *masks = NULL;
+
+	if (nvecs > affd->pre_vectors + affd->post_vectors)
+		affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+
+	if (!affd->calc_sets)
+		affd->calc_sets = default_calc_sets;
+
+	affd->calc_sets(affd, affvecs);
+
+	if (!affvecs)
+		return NULL;
+
+	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		return NULL;
+
+	/* Fill out vectors at the beginning that don't need affinity */
+	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
+		cpumask_setall(&masks[curvec]);
+
+	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+		unsigned int this_vecs = affd->set_size[i];
+		int j;
+		struct cpumask *result = group_cpus_evenly(this_vecs);
+
+		if (!result) {
+			kfree(masks);
+			return NULL;
+		}
+
+		for (j = 0; j < this_vecs; j++)
+			cpumask_copy(&masks[curvec + j], &result[j]);
+		kfree(result);
+
+		curvec += this_vecs;
+		usedvecs += this_vecs;
+	}
+
+	/* Fill out vectors at the end that don't need affinity */
+	if (usedvecs >= affvecs)
+		curvec = affd->pre_vectors + affvecs;
+	else
+		curvec = affd->pre_vectors + usedvecs;
+	for (; curvec < nvecs; curvec++)
+		cpumask_setall(&masks[curvec]);
+
+	return masks;
+}
+
 static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				struct virtqueue *vqs[],
 				vq_callback_t *callbacks[],
@@ -282,9 +343,15 @@  static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 	struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
 	struct vdpa_device *vdpa = vd_get_vdpa(vdev);
 	const struct vdpa_config_ops *ops = vdpa->config;
+	struct irq_affinity default_affd = { 0 };
+	struct cpumask *masks;
 	struct vdpa_callback cb;
 	int i, err, queue_idx = 0;
 
+	masks = create_affinity_masks(nvqs, desc ? desc : &default_affd);
+	if (!masks)
+		return -ENOMEM;
+
 	for (i = 0; i < nvqs; ++i) {
 		if (!names[i]) {
 			vqs[i] = NULL;
@@ -298,6 +365,7 @@  static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 			err = PTR_ERR(vqs[i]);
 			goto err_setup_vq;
 		}
+		ops->set_vq_affinity(vdpa, i, &masks[i]);
 	}
 
 	cb.callback = virtio_vdpa_config_cb;