[v2,06/11] vduse: Support automatic irq callback affinity

Message ID 20221205085846.741-1-xieyongji@bytedance.com
State New
Series VDUSE: Improve performance

Commit Message

Yongji Xie Dec. 5, 2022, 8:58 a.m. UTC
  This brings the current interrupt affinity spreading mechanism
to the vduse device. We make use of irq_create_affinity_masks()
to create an irq callback affinity mask for each virtqueue of
the vduse device. Then we choose the CPU with the lowest number
of allocated interrupts in the affinity mask to run the irq
callback.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
  

Comments

Jason Wang Dec. 16, 2022, 5:30 a.m. UTC | #1
On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> This brings current interrupt affinity spreading mechanism
> to vduse device. We will make use of irq_create_affinity_masks()
> to create an irq callback affinity mask for each virtqueue of
> vduse device. Then we will choose the CPU which has the lowest
> number of interrupt allocated in the affinity mask to run the
> irq callback.

This seems to be a balancing mechanism, but that might not match
the semantics of affinity. Is there any reason we need to do this?
I guess we should at least use round-robin in this case.
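
For reference, a minimal round-robin pick over the callback affinity
mask could look like the sketch below; it reuses the field names from
the patch but is illustrative only, not code from this series:

static void vduse_vq_rr_update_effective_cpu(struct vduse_virtqueue *vq)
{
	unsigned int cpu;

	spin_lock(&vq->irq_affinity_lock);
	/* Advance to the next CPU in the mask, wrapping past the end. */
	cpu = cpumask_next(vq->irq_effective_cpu, &vq->irq_affinity);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(&vq->irq_affinity);
	vq->irq_effective_cpu = cpu;
	spin_unlock(&vq->irq_affinity_lock);
}

This drops the per-CPU bookkeeping entirely, at the cost of ignoring
how loaded each candidate CPU already is.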

>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index d126f3e32a20..90c2896039d9 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -23,6 +23,7 @@
>  #include <linux/nospec.h>
>  #include <linux/vmalloc.h>
>  #include <linux/sched/mm.h>
> +#include <linux/interrupt.h>
>  #include <uapi/linux/vduse.h>
>  #include <uapi/linux/vdpa.h>
>  #include <uapi/linux/virtio_config.h>
> @@ -58,6 +59,8 @@ struct vduse_virtqueue {
>         struct work_struct inject;
>         struct work_struct kick;
>         int irq_effective_cpu;
> +       struct cpumask irq_affinity;
> +       spinlock_t irq_affinity_lock;

OK, I'd suggest squashing this into patch 5 to make it easier to
review.

>  };
>
>  struct vduse_dev;
> @@ -123,6 +126,7 @@ struct vduse_control {
>
>  static DEFINE_MUTEX(vduse_lock);
>  static DEFINE_IDR(vduse_idr);
> +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
>
>  static dev_t vduse_major;
>  static struct class *vduse_class;
> @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
>         return dev->generation;
>  }
>
> +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> +{
> +       unsigned int cpu, best_cpu;
> +       unsigned long allocated, allocated_min = UINT_MAX;
> +
> +       spin_lock(&vq->irq_affinity_lock);
> +
> +       best_cpu = vq->irq_effective_cpu;
> +       if (best_cpu != -1)
> +               per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> +
> +       for_each_cpu(cpu, &vq->irq_affinity) {
> +               allocated = per_cpu(vduse_allocated_irq, cpu);
> +               if (!cpu_online(cpu) || allocated >= allocated_min)
> +                       continue;
> +
> +               best_cpu = cpu;
> +               allocated_min = allocated;
> +       }
> +       vq->irq_effective_cpu = best_cpu;
> +       per_cpu(vduse_allocated_irq, best_cpu) += 1;
> +
> +       spin_unlock(&vq->irq_affinity_lock);
> +}
> +
> +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> +                                       struct irq_affinity *desc)
> +{
> +       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> +       struct irq_affinity_desc *affd = NULL;
> +       int i;
> +
> +       affd = irq_create_affinity_masks(dev->vq_num, desc);
> +       if (!affd)

Let's add a comment on the vdpa config ops saying that
set_irq_affinity() is best effort.
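
For instance, the kernel-doc on struct vdpa_config_ops could gain
wording along these lines (suggested text only, not from the series):

 * @set_irq_affinity:		Set the irq affinity of virtqueues (optional)
 *				@vdev: vdpa device
 *				@desc: irq affinity hint
 *				This is a hint rather than a guarantee;
 *				parent drivers treat it as best effort
 *				and may ignore it.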

Thanks

> +               return;
> +
> +       for (i = 0; i < dev->vq_num; i++) {
> +               cpumask_copy(&dev->vqs[i]->irq_affinity, &affd[i].mask);
> +               vduse_vq_update_effective_cpu(dev->vqs[i]);
> +       }
> +       kfree(affd);
> +}
> +
>  static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
>                                 unsigned int asid,
>                                 struct vhost_iotlb *iotlb)
> @@ -760,6 +807,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
>         .get_config             = vduse_vdpa_get_config,
>         .set_config             = vduse_vdpa_set_config,
>         .get_generation         = vduse_vdpa_get_generation,
> +       .set_irq_affinity       = vduse_vdpa_set_irq_affinity,
>         .reset                  = vduse_vdpa_reset,
>         .set_map                = vduse_vdpa_set_map,
>         .free                   = vduse_vdpa_free,
> @@ -1380,6 +1428,8 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
>                 INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
>                 spin_lock_init(&dev->vqs[i]->kick_lock);
>                 spin_lock_init(&dev->vqs[i]->irq_lock);
> +               spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
> +               cpumask_setall(&dev->vqs[i]->irq_affinity);
>         }
>
>         return 0;
> --
> 2.20.1
>
  
Yongji Xie Dec. 19, 2022, 4:56 a.m. UTC | #2
On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> >
> > This brings current interrupt affinity spreading mechanism
> > to vduse device. We will make use of irq_create_affinity_masks()
> > to create an irq callback affinity mask for each virtqueue of
> > vduse device. Then we will choose the CPU which has the lowest
> > number of interrupt allocated in the affinity mask to run the
> > irq callback.
>
> This seems a balance mechanism but it might not be the semantic of the
> affinity or any reason we need to do this? I guess we should use at
> least round-robin in this case.
>

Here we try to follow the PCI interrupt management mechanism. In
the VM case, the interrupt should always be delivered to one
specific CPU rather than to each CPU in turn.

> >
> > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > ---
> >  drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
> >  1 file changed, 50 insertions(+)
> >
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index d126f3e32a20..90c2896039d9 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -23,6 +23,7 @@
> >  #include <linux/nospec.h>
> >  #include <linux/vmalloc.h>
> >  #include <linux/sched/mm.h>
> > +#include <linux/interrupt.h>
> >  #include <uapi/linux/vduse.h>
> >  #include <uapi/linux/vdpa.h>
> >  #include <uapi/linux/virtio_config.h>
> > @@ -58,6 +59,8 @@ struct vduse_virtqueue {
> >         struct work_struct inject;
> >         struct work_struct kick;
> >         int irq_effective_cpu;
> > +       struct cpumask irq_affinity;
> > +       spinlock_t irq_affinity_lock;
>
> Ok, I'd suggest to squash this into patch 5 to make it more easier to
> be reviewed.
>

OK.

> >  };
> >
> >  struct vduse_dev;
> > @@ -123,6 +126,7 @@ struct vduse_control {
> >
> >  static DEFINE_MUTEX(vduse_lock);
> >  static DEFINE_IDR(vduse_idr);
> > +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
> >
> >  static dev_t vduse_major;
> >  static struct class *vduse_class;
> > @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> >         return dev->generation;
> >  }
> >
> > +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> > +{
> > +       unsigned int cpu, best_cpu;
> > +       unsigned long allocated, allocated_min = UINT_MAX;
> > +
> > +       spin_lock(&vq->irq_affinity_lock);
> > +
> > +       best_cpu = vq->irq_effective_cpu;
> > +       if (best_cpu != -1)
> > +               per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> > +
> > +       for_each_cpu(cpu, &vq->irq_affinity) {
> > +               allocated = per_cpu(vduse_allocated_irq, cpu);
> > +               if (!cpu_online(cpu) || allocated >= allocated_min)
> > +                       continue;
> > +
> > +               best_cpu = cpu;
> > +               allocated_min = allocated;
> > +       }
> > +       vq->irq_effective_cpu = best_cpu;
> > +       per_cpu(vduse_allocated_irq, best_cpu) += 1;
> > +
> > +       spin_unlock(&vq->irq_affinity_lock);
> > +}
> > +
> > +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> > +                                       struct irq_affinity *desc)
> > +{
> > +       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > +       struct irq_affinity_desc *affd = NULL;
> > +       int i;
> > +
> > +       affd = irq_create_affinity_masks(dev->vq_num, desc);
> > +       if (!affd)
>
> Let's add a comment on the vdpa config ops to say set_irq_affinity()
> is best effort.
>

OK.

Thanks,
Yongji
  
Jason Wang Dec. 20, 2022, 6:32 a.m. UTC | #3
On Mon, Dec 19, 2022 at 12:56 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > >
> > > This brings current interrupt affinity spreading mechanism
> > > to vduse device. We will make use of irq_create_affinity_masks()
> > > to create an irq callback affinity mask for each virtqueue of
> > > vduse device. Then we will choose the CPU which has the lowest
> > > number of interrupt allocated in the affinity mask to run the
> > > irq callback.
> >
> > This seems a balance mechanism but it might not be the semantic of the
> > affinity or any reason we need to do this? I guess we should use at
> > least round-robin in this case.
> >
>
> Here we try to follow the pci interrupt management mechanism. In VM
> cases, the interrupt should always be triggered to one specific CPU
> rather than to each CPU in turn.

If I'm not wrong, when using MSI, most architectures allow not
only a CPU ID as the destination but also policies like
round-robin and lowest-priority-first.
Thanks

>
> > >
> > > Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> > > ---
> > >  drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
> > >  1 file changed, 50 insertions(+)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index d126f3e32a20..90c2896039d9 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -23,6 +23,7 @@
> > >  #include <linux/nospec.h>
> > >  #include <linux/vmalloc.h>
> > >  #include <linux/sched/mm.h>
> > > +#include <linux/interrupt.h>
> > >  #include <uapi/linux/vduse.h>
> > >  #include <uapi/linux/vdpa.h>
> > >  #include <uapi/linux/virtio_config.h>
> > > @@ -58,6 +59,8 @@ struct vduse_virtqueue {
> > >         struct work_struct inject;
> > >         struct work_struct kick;
> > >         int irq_effective_cpu;
> > > +       struct cpumask irq_affinity;
> > > +       spinlock_t irq_affinity_lock;
> >
> > Ok, I'd suggest to squash this into patch 5 to make it more easier to
> > be reviewed.
> >
>
> OK.
>
> > >  };
> > >
> > >  struct vduse_dev;
> > > @@ -123,6 +126,7 @@ struct vduse_control {
> > >
> > >  static DEFINE_MUTEX(vduse_lock);
> > >  static DEFINE_IDR(vduse_idr);
> > > +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
> > >
> > >  static dev_t vduse_major;
> > >  static struct class *vduse_class;
> > > @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> > >         return dev->generation;
> > >  }
> > >
> > > +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> > > +{
> > > +       unsigned int cpu, best_cpu;
> > > +       unsigned long allocated, allocated_min = UINT_MAX;
> > > +
> > > +       spin_lock(&vq->irq_affinity_lock);
> > > +
> > > +       best_cpu = vq->irq_effective_cpu;
> > > +       if (best_cpu != -1)
> > > +               per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> > > +
> > > +       for_each_cpu(cpu, &vq->irq_affinity) {
> > > +               allocated = per_cpu(vduse_allocated_irq, cpu);
> > > +               if (!cpu_online(cpu) || allocated >= allocated_min)
> > > +                       continue;
> > > +
> > > +               best_cpu = cpu;
> > > +               allocated_min = allocated;
> > > +       }
> > > +       vq->irq_effective_cpu = best_cpu;
> > > +       per_cpu(vduse_allocated_irq, best_cpu) += 1;
> > > +
> > > +       spin_unlock(&vq->irq_affinity_lock);
> > > +}
> > > +
> > > +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> > > +                                       struct irq_affinity *desc)
> > > +{
> > > +       struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > +       struct irq_affinity_desc *affd = NULL;
> > > +       int i;
> > > +
> > > +       affd = irq_create_affinity_masks(dev->vq_num, desc);
> > > +       if (!affd)
> >
> > Let's add a comment on the vdpa config ops to say set_irq_affinity()
> > is best effort.
> >
>
> OK.
>
> Thanks,
> Yongji
>
  
Yongji Xie Dec. 20, 2022, 8:21 a.m. UTC | #4
On Tue, Dec 20, 2022 at 2:32 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Mon, Dec 19, 2022 at 12:56 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > >
> > > > This brings current interrupt affinity spreading mechanism
> > > > to vduse device. We will make use of irq_create_affinity_masks()
> > > > to create an irq callback affinity mask for each virtqueue of
> > > > vduse device. Then we will choose the CPU which has the lowest
> > > > number of interrupt allocated in the affinity mask to run the
> > > > irq callback.
> > >
> > > This seems a balance mechanism but it might not be the semantic of the
> > > affinity or any reason we need to do this? I guess we should use at
> > > least round-robin in this case.
> > >
> >
> > Here we try to follow the pci interrupt management mechanism. In VM
> > cases, the interrupt should always be triggered to one specific CPU
> > rather than to each CPU in turn.
>
> If I was not wrong, when using MSI, most arch allows not only the
> cpuid as the destination but policy like rr and low priority first.
>

I see. I think we can remove the irq effective affinity and just
use the irq affinity. If the irq affinity mask contains more than
one CPU, we can use round-robin to spread IRQs between those CPUs.
And the sysfs interface for the irq affinity should be writable,
so that someone who wants to stop the round-robin can pick one CPU
to run the irq callback.
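
A rough sketch of that direction, applied at irq injection time
(hypothetical: irq_last_cpu and vduse_irq_wq are placeholder names,
and locking is omitted for brevity):

static void vduse_vq_irq_inject_rr(struct vduse_virtqueue *vq)
{
	unsigned int cpu;

	/* Spread successive callbacks across the affinity mask. */
	cpu = cpumask_next(vq->irq_last_cpu, &vq->irq_affinity);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(&vq->irq_affinity);
	vq->irq_last_cpu = cpu;

	/* Run the irq callback work on the chosen CPU. */
	queue_work_on(cpu, vduse_irq_wq, &vq->inject);
}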

Thanks,
Yongji
  

Patch

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index d126f3e32a20..90c2896039d9 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -23,6 +23,7 @@ 
 #include <linux/nospec.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
+#include <linux/interrupt.h>
 #include <uapi/linux/vduse.h>
 #include <uapi/linux/vdpa.h>
 #include <uapi/linux/virtio_config.h>
@@ -58,6 +59,8 @@  struct vduse_virtqueue {
 	struct work_struct inject;
 	struct work_struct kick;
 	int irq_effective_cpu;
+	struct cpumask irq_affinity;
+	spinlock_t irq_affinity_lock;
 };
 
 struct vduse_dev;
@@ -123,6 +126,7 @@  struct vduse_control {
 
 static DEFINE_MUTEX(vduse_lock);
 static DEFINE_IDR(vduse_idr);
+static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
 
 static dev_t vduse_major;
 static struct class *vduse_class;
@@ -710,6 +714,49 @@  static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
 	return dev->generation;
 }
 
+static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
+{
+	unsigned int cpu, best_cpu;
+	unsigned long allocated, allocated_min = UINT_MAX;
+
+	spin_lock(&vq->irq_affinity_lock);
+
+	best_cpu = vq->irq_effective_cpu;
+	if (best_cpu != -1)
+		per_cpu(vduse_allocated_irq, best_cpu) -= 1;
+
+	for_each_cpu(cpu, &vq->irq_affinity) {
+		allocated = per_cpu(vduse_allocated_irq, cpu);
+		if (!cpu_online(cpu) || allocated >= allocated_min)
+			continue;
+
+		best_cpu = cpu;
+		allocated_min = allocated;
+	}
+	vq->irq_effective_cpu = best_cpu;
+	per_cpu(vduse_allocated_irq, best_cpu) += 1;
+
+	spin_unlock(&vq->irq_affinity_lock);
+}
+
+static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
+					struct irq_affinity *desc)
+{
+	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+	struct irq_affinity_desc *affd = NULL;
+	int i;
+
+	affd = irq_create_affinity_masks(dev->vq_num, desc);
+	if (!affd)
+		return;
+
+	for (i = 0; i < dev->vq_num; i++) {
+		cpumask_copy(&dev->vqs[i]->irq_affinity, &affd[i].mask);
+		vduse_vq_update_effective_cpu(dev->vqs[i]);
+	}
+	kfree(affd);
+}
+
 static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
 				unsigned int asid,
 				struct vhost_iotlb *iotlb)
@@ -760,6 +807,7 @@  static const struct vdpa_config_ops vduse_vdpa_config_ops = {
 	.get_config		= vduse_vdpa_get_config,
 	.set_config		= vduse_vdpa_set_config,
 	.get_generation		= vduse_vdpa_get_generation,
+	.set_irq_affinity	= vduse_vdpa_set_irq_affinity,
 	.reset			= vduse_vdpa_reset,
 	.set_map		= vduse_vdpa_set_map,
 	.free			= vduse_vdpa_free,
@@ -1380,6 +1428,8 @@  static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
 		INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
 		spin_lock_init(&dev->vqs[i]->kick_lock);
 		spin_lock_init(&dev->vqs[i]->irq_lock);
+		spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
+		cpumask_setall(&dev->vqs[i]->irq_affinity);
 	}
 
 	return 0;