[V5,net-next] net: mana: Assigning IRQ affinity on HT cores

Message ID 1702029754-6520-1-git-send-email-schakrabarti@linux.microsoft.com
State New
Series [V5,net-next] net: mana: Assigning IRQ affinity on HT cores

Commit Message

Souradeep Chakrabarti Dec. 8, 2023, 10:02 a.m. UTC
  Existing MANA design assigns an IRQ to every CPU, including sibling
hyper-threads. This may cause multiple IRQs to be active simultaneously
on the same core and may reduce network performance with RSS.

Improve the performance by assigning IRQs to non-sibling CPUs in the
local NUMA node. Measured with ntttcp, the improvement from this patch
is around 15 percent over the existing design, and approximately
11 percent over assigning one IRQ to each core across NUMA nodes, when
enough cores are present.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
---
V4 -> V5:
* Fixed the use of for_each_numa_hop_mask() in irq_setup(), by using
visited_cpus to mask out the previous node's mask.
* Changed the way IRQ0 is getting assigned: assign it a separate CPU
if possible, rather than an already assigned CPU.
* Added Suggested-by tag.

V3 -> V4:
* Used for_each_numa_hop_mask() macro and simplified the code.
Thanks to Yury Norov for the suggestion.
* Added code to assign hwc irq separately in mana_gd_setup_irqs.

V2 -> V3:
* Created a helper function to get the next NUMA with CPU.
* Added some error checks for unsuccessful memory allocation.
* Fixed some comments on the code.

V1 -> V2:
* Simplified the code by removing filter_mask_list and using avail_cpus.
* Addressed infinite loop issue when there are numa nodes with no CPUs.
* Used the local NUMA node instead of node 0 as the starting point.
* Removed uses of BUG_ON.
* Placed cpus_read_lock() in the parent function to avoid num_online_cpus()
  changing before the function finishes the affinity assignment.
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
 1 file changed, 83 insertions(+), 9 deletions(-)
  

Comments

Yury Norov Dec. 8, 2023, 2:03 p.m. UTC | #1
On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> Existing MANA design assigns IRQ to every CPU, including sibling
> hyper-threads. This may cause multiple IRQs to be active simultaneously
> in the same core and may reduce the network performance with RSS.

Can you add an IRQ distribution diagram to compare before/after
behavior, similarly to what I did in the other email?

> Improve the performance by assigning IRQ to non sibling CPUs in local
> NUMA node. The performance improvement we are getting using ntttcp with
> following patch is around 15 percent with existing design and approximately
> 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> if enough cores are present.

How did you measure it? In the other email you said you used perf, can
you show your procedure in details?

> Suggested-by: Yury Norov <yury.norov@gmali.com>
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> ---

[...]

>  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
>  1 file changed, 83 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 6367de0c2c2e..18e8908c5d29 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
>  	r->size = 0;
>  }
>  
> +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> +{
> +	int w, cnt, cpu, err = 0, i = 0;
> +	int next_node = start_numa_node;

What for this?

> +	const struct cpumask *next, *prev = cpu_none_mask;
> +	cpumask_var_t curr, cpus;
> +
> +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> +		err = -ENOMEM;
> +		return err;
> +	}
> +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {

                free(curr);

> +		err = -ENOMEM;
> +		return err;
> +	}
> +
> +	rcu_read_lock();
> +	for_each_numa_hop_mask(next, next_node) {
> +		cpumask_andnot(curr, next, prev);
> +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> +			cpumask_copy(cpus, curr);
> +			for_each_cpu(cpu, cpus) {
> +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> +				if (++i == nvec)
> +					goto done;

Think what if you're passed with irq_setup(NULL, 0, 0).
That's why I suggested to place this check at the beginning.


> +				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
> +				++cnt;
> +			}
> +		}
> +		prev = next;
> +	}
> +done:
> +	rcu_read_unlock();
> +	free_cpumask_var(curr);
> +	free_cpumask_var(cpus);
> +	return err;
> +}
> +
>  static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  {
> -	unsigned int max_queues_per_port = num_online_cpus();
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
> +	unsigned int max_queues_per_port;
>  	struct gdma_irq_context *gic;
>  	unsigned int max_irqs, cpu;
> -	int nvec, irq;
> +	int start_irq_index = 1;
> +	int nvec, *irqs, irq;
>  	int err, i = 0, j;
>  
> +	cpus_read_lock();
> +	max_queues_per_port = num_online_cpus();
>  	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
>  		max_queues_per_port = MANA_MAX_NUM_QUEUES;
>  
> @@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
>  	if (nvec < 0)
>  		return nvec;
> +	if (nvec <= num_online_cpus())
> +		start_irq_index = 0;
> +
> +	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
> +	if (!irqs) {
> +		err = -ENOMEM;
> +		goto free_irq_vector;
> +	}
>  
>  	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
>  				   GFP_KERNEL);
> @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  			goto free_irq;
>  		}
>  
> -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> -		if (err)
> -			goto free_irq;
> -
> -		cpu = cpumask_local_spread(i, gc->numa_node);
> -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> +		if (!i) {
> +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> +			if (err)
> +				goto free_irq;
> +
> +			/* If number of IRQ is one extra than number of online CPUs,
> +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> +			 * same CPU.
> +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> +			 * Also we are using cpumask_local_spread instead of
> +			 * cpumask_first for the node, because the node can be
> +			 * mem only.
> +			 */
> +			if (start_irq_index) {
> +				cpu = cpumask_local_spread(i, gc->numa_node);

I already mentioned that: if i == 0, you don't need to spread, just
pick 1st cpu from node.
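
Something like the sketch below, assuming the node actually has CPUs (the
follow-up replies explain why cpumask_local_spread() is kept for the
memory-only node case):

		cpu = cpumask_first(cpumask_of_node(gc->numa_node));
		irq_set_affinity_and_hint(irq, cpumask_of(cpu));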

> +				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> +			} else {
> +				irqs[start_irq_index] = irq;
> +			}
> +		} else {
> +			irqs[i - start_irq_index] = irq;
> +			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
> +					  gic->name, gic);
> +			if (err)
> +				goto free_irq;
> +		}
>  	}
>  
> +	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
> +	if (err)
> +		goto free_irq;
>  	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
>  	if (err)
>  		goto free_irq;
>  
>  	gc->max_num_msix = nvec;
>  	gc->num_msix_usable = nvec;
> -
> +	cpus_read_unlock();
>  	return 0;
>  
>  free_irq:
> @@ -1314,8 +1386,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  	}
>  
>  	kfree(gc->irq_contexts);
> +	kfree(irqs);
>  	gc->irq_contexts = NULL;
>  free_irq_vector:
> +	cpus_read_unlock();
>  	pci_free_irq_vectors(pdev);
>  	return err;
>  }
> -- 
> 2.34.1
  
Yury Norov Dec. 8, 2023, 9:53 p.m. UTC | #2
Few more nits

On Fri, Dec 08, 2023 at 06:03:40AM -0800, Yury Norov wrote:
> On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > Existing MANA design assigns IRQ to every CPU, including sibling
> > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > in the same core and may reduce the network performance with RSS.
> 
> Can you add an IRQ distribution diagram to compare before/after
> behavior, similarly to what I did in the other email?
> 
> > Improve the performance by assigning IRQ to non sibling CPUs in local
> > NUMA node. The performance improvement we are getting using ntttcp with
> > following patch is around 15 percent with existing design and approximately
> > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > if enough cores are present.
> 
> How did you measure it? In the other email you said you used perf, can
> you show your procedure in details?
> 
> > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > ---
> 
> [...]
> 
> >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> >  1 file changed, 83 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 6367de0c2c2e..18e8908c5d29 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> >  	r->size = 0;
> >  }
> >  
> > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> > +{
> > +	int w, cnt, cpu, err = 0, i = 0;
> > +	int next_node = start_numa_node;
> 
> What for this?
> 
> > +	const struct cpumask *next, *prev = cpu_none_mask;
> > +	cpumask_var_t curr, cpus;
> > +
> > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {

alloc_cpumask_var() here and below, because you initialize them by
copying

> > +		err = -ENOMEM;
> > +		return err;
> > +	}
> > +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
> 
>                 free(curr);
> 
> > +		err = -ENOMEM;
> > +		return err;
> > +	}
> > +
> > +	rcu_read_lock();
> > +	for_each_numa_hop_mask(next, next_node) {
> > +		cpumask_andnot(curr, next, prev);
> > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {

OK, if you can't increment inside for-loop, I'd switch it to a
while-loop:
                w = cpumask_weight(curr);
                cnt = 0;

		while (cnt < w) {

> > +			cpumask_copy(cpus, curr);
> > +			for_each_cpu(cpu, cpus) {
> > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > +				if (++i == nvec)
> > +					goto done;
> 
> Think what if you're passed with irq_setup(NULL, 0, 0).
> That's why I suggested to place this check at the beginning.
> 
> 
> > +				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
> > +				++cnt;
> > +			}
> > +		}
> > +		prev = next;
> > +	}

Don't hesitate to add even more vertical spacing. It's like: "take a
breath folks, this section is done". :)

> > +done:
> > +	rcu_read_unlock();
> > +	free_cpumask_var(curr);
> > +	free_cpumask_var(cpus);
> > +	return err;
> > +}
> > +
> >  static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  {
> > -	unsigned int max_queues_per_port = num_online_cpus();
> >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > +	unsigned int max_queues_per_port;
> >  	struct gdma_irq_context *gic;
> >  	unsigned int max_irqs, cpu;
> > -	int nvec, irq;
> > +	int start_irq_index = 1;
> > +	int nvec, *irqs, irq;
> >  	int err, i = 0, j;
> >  
> > +	cpus_read_lock();
> > +	max_queues_per_port = num_online_cpus();
> >  	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> >  		max_queues_per_port = MANA_MAX_NUM_QUEUES;
> >  
> > @@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> >  	if (nvec < 0)
> >  		return nvec;
> > +	if (nvec <= num_online_cpus())
> > +		start_irq_index = 0;
> > +
> > +	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
> > +	if (!irqs) {
> > +		err = -ENOMEM;
> > +		goto free_irq_vector;
> > +	}
> >  
> >  	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> >  				   GFP_KERNEL);
> > @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  			goto free_irq;
> >  		}
> >  
> > -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > -		if (err)
> > -			goto free_irq;
> > -
> > -		cpu = cpumask_local_spread(i, gc->numa_node);
> > -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > +		if (!i) {
> > +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > +			if (err)
> > +				goto free_irq;
> > +
> > +			/* If number of IRQ is one extra than number of online CPUs,
> > +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> > +			 * same CPU.
> > +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> > +			 * Also we are using cpumask_local_spread instead of
> > +			 * cpumask_first for the node, because the node can be
> > +			 * mem only.
> > +			 */
> > +			if (start_irq_index) {
> > +				cpu = cpumask_local_spread(i, gc->numa_node);
> 
> I already mentioned that: if i == 0, you don't need to spread, just
> pick 1st cpu from node.
> 
> > +				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > +			} else {
> > +				irqs[start_irq_index] = irq;
> > +			}
> > +		} else {
> > +			irqs[i - start_irq_index] = irq;
> > +			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
> > +					  gic->name, gic);
> > +			if (err)
> > +				goto free_irq;
> > +		}
> >  	}
> >  
> > +	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
> > +	if (err)
> > +		goto free_irq;
> >  	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> >  	if (err)
> >  		goto free_irq;
> >  
> >  	gc->max_num_msix = nvec;
> >  	gc->num_msix_usable = nvec;
> > -
> > +	cpus_read_unlock();
> >  	return 0;
> >  
> >  free_irq:
> > @@ -1314,8 +1386,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  	}
> >  
> >  	kfree(gc->irq_contexts);
> > +	kfree(irqs);
> >  	gc->irq_contexts = NULL;
> >  free_irq_vector:
> > +	cpus_read_unlock();
> >  	pci_free_irq_vectors(pdev);
> >  	return err;
> >  }
> > -- 
> > 2.34.1
  
Souradeep Chakrabarti Dec. 11, 2023, 6:37 a.m. UTC | #3
On Fri, Dec 08, 2023 at 06:03:39AM -0800, Yury Norov wrote:
> On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > Existing MANA design assigns IRQ to every CPU, including sibling
> > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > in the same core and may reduce the network performance with RSS.
> 
> Can you add an IRQ distribution diagram to compare before/after
> behavior, similarly to what I did in the other email?
> 
Let's consider this topology:

Node            0               1
Core        0       1       2       3
CPU       0   1   2   3   4   5   6   7

 Before  
 IRQ     Nodes   Cores   CPUs
 0       1       0       0
 1       1       1       2
 2       1       0       1
 3       1       1       3
 4       2       2       4
 5       2       3       6
 6       2       2       5
 7       2       3       7
 
 Now
 IRQ     Nodes   Cores   CPUs
 0       1       0       0-1
 1       1       1       2-3
 2       1       0       0-1
 3       1       1       2-3
 4       2       2       4-5
 5       2       3       6-7
 6       2       2       4-5
 7       2       3       6-7
> > Improve the performance by assigning IRQ to non sibling CPUs in local
> > NUMA node. The performance improvement we are getting using ntttcp with
> > following patch is around 15 percent with existing design and approximately
> > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > if enough cores are present.
> 
> How did you measure it? In the other email you said you used perf, can
> you show your procedure in details?
I have used ntttcp for the performance analysis; by "perf" I meant
performance analysis. I used ntttcp with the following parameters:
ntttcp -r -m 64 <receiver> 

ntttcp -s <receiver side ip address>  -m 64 <sender>
Both VMs are in the same Azure subnet and the private IP address is used.
MTU and TCP buffer are set accordingly, and the number of channels is set
using ethtool for best performance. Also, irqbalance was disabled.
https://github.com/microsoft/ntttcp-for-linux
https://learn.microsoft.com/en-us/azure/virtual-network/virtual-network-bandwidth-testing?tabs=linux

> 
> > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > ---
> 
> [...]
> 
> >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> >  1 file changed, 83 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 6367de0c2c2e..18e8908c5d29 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> >  	r->size = 0;
> >  }
> >  
> > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> > +{
> > +	int w, cnt, cpu, err = 0, i = 0;
> > +	int next_node = start_numa_node;
> 
> What for this?
This is the local NUMA node from which to start hopping.
Please see how irq_setup() is called: we pass the array of allocated IRQs, the
total number of IRQs allocated, and the NUMA node local to the device.
> 
> > +	const struct cpumask *next, *prev = cpu_none_mask;
> > +	cpumask_var_t curr, cpus;
> > +
> > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> > +		err = -ENOMEM;
> > +		return err;
> > +	}
> > +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
> 
>                 free(curr);
Will fix it in the next version. Thanks for pointing it out.
> 
> > +		err = -ENOMEM;
> > +		return err;
> > +	}
> > +
> > +	rcu_read_lock();
> > +	for_each_numa_hop_mask(next, next_node) {
> > +		cpumask_andnot(curr, next, prev);
> > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> > +			cpumask_copy(cpus, curr);
> > +			for_each_cpu(cpu, cpus) {
> > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > +				if (++i == nvec)
> > +					goto done;
> 
> Think what if you're passed with irq_setup(NULL, 0, 0).
> That's why I suggested to place this check at the beginning.
> 
irq_setup() is a helper function for mana_gd_setup_irqs(), which already ensures
that irqs is not a NULL pointer and that zero interrupts cannot be passed:

nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
if (nvec < 0)
	return nvec;
> 
> > +				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
> > +				++cnt;
> > +			}
> > +		}
> > +		prev = next;
> > +	}
> > +done:
> > +	rcu_read_unlock();
> > +	free_cpumask_var(curr);
> > +	free_cpumask_var(cpus);
> > +	return err;
> > +}
> > +
> >  static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  {
> > -	unsigned int max_queues_per_port = num_online_cpus();
> >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > +	unsigned int max_queues_per_port;
> >  	struct gdma_irq_context *gic;
> >  	unsigned int max_irqs, cpu;
> > -	int nvec, irq;
> > +	int start_irq_index = 1;
> > +	int nvec, *irqs, irq;
> >  	int err, i = 0, j;
> >  
> > +	cpus_read_lock();
> > +	max_queues_per_port = num_online_cpus();
> >  	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> >  		max_queues_per_port = MANA_MAX_NUM_QUEUES;
> >  
> > @@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> >  	if (nvec < 0)
> >  		return nvec;
> > +	if (nvec <= num_online_cpus())
> > +		start_irq_index = 0;
> > +
> > +	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
> > +	if (!irqs) {
> > +		err = -ENOMEM;
> > +		goto free_irq_vector;
> > +	}
> >  
> >  	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> >  				   GFP_KERNEL);
> > @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  			goto free_irq;
> >  		}
> >  
> > -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > -		if (err)
> > -			goto free_irq;
> > -
> > -		cpu = cpumask_local_spread(i, gc->numa_node);
> > -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > +		if (!i) {
> > +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > +			if (err)
> > +				goto free_irq;
> > +
> > +			/* If number of IRQ is one extra than number of online CPUs,
> > +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> > +			 * same CPU.
> > +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> > +			 * Also we are using cpumask_local_spread instead of
> > +			 * cpumask_first for the node, because the node can be
> > +			 * mem only.
> > +			 */
> > +			if (start_irq_index) {
> > +				cpu = cpumask_local_spread(i, gc->numa_node);
> 
> I already mentioned that: if i == 0, you don't need to spread, just
> pick 1st cpu from node.
The reason I have picked cpumask_local_spread() here is that gc->numa_node
can be a memory-only node; in that case we need to jump to the next node to get
a CPU, which cpumask_local_spread(), via sched_numa_find_nth_cpu(), takes care of.
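
In other words, for the hwc IRQ the intent is roughly the following (a sketch
of the logic already in the patch, not new code):

		/* Pick a CPU near the device's node; if gc->numa_node is
		 * memory-only, this falls back to the nearest node that
		 * has CPUs.
		 */
		cpu = cpumask_local_spread(0, gc->numa_node);
		irq_set_affinity_and_hint(irq, cpumask_of(cpu));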
> 
> > +				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > +			} else {
> > +				irqs[start_irq_index] = irq;
> > +			}
> > +		} else {
> > +			irqs[i - start_irq_index] = irq;
> > +			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
> > +					  gic->name, gic);
> > +			if (err)
> > +				goto free_irq;
> > +		}
> >  	}
> >  
> > +	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
> > +	if (err)
> > +		goto free_irq;
> >  	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> >  	if (err)
> >  		goto free_irq;
> >  
> >  	gc->max_num_msix = nvec;
> >  	gc->num_msix_usable = nvec;
> > -
> > +	cpus_read_unlock();
> >  	return 0;
> >  
> >  free_irq:
> > @@ -1314,8 +1386,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> >  	}
> >  
> >  	kfree(gc->irq_contexts);
> > +	kfree(irqs);
> >  	gc->irq_contexts = NULL;
> >  free_irq_vector:
> > +	cpus_read_unlock();
> >  	pci_free_irq_vectors(pdev);
> >  	return err;
> >  }
> > -- 
> > 2.34.1
  
Souradeep Chakrabarti Dec. 11, 2023, 6:53 a.m. UTC | #4
On Fri, Dec 08, 2023 at 01:53:51PM -0800, Yury Norov wrote:
> Few more nits
> 
> On Fri, Dec 08, 2023 at 06:03:40AM -0800, Yury Norov wrote:
> > On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > > Existing MANA design assigns IRQ to every CPU, including sibling
> > > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > > in the same core and may reduce the network performance with RSS.
> > 
> > Can you add an IRQ distribution diagram to compare before/after
> > behavior, similarly to what I did in the other email?
> > 
> > > Improve the performance by assigning IRQ to non sibling CPUs in local
> > > NUMA node. The performance improvement we are getting using ntttcp with
> > > following patch is around 15 percent with existing design and approximately
> > > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > > if enough cores are present.
> > 
> > How did you measure it? In the other email you said you used perf, can
> > you show your procedure in details?
> > 
> > > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > > ---
> > 
> > [...]
> > 
> > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> > >  1 file changed, 83 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > index 6367de0c2c2e..18e8908c5d29 100644
> > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> > >  	r->size = 0;
> > >  }
> > >  
> > > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> > > +{
> > > +	int w, cnt, cpu, err = 0, i = 0;
> > > +	int next_node = start_numa_node;
> > 
> > What for this?
> > 
> > > +	const struct cpumask *next, *prev = cpu_none_mask;
> > > +	cpumask_var_t curr, cpus;
> > > +
> > > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> 
> alloc_cpumask_var() here and below, because you initialize them by
> copying
I have used zalloc here because prev gets initialized only after the first hop;
before that it may contain unwanted values, which may impact
cpumask_andnot(curr, next, prev).
Regarding curr, I will change it to alloc_cpumask_var().
Please let me know if that sounds right.
> 
> > > +		err = -ENOMEM;
> > > +		return err;
> > > +	}
> > > +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
> > 
> >                 free(curr);
> > 
> > > +		err = -ENOMEM;
> > > +		return err;
> > > +	}
> > > +
> > > +	rcu_read_lock();
> > > +	for_each_numa_hop_mask(next, next_node) {
> > > +		cpumask_andnot(curr, next, prev);
> > > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> 
> OK, if you can't increment inside for-loop, I'd switch it to a
> while-loop:
>                 w = cpumask_weight(curr);
>                 cnt = 0;
> 
Thanks, will change it to a while-loop.
> 		while (cnt < w) {
> 
> > > +			cpumask_copy(cpus, curr);
> > > +			for_each_cpu(cpu, cpus) {
> > > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > > +				if (++i == nvec)
> > > +					goto done;
> > 
> > Think what if you're passed with irq_setup(NULL, 0, 0).
> > That's why I suggested to place this check at the beginning.
> > 
> > 
> > > +				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
> > > +				++cnt;
> > > +			}
> > > +		}
> > > +		prev = next;
> > > +	}
> 
> Don't hesitate to add even more vertical spacing. It's like: "take a
> breath folks, this section is done". :)
> 
Sure will add in next version.
> > > +done:
> > > +	rcu_read_unlock();
> > > +	free_cpumask_var(curr);
> > > +	free_cpumask_var(cpus);
> > > +	return err;
> > > +}
> > > +
> > >  static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > >  {
> > > -	unsigned int max_queues_per_port = num_online_cpus();
> > >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > > +	unsigned int max_queues_per_port;
> > >  	struct gdma_irq_context *gic;
> > >  	unsigned int max_irqs, cpu;
> > > -	int nvec, irq;
> > > +	int start_irq_index = 1;
> > > +	int nvec, *irqs, irq;
> > >  	int err, i = 0, j;
> > >  
> > > +	cpus_read_lock();
> > > +	max_queues_per_port = num_online_cpus();
> > >  	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> > >  		max_queues_per_port = MANA_MAX_NUM_QUEUES;
> > >  
> > > @@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > >  	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> > >  	if (nvec < 0)
> > >  		return nvec;
> > > +	if (nvec <= num_online_cpus())
> > > +		start_irq_index = 0;
> > > +
> > > +	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
> > > +	if (!irqs) {
> > > +		err = -ENOMEM;
> > > +		goto free_irq_vector;
> > > +	}
> > >  
> > >  	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> > >  				   GFP_KERNEL);
> > > @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > >  			goto free_irq;
> > >  		}
> > >  
> > > -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > -		if (err)
> > > -			goto free_irq;
> > > -
> > > -		cpu = cpumask_local_spread(i, gc->numa_node);
> > > -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > > +		if (!i) {
> > > +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > +			if (err)
> > > +				goto free_irq;
> > > +
> > > +			/* If number of IRQ is one extra than number of online CPUs,
> > > +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> > > +			 * same CPU.
> > > +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> > > +			 * Also we are using cpumask_local_spread instead of
> > > +			 * cpumask_first for the node, because the node can be
> > > +			 * mem only.
> > > +			 */
> > > +			if (start_irq_index) {
> > > +				cpu = cpumask_local_spread(i, gc->numa_node);
> > 
> > I already mentioned that: if i == 0, you don't need to spread, just
> > pick 1st cpu from node.
> > 
> > > +				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > > +			} else {
> > > +				irqs[start_irq_index] = irq;
> > > +			}
> > > +		} else {
> > > +			irqs[i - start_irq_index] = irq;
> > > +			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
> > > +					  gic->name, gic);
> > > +			if (err)
> > > +				goto free_irq;
> > > +		}
> > >  	}
> > >  
> > > +	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
> > > +	if (err)
> > > +		goto free_irq;
> > >  	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> > >  	if (err)
> > >  		goto free_irq;
> > >  
> > >  	gc->max_num_msix = nvec;
> > >  	gc->num_msix_usable = nvec;
> > > -
> > > +	cpus_read_unlock();
> > >  	return 0;
> > >  
> > >  free_irq:
> > > @@ -1314,8 +1386,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > >  	}
> > >  
> > >  	kfree(gc->irq_contexts);
> > > +	kfree(irqs);
> > >  	gc->irq_contexts = NULL;
> > >  free_irq_vector:
> > > +	cpus_read_unlock();
> > >  	pci_free_irq_vectors(pdev);
> > >  	return err;
> > >  }
> > > -- 
> > > 2.34.1
  
Yury Norov Dec. 11, 2023, 2 p.m. UTC | #5
On Sun, Dec 10, 2023 at 10:53:23PM -0800, Souradeep Chakrabarti wrote:
> On Fri, Dec 08, 2023 at 01:53:51PM -0800, Yury Norov wrote:
> > Few more nits
> > 
> > On Fri, Dec 08, 2023 at 06:03:40AM -0800, Yury Norov wrote:
> > > On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > > > Existing MANA design assigns IRQ to every CPU, including sibling
> > > > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > > > in the same core and may reduce the network performance with RSS.
> > > 
> > > Can you add an IRQ distribution diagram to compare before/after
> > > behavior, similarly to what I did in the other email?
> > > 
> > > > Improve the performance by assigning IRQ to non sibling CPUs in local
> > > > NUMA node. The performance improvement we are getting using ntttcp with
> > > > following patch is around 15 percent with existing design and approximately
> > > > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > > > if enough cores are present.
> > > 
> > > How did you measure it? In the other email you said you used perf, can
> > > you show your procedure in details?
> > > 
> > > > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > > > ---
> > > 
> > > [...]
> > > 
> > > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> > > >  1 file changed, 83 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > index 6367de0c2c2e..18e8908c5d29 100644
> > > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> > > >  	r->size = 0;
> > > >  }
> > > >  
> > > > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> > > > +{
> > > > +	int w, cnt, cpu, err = 0, i = 0;
> > > > +	int next_node = start_numa_node;
> > > 
> > > What for this?
> > > 
> > > > +	const struct cpumask *next, *prev = cpu_none_mask;
> > > > +	cpumask_var_t curr, cpus;
> > > > +
> > > > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> > 
> > alloc_cpumask_var() here and below, because you initialize them by
> > copying
> I have used zalloc here as prev gets initialized after the first hop, before that
> it may contain unwanted values, which may impact cpumask_andnot(curr, next, prev).
> Regarding curr I will change it to alloc_cpumask_var().
> Please let me know if that sounds right.

What? prev is initialized at declaration:
        
        const struct cpumask *next, *prev = cpu_none_mask;
  
Yury Norov Dec. 11, 2023, 3:30 p.m. UTC | #6
On Sun, Dec 10, 2023 at 10:37:26PM -0800, Souradeep Chakrabarti wrote:
> On Fri, Dec 08, 2023 at 06:03:39AM -0800, Yury Norov wrote:
> > On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > > Existing MANA design assigns IRQ to every CPU, including sibling
> > > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > > in the same core and may reduce the network performance with RSS.
> > 
> > Can you add an IRQ distribution diagram to compare before/after
> > behavior, similarly to what I did in the other email?
> > 
> Let's consider this topology:

Not here - in commit message, please.

> 
> Node            0               1
> Core        0       1       2       3
> CPU       0   1   2   3   4   5   6   7
> 
>  Before  
>  IRQ     Nodes   Cores   CPUs
>  0       1       0       0
>  1       1       1       2
>  2       1       0       1
>  3       1       1       3
>  4       2       2       4
>  5       2       3       6
>  6       2       2       5
>  7       2       3       7
>  
>  Now
>  IRQ     Nodes   Cores   CPUs
>  0       1       0       0-1
>  1       1       1       2-3
>  2       1       0       0-1
>  3       1       1       2-3
>  4       2       2       4-5
>  5       2       3       6-7
>  6       2       2       4-5
>  7       2       3       6-7

If you decided to take my wording, please give credits.

> > > Improve the performance by assigning IRQ to non sibling CPUs in local
> > > NUMA node. The performance improvement we are getting using ntttcp with
> > > following patch is around 15 percent with existing design and approximately
> > > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > > if enough cores are present.
> > 
> > How did you measure it? In the other email you said you used perf, can
> > you show your procedure in details?
> I have used ntttcp for performance analysis, by perf I had meant performance
> analysis. I have used ntttcp with following parameters
> ntttcp -r -m 64 <receiver> 
> 
> ntttcp -s <receiver side ip address>  -m 64 <sender>
> Both the VMs are in same Azure subnet and private IP address is used.
> MTU and tcp buffer is set accordingly and number of channels are set using ethtool
> accordingly for best performance. Also irqbalance was disabled.
> https://github.com/microsoft/ntttcp-for-linux
> https://learn.microsoft.com/en-us/azure/virtual-network/virtual-network-bandwidth-testing?tabs=linux

OK. Can you also print the before/after outputs of ntttcp that demonstrate
+15%?

> > > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > > ---
> > 
> > [...]
> > 
> > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> > >  1 file changed, 83 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > index 6367de0c2c2e..18e8908c5d29 100644
> > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> > >  	r->size = 0;
> > >  }
> > >  
> > > +static int irq_setup(int *irqs, int nvec, int start_numa_node)

Is it intentional that irqs and nvec are signed? If not, please make
them unsigned.

> > > +{
> > > +	int w, cnt, cpu, err = 0, i = 0;
> > > +	int next_node = start_numa_node;
> > 
> > What for this?
> This is the local numa node, from where to start hopping.
> Please see how we are calling irq_setup(). We are passing the array of allocated irqs, total
> number of irqs allocated, and the local numa node to the device.

I'll ask again: you copy parameter (start_numa_node) to a local
variable (next_node), and never use start_numa_node after that.

You can just use the parameter and avoid creating a local variable at
all, so why does the latter exist?

The naming is confusing. I think just 'node' is OK here.

> > > +	const struct cpumask *next, *prev = cpu_none_mask;
> > > +	cpumask_var_t curr, cpus;
> > > +
> > > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> > > +		err = -ENOMEM;
> > > +		return err;
> > > +	}
> > > +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
> > 
> >                 free(curr);
> Will fix it in next version. Thanks for pointing.

And also drop 'err' - just 'return -ENOMEM'.

> > 
> > > +		err = -ENOMEM;
> > > +		return err;
> > > +	}
> > > +
> > > +	rcu_read_lock();
> > > +	for_each_numa_hop_mask(next, next_node) {
> > > +		cpumask_andnot(curr, next, prev);
> > > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> > > +			cpumask_copy(cpus, curr);
> > > +			for_each_cpu(cpu, cpus) {
> > > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > > +				if (++i == nvec)
> > > +					goto done;
> > 
> > Think what if you're passed with irq_setup(NULL, 0, 0).
> > That's why I suggested to place this check at the beginning.
> > 
> irq_setup() is a helper function for mana_gd_setup_irqs(), which already takes
> care of no NULL pointer for irqs, and 0 number of interrupts can not be passed.
> 
> nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> if (nvec < 0)
> 	return nvec;

I know that. But still it's a bug. The common convention is that if a
0-length array is passed to a function, it should not dereference the
pointer.

...

> > > @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > >  			goto free_irq;
> > >  		}
> > >  
> > > -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > -		if (err)
> > > -			goto free_irq;
> > > -
> > > -		cpu = cpumask_local_spread(i, gc->numa_node);
> > > -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > > +		if (!i) {
> > > +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > +			if (err)
> > > +				goto free_irq;
> > > +
> > > +			/* If number of IRQ is one extra than number of online CPUs,
> > > +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> > > +			 * same CPU.
> > > +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> > > +			 * Also we are using cpumask_local_spread instead of
> > > +			 * cpumask_first for the node, because the node can be
> > > +			 * mem only.
> > > +			 */
> > > +			if (start_irq_index) {
> > > +				cpu = cpumask_local_spread(i, gc->numa_node);
> > 
> > I already mentioned that: if i == 0, you don't need to spread, just
> > pick 1st cpu from node.
> The reason I have picked cpumask_local_spread here, is that, the gc->numa_node 
> can be a memory only node, in that case we need to jump to next node to get the CPU.
> Which cpumask_local_spread() using sched_numa_find_nth_cpu() takes care off.

OK, makes sense.

What if you need to distribute more IRQs than the number of CPUs? In
that case you'd call the function many times. But because you return
0, the user has no chance to catch that. I think you should handle it inside
the helper, or do something like this:

        while (nvec) {
                distributed = irq_setup(irqs, nvec, node);
                if (distributed < 0)
                        break;

                irq += distributed;
                nvec -= distributed;
        }

Thanks,
Yury
  
Souradeep Chakrabarti Dec. 12, 2023, 6:03 a.m. UTC | #7
On Mon, Dec 11, 2023 at 06:00:22AM -0800, Yury Norov wrote:
> On Sun, Dec 10, 2023 at 10:53:23PM -0800, Souradeep Chakrabarti wrote:
> > On Fri, Dec 08, 2023 at 01:53:51PM -0800, Yury Norov wrote:
> > > Few more nits
> > > 
> > > On Fri, Dec 08, 2023 at 06:03:40AM -0800, Yury Norov wrote:
> > > > On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > > > > Existing MANA design assigns IRQ to every CPU, including sibling
> > > > > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > > > > in the same core and may reduce the network performance with RSS.
> > > > 
> > > > Can you add an IRQ distribution diagram to compare before/after
> > > > behavior, similarly to what I did in the other email?
> > > > 
> > > > > Improve the performance by assigning IRQ to non sibling CPUs in local
> > > > > NUMA node. The performance improvement we are getting using ntttcp with
> > > > > following patch is around 15 percent with existing design and approximately
> > > > > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > > > > if enough cores are present.
> > > > 
> > > > How did you measure it? In the other email you said you used perf, can
> > > > you show your procedure in details?
> > > > 
> > > > > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > > > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > > > > ---
> > > > 
> > > > [...]
> > > > 
> > > > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> > > > >  1 file changed, 83 insertions(+), 9 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > index 6367de0c2c2e..18e8908c5d29 100644
> > > > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> > > > >  	r->size = 0;
> > > > >  }
> > > > >  
> > > > > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> > > > > +{
> > > > > +	int w, cnt, cpu, err = 0, i = 0;
> > > > > +	int next_node = start_numa_node;
> > > > 
> > > > What for this?
> > > > 
> > > > > +	const struct cpumask *next, *prev = cpu_none_mask;
> > > > > +	cpumask_var_t curr, cpus;
> > > > > +
> > > > > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> > > 
> > > alloc_cpumask_var() here and below, because you initialize them by
> > > copying
> > I have used zalloc here as prev gets initialized after the first hop, before that
> > it may contain unwanted values, which may impact cpumask_andnot(curr, next, prev).
> > Regarding curr I will change it to alloc_cpumask_var().
> > Please let me know if that sounds right.
> 
> What? prev is initialized at declaration:
Yes, I will remove the zalloc and change it to alloc in V6.
Thanks for pointing it out.
>         
>         const struct cpumask *next, *prev = cpu_none_mask;
  
Souradeep Chakrabarti Dec. 12, 2023, 11:38 a.m. UTC | #8
On Mon, Dec 11, 2023 at 07:30:46AM -0800, Yury Norov wrote:
> On Sun, Dec 10, 2023 at 10:37:26PM -0800, Souradeep Chakrabarti wrote:
> > On Fri, Dec 08, 2023 at 06:03:39AM -0800, Yury Norov wrote:
> > > On Fri, Dec 08, 2023 at 02:02:34AM -0800, Souradeep Chakrabarti wrote:
> > > > Existing MANA design assigns IRQ to every CPU, including sibling
> > > > hyper-threads. This may cause multiple IRQs to be active simultaneously
> > > > in the same core and may reduce the network performance with RSS.
> > > 
> > > Can you add an IRQ distribution diagram to compare before/after
> > > behavior, similarly to what I did in the other email?
> > > 
> > Let's consider this topology:
> 
> Not here - in commit message, please.
Okay, will do that.
> 
> > 
> > Node            0               1
> > Core        0       1       2       3
> > CPU       0   1   2   3   4   5   6   7
> > 
> >  Before  
> >  IRQ     Nodes   Cores   CPUs
> >  0       1       0       0
> >  1       1       1       2
> >  2       1       0       1
> >  3       1       1       3
> >  4       2       2       4
> >  5       2       3       6
> >  6       2       2       5
> >  7       2       3       7
> >  
> >  Now
> >  IRQ     Nodes   Cores   CPUs
> >  0       1       0       0-1
> >  1       1       1       2-3
> >  2       1       0       0-1
> >  3       1       1       2-3
> >  4       2       2       4-5
> >  5       2       3       6-7
> >  6       2       2       4-5
> >  7       2       3       6-7
> 
> If you decided to take my wording, please give credits.
> 
Will take care of that :).
> > > > Improve the performance by assigning IRQ to non sibling CPUs in local
> > > > NUMA node. The performance improvement we are getting using ntttcp with
> > > > following patch is around 15 percent with existing design and approximately
> > > > 11 percent, when trying to assign one IRQ in each core across NUMA nodes,
> > > > if enough cores are present.
> > > 
> > > How did you measure it? In the other email you said you used perf, can
> > > you show your procedure in details?
> > I have used ntttcp for performance analysis, by perf I had meant performance
> > analysis. I have used ntttcp with following parameters
> > ntttcp -r -m 64 <receiver> 
> > 
> > ntttcp -s <receiver side ip address>  -m 64 <sender>
> > Both the VMs are in same Azure subnet and private IP address is used.
> > MTU and tcp buffer is set accordingly and number of channels are set using ethtool
> > accordingly for best performance. Also irqbalance was disabled.
> > https://github.com/microsoft/ntttcp-for-linux
> > https://learn.microsoft.com/en-us/azure/virtual-network/virtual-network-bandwidth-testing?tabs=linux
> 
> OK. Can you also print the before/after outputs of ntttcp that demonstrate
> +15%?
> 
With affinity spread such that each core gets only one IRQ, spreading across
multiple NUMA nodes:

./ntttcp -r -m 16
NTTTCP for Linux 1.4.0
---------------------------------------------------------
08:05:20 INFO: 17 threads created
08:05:28 INFO: Network activity progressing...
08:06:28 INFO: Test run completed.
08:06:28 INFO: Test cycle finished.
08:06:28 INFO: #####  Totals:  #####
08:06:28 INFO: test duration    :60.00 seconds
08:06:28 INFO: total bytes      :630292053310
08:06:28 INFO:   throughput     :84.04Gbps
08:06:28 INFO:   retrans segs   :4
08:06:28 INFO: cpu cores        :192
08:06:28 INFO:   cpu speed      :3799.725MHz
08:06:28 INFO:   user           :0.05%
08:06:28 INFO:   system         :1.60%
08:06:28 INFO:   idle           :96.41%
08:06:28 INFO:   iowait         :0.00%
08:06:28 INFO:   softirq        :1.94%
08:06:28 INFO:   cycles/byte    :2.50
08:06:28 INFO: cpu busy (all)   :534.41%

With our new proposal

./ntttcp -r -m 16
NTTTCP for Linux 1.4.0
---------------------------------------------------------
08:08:51 INFO: 17 threads created
08:08:56 INFO: Network activity progressing...
08:09:56 INFO: Test run completed.
08:09:56 INFO: Test cycle finished.
08:09:56 INFO: #####  Totals:  #####
08:09:56 INFO: test duration    :60.00 seconds
08:09:56 INFO: total bytes      :741966608384
08:09:56 INFO:   throughput     :98.93Gbps
08:09:56 INFO:   retrans segs   :6
08:09:56 INFO: cpu cores        :192
08:09:56 INFO:   cpu speed      :3799.791MHz
08:09:56 INFO:   user           :0.06%
08:09:56 INFO:   system         :1.81%
08:09:56 INFO:   idle           :96.18%
08:09:56 INFO:   iowait         :0.00%
08:09:56 INFO:   softirq        :1.95%
08:09:56 INFO:   cycles/byte    :2.25
08:09:56 INFO: cpu busy (all)   :569.22%
---------------------------------------------------------

> > > > Suggested-by: Yury Norov <yury.norov@gmali.com>
> > > > Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> > > > ---
> > > 
> > > [...]
> > > 
> > > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 92 +++++++++++++++++--
> > > >  1 file changed, 83 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > index 6367de0c2c2e..18e8908c5d29 100644
> > > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > @@ -1243,15 +1243,56 @@ void mana_gd_free_res_map(struct gdma_resource *r)
> > > >  	r->size = 0;
> > > >  }
> > > >  
> > > > +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> 
> Is it intentional that irqs and nvec are signed? If not, please make
> them unsigned.
Will do it in next version.
> 
> > > > +{
> > > > +	int w, cnt, cpu, err = 0, i = 0;
> > > > +	int next_node = start_numa_node;
> > > 
> > > What for this?
> > This is the local numa node, from where to start hopping.
> > Please see how we are calling irq_setup(). We are passing the array of allocated irqs, total
> > number of irqs allocated, and the local numa node to the device.
> 
> I'll ask again: you copy parameter (start_numa_node) to a local
> variable (next_node), and never use start_numa_node after that.
> 
> You can just use the parameter, and avoid creating local variable at
> all, so what for the latter exist?
> 
> The naming is confusing. I think just 'node' is OK here.
Thanks, I will not use the extra variable next_node.
> 
> > > > +	const struct cpumask *next, *prev = cpu_none_mask;
> > > > +	cpumask_var_t curr, cpus;
> > > > +
> > > > +	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
> > > > +		err = -ENOMEM;
> > > > +		return err;
> > > > +	}
> > > > +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
> > > 
> > >                 free(curr);
> > Will fix it in next version. Thanks for pointing.
> 
> And also drop 'err' - just 'return -ENOMEM'.
> 
Will fix it in next revision.
> > > 
> > > > +		err = -ENOMEM;
> > > > +		return err;
> > > > +	}
> > > > +
> > > > +	rcu_read_lock();
> > > > +	for_each_numa_hop_mask(next, next_node) {
> > > > +		cpumask_andnot(curr, next, prev);
> > > > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> > > > +			cpumask_copy(cpus, curr);
> > > > +			for_each_cpu(cpu, cpus) {
> > > > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > > > +				if (++i == nvec)
> > > > +					goto done;
> > > 
> > > Think what if you're passed with irq_setup(NULL, 0, 0).
> > > That's why I suggested to place this check at the beginning.
> > > 
> > irq_setup() is a helper function for mana_gd_setup_irqs(), which already takes
> > care of no NULL pointer for irqs, and 0 number of interrupts can not be passed.
> > 
> > nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> > if (nvec < 0)
> > 	return nvec;
> 
> I know that. But still it's a bug. The common convention is that if a
> 0-length array is passed to a function, it should not dereference the
> pointer.
> 
I will add an if check at the beginning of irq_setup() to verify the pointer
and the nvec number.
> ...
> 
> > > > @@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> > > >  			goto free_irq;
> > > >  		}
> > > >  
> > > > -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > > -		if (err)
> > > > -			goto free_irq;
> > > > -
> > > > -		cpu = cpumask_local_spread(i, gc->numa_node);
> > > > -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> > > > +		if (!i) {
> > > > +			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> > > > +			if (err)
> > > > +				goto free_irq;
> > > > +
> > > > +			/* If number of IRQ is one extra than number of online CPUs,
> > > > +			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
> > > > +			 * same CPU.
> > > > +			 * Else we will use different CPUs for IRQ0 and IRQ1.
> > > > +			 * Also we are using cpumask_local_spread instead of
> > > > +			 * cpumask_first for the node, because the node can be
> > > > +			 * mem only.
> > > > +			 */
> > > > +			if (start_irq_index) {
> > > > +				cpu = cpumask_local_spread(i, gc->numa_node);
> > > 
> > > I already mentioned that: if i == 0, you don't need to spread, just
> > > pick 1st cpu from node.
> > The reason I have picked cpumask_local_spread here, is that, the gc->numa_node 
> > can be a memory only node, in that case we need to jump to next node to get the CPU.
> > Which cpumask_local_spread() using sched_numa_find_nth_cpu() takes care off.
> 
> OK, makes sense.
> 
> What if you need to distribute more IRQs than the number of CPUs? In
> that case you'd call the function many times. But because you return
> 0, user has no chance catch that. I think you should handle it inside
> the helper, or do like this:
> 
>         while (nvec) {
>                 distributed = irq_setup(irqs, nvec, node);
>                 if (distributed < 0)
>                         break;
> 
>                 irq += distributed;
>                 nvec -= distributed;
>         }
We cannot have more IRQs than num_online_cpus() + 1, as we take the count
inside cpus_read_lock() with num_online_cpus().
We can have a minimum of 2 IRQs and a maximum of num_online_cpus() + 1,
capped at 64, which is the maximum number of supported IRQs per port.

1295         cpus_read_lock();
1296         max_queues_per_port = num_online_cpus();
1297         if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
1298                 max_queues_per_port = MANA_MAX_NUM_QUEUES;
1299
1300         /* Need 1 interrupt for the Hardware communication Channel (HWC) */
1301         max_irqs = max_queues_per_port + 1;
1302
1303         nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
1304         if (nvec < 0)
1305                 return nvec;
> 
> Thanks,
> Yury
  
Yury Norov Dec. 12, 2023, 4:34 p.m. UTC | #9
> > > > > +	rcu_read_lock();
> > > > > +	for_each_numa_hop_mask(next, next_node) {
> > > > > +		cpumask_andnot(curr, next, prev);
> > > > > +		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
> > > > > +			cpumask_copy(cpus, curr);
> > > > > +			for_each_cpu(cpu, cpus) {
> > > > > +				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
> > > > > +				if (++i == nvec)
> > > > > +					goto done;
> > > > 
> > > > Think what if you're passed with irq_setup(NULL, 0, 0).
> > > > That's why I suggested to place this check at the beginning.
> > > > 
> > > irq_setup() is a helper function for mana_gd_setup_irqs(), which already takes
> > > care of no NULL pointer for irqs, and 0 number of interrupts can not be passed.
> > > 
> > > nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> > > if (nvec < 0)
> > > 	return nvec;
> > 
> > I know that. But still it's a bug. The common convention is that if a
> > 0-length array is passed to a function, it should not dereference the
> > pointer.
> > 
> I will add one if check in the begining of irq_setup() to verify the pointer
> and the nvec number.

Yes you can, but what for? This is an error anyways, and you don't
care about early return. So instead of adding and bearing extra logic,
I'd just swap 2 lines of existing code.
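
Taken literally, the swap would read roughly like this (just a sketch; the
next message walks through why the increment placement then matters):

			for_each_cpu(cpu, cpus) {
				if (i++ == nvec)	/* or ++i; either way irqs[0] gets skipped */
					goto done;
				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
				...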
  
Souradeep Chakrabarti Dec. 12, 2023, 5:18 p.m. UTC | #10
>-----Original Message-----
>From: Yury Norov <yury.norov@gmail.com>
>Sent: Tuesday, December 12, 2023 10:04 PM
>To: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
>Cc: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
><haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
><decui@microsoft.com>; davem@davemloft.net; edumazet@google.com;
>kuba@kernel.org; pabeni@redhat.com; Long Li <longli@microsoft.com>;
>leon@kernel.org; cai.huoqing@linux.dev; ssengar@linux.microsoft.com;
>vkuznets@redhat.com; tglx@linutronix.de; linux-hyperv@vger.kernel.org;
>netdev@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
>rdma@vger.kernel.org; Souradeep Chakrabarti <schakrabarti@microsoft.com>;
>Paul Rosswurm <paulros@microsoft.com>
>Subject: [EXTERNAL] Re: [PATCH V5 net-next] net: mana: Assigning IRQ affinity on
>HT cores
>
>
>> > > > > +     rcu_read_lock();
>> > > > > +     for_each_numa_hop_mask(next, next_node) {
>> > > > > +             cpumask_andnot(curr, next, prev);
>> > > > > +             for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
>> > > > > +                     cpumask_copy(cpus, curr);
>> > > > > +                     for_each_cpu(cpu, cpus) {
>> > > > > +                             irq_set_affinity_and_hint(irqs[i],
>topology_sibling_cpumask(cpu));
>> > > > > +                             if (++i == nvec)
>> > > > > +                                     goto done;
>> > > >
>> > > > Think what if you're passed with irq_setup(NULL, 0, 0).
>> > > > That's why I suggested to place this check at the beginning.
>> > > >
>> > > irq_setup() is a helper function for mana_gd_setup_irqs(), which
>> > > already takes care of no NULL pointer for irqs, and 0 number of interrupts can
>not be passed.
>> > >
>> > > nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX); if
>> > > (nvec < 0)
>> > >   return nvec;
>> >
>> > I know that. But still it's a bug. The common convention is that if
>> > a 0-length array is passed to a function, it should not dereference
>> > the pointer.
>> >
>> I will add one if check in the begining of irq_setup() to verify the
>> pointer and the nvec number.
>
>Yes you can, but what for? This is an error anyway, and you don't care about an early
>return. So instead of adding and carrying extra logic, I'd just swap two lines of the
>existing code.
The problem with the code you proposed is shown below:

> ./a.out
 i is 1
 i is 2
 i is 3
 i is 4
 i is 5
 i is 6
 i is 7
 i is 8
 i is 9
 i is 10
in done
lisatest ~
> cat test3.c
#include <stdio.h>

int main(void)
{
        int i = 0, cur, nvec = 10;

        for (cur = 0; cur < 20; cur++) {
                if (i++ == nvec)
                        goto done;
                printf(" i is %d\n", i);
        }
done:
        printf("in done\n");
        return 0;
}

This happens because of the post-increment operator in i++.
For that reason, in the proposed code we will hit irqs[nvec], which may cause a crash, since the size of
irqs is nvec.

Now if we pre-increment, the loop terminates correctly, but the nvec == 0 check will never happen.

For example, with pre-increment in the above code we are not hitting (i == nvec):
> ./a.out
 i is 1
 i is 2
 i is 3
 i is 4
 i is 5
 i is 6
 i is 7
 i is 8
 i is 9
in done

So with pre-increment, if we want the check for nvec == 0, we will need an extra if condition
before the loop.
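
For reference, here is a minimal sketch of the pre-increment variant described above
(my reconstruction; only the if line differs from test3.c). With nvec = 10 it produces
the output shown above, and with nvec = 0 the (++i == nvec) comparison can never be
true, so the guard is silently lost:

#include <stdio.h>

int main(void)
{
        int i = 0, cur, nvec = 10;      /* set nvec = 0 to see the missed check */

        for (cur = 0; cur < 20; cur++) {
                if (++i == nvec)        /* pre-increment: never matches when nvec == 0 */
                        goto done;
                printf(" i is %d\n", i);
        }
done:
        printf("in done\n");
        return 0;
}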
  
Yury Norov Dec. 12, 2023, 5:40 p.m. UTC | #11
On Tue, Dec 12, 2023 at 05:18:31PM +0000, Souradeep Chakrabarti wrote:
> [...]
>
> So with pre-increment, if we want the check for nvec == 0, we will need an extra
> if condition before the loop.

OK, I see. Then just separate it:

         for (cur = 0; cur < 20; cur++) {
                 if (i == nvec)
                         goto done;
                 printf(" i is %d\n", i++);
  
Suman Ghosh Dec. 12, 2023, 6:17 p.m. UTC | #12
Hi Souradeep,

Please find inline for couple of comments.

>+
>+	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
>+		err = -ENOMEM;
>+		return err;
>+	}
>+	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
[Suman] memory leak here, should free 'curr'.
>+		err = -ENOMEM;
>+		return err;
>+	}
>+
>+	rcu_read_lock();
>+	for_each_numa_hop_mask(next, next_node) {
>+		cpumask_andnot(curr, next, prev);
>+		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
>+			cpumask_copy(cpus, curr);
>+			for_each_cpu(cpu, cpus) {
>+				irq_set_affinity_and_hint(irqs[i],
>topology_sibling_cpumask(cpu));
>+				if (++i == nvec)
>+					goto done;
>+				cpumask_andnot(cpus, cpus,
>topology_sibling_cpumask(cpu));
>+				++cnt;
>+			}
>+		}
>+		prev = next;
>+	}
>+done:
>+	rcu_read_unlock();
>+	free_cpumask_var(curr);
>+	free_cpumask_var(cpus);
>+	return err;
>+}
>+
> static int mana_gd_setup_irqs(struct pci_dev *pdev)  {
>-	unsigned int max_queues_per_port = num_online_cpus();
> 	struct gdma_context *gc = pci_get_drvdata(pdev);
>+	unsigned int max_queues_per_port;
> 	struct gdma_irq_context *gic;
> 	unsigned int max_irqs, cpu;
>-	int nvec, irq;
>+	int start_irq_index = 1;
>+	int nvec, *irqs, irq;
> 	int err, i = 0, j;
>
>+	cpus_read_lock();
>+	max_queues_per_port = num_online_cpus();
> 	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> 		max_queues_per_port = MANA_MAX_NUM_QUEUES;
>
>@@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev
>*pdev)
> 	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> 	if (nvec < 0)
[Suman] cpus_read_unlock()?
> 		return nvec;
>+	if (nvec <= num_online_cpus())
>+		start_irq_index = 0;
>+
>+	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int),
>GFP_KERNEL);
>+	if (!irqs) {
>+		err = -ENOMEM;
>+		goto free_irq_vector;
>+	}
>
> 	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> 				   GFP_KERNEL);
>@@ -1287,21 +1336,44 @@ static int mana_gd_setup_irqs(struct pci_dev
>*pdev)
> 			goto free_irq;
> 		}
>
>-		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
>-		if (err)
>-			goto free_irq;
>-
>-		cpu = cpumask_local_spread(i, gc->numa_node);
>-		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
>+		if (!i) {
>+			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
>+			if (err)
>+				goto free_irq;
>+
>+			/* If number of IRQ is one extra than number of online
>CPUs,
>+			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
>+			 * same CPU.
>+			 * Else we will use different CPUs for IRQ0 and IRQ1.
>+			 * Also we are using cpumask_local_spread instead of
>+			 * cpumask_first for the node, because the node can be
>+			 * mem only.
>+			 */
>+			if (start_irq_index) {
>+				cpu = cpumask_local_spread(i, gc->numa_node);
>+				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
>+			} else {
>+				irqs[start_irq_index] = irq;
>+			}
>+		} else {
>+			irqs[i - start_irq_index] = irq;
>+			err = request_irq(irqs[i - start_irq_index],
>mana_gd_intr, 0,
>+					  gic->name, gic);
>+			if (err)
>+				goto free_irq;
>+		}
> 	}
>
>+	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
>+	if (err)
>+		goto free_irq;
> 	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> 	if (err)
> 		goto free_irq;
>
> 	gc->max_num_msix = nvec;
> 	gc->num_msix_usable = nvec;
>-
>+	cpus_read_unlock();
> 	return 0;
>
> free_irq:
>@@ -1314,8 +1386,10 @@ static int mana_gd_setup_irqs(struct pci_dev
>*pdev)
> 	}
>
> 	kfree(gc->irq_contexts);
>+	kfree(irqs);
> 	gc->irq_contexts = NULL;
> free_irq_vector:
>+	cpus_read_unlock();
> 	pci_free_irq_vectors(pdev);
> 	return err;
> }
>--
>2.34.1
>
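
Taken together, the two review comments above (freeing 'curr' when the second cpumask
allocation fails, and dropping the CPU hotplug lock on the early return) suggest error
paths along the following lines for the next version. This is only a sketch of the
fixes being requested, not the posted patch:

	/* irq_setup(): free the first mask if the second allocation fails */
	if (!zalloc_cpumask_var(&curr, GFP_KERNEL))
		return -ENOMEM;
	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
		free_cpumask_var(curr);
		return -ENOMEM;
	}

	/* mana_gd_setup_irqs(): unlock before the early return */
	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
	if (nvec < 0) {
		cpus_read_unlock();
		return nvec;
	}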
  
Souradeep Chakrabarti Dec. 12, 2023, 6:22 p.m. UTC | #13
>-----Original Message-----
>From: Suman Ghosh <sumang@marvell.com>
>Sent: Tuesday, December 12, 2023 11:48 PM
>To: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>; KY Srinivasan
><kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>;
>wei.liu@kernel.org; Dexuan Cui <decui@microsoft.com>; davem@davemloft.net;
>edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; Long Li
><longli@microsoft.com>; yury.norov@gmail.com; leon@kernel.org;
>cai.huoqing@linux.dev; ssengar@linux.microsoft.com; vkuznets@redhat.com;
>tglx@linutronix.de; linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; linux-
>kernel@vger.kernel.org; linux-rdma@vger.kernel.org
>Cc: Souradeep Chakrabarti <schakrabarti@microsoft.com>; Paul Rosswurm
><paulros@microsoft.com>
>Subject: [EXTERNAL] RE: [EXT] [PATCH V5 net-next] net: mana: Assigning IRQ
>affinity on HT cores
>
>Hi Souradeep,
>
>Please find inline for couple of comments.
>
>>+
>>+      if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
>>+              err = -ENOMEM;
>>+              return err;
>>+      }
>>+      if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
>[Suman] memory leak here, should free 'curr'.
This will be taken care of in the next version.
>>+              err = -ENOMEM;
>>+              return err;
>>+      }
>>+
>>+      rcu_read_lock();
>>+      for_each_numa_hop_mask(next, next_node) {
>>+              cpumask_andnot(curr, next, prev);
>>+              for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
>>+                      cpumask_copy(cpus, curr);
>>+                      for_each_cpu(cpu, cpus) {
>>+                              irq_set_affinity_and_hint(irqs[i],
>>topology_sibling_cpumask(cpu));
>>+                              if (++i == nvec)
>>+                                      goto done;
>>+                              cpumask_andnot(cpus, cpus,
>>topology_sibling_cpumask(cpu));
>>+                              ++cnt;
>>+                      }
>>+              }
>>+              prev = next;
>>+      }
>>+done:
>>+      rcu_read_unlock();
>>+      free_cpumask_var(curr);
>>+      free_cpumask_var(cpus);
>>+      return err;
>>+}
>>+
>> static int mana_gd_setup_irqs(struct pci_dev *pdev)  {
>>-      unsigned int max_queues_per_port = num_online_cpus();
>>       struct gdma_context *gc = pci_get_drvdata(pdev);
>>+      unsigned int max_queues_per_port;
>>       struct gdma_irq_context *gic;
>>       unsigned int max_irqs, cpu;
>>-      int nvec, irq;
>>+      int start_irq_index = 1;
>>+      int nvec, *irqs, irq;
>>       int err, i = 0, j;
>>
>>+      cpus_read_lock();
>>+      max_queues_per_port = num_online_cpus();
>>       if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
>>               max_queues_per_port = MANA_MAX_NUM_QUEUES;
>>
>>@@ -1261,6 +1302,14 @@ static int mana_gd_setup_irqs(struct pci_dev
>>*pdev)
>>       nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
>>       if (nvec < 0)
>[Suman] cpus_read_unlock()?
Thanks for pointing this out; it will be taken care of in V6.
>>               return nvec;
[...]
  

Patch

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 6367de0c2c2e..18e8908c5d29 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1243,15 +1243,56 @@  void mana_gd_free_res_map(struct gdma_resource *r)
 	r->size = 0;
 }
 
+static int irq_setup(int *irqs, int nvec, int start_numa_node)
+{
+	int w, cnt, cpu, err = 0, i = 0;
+	int next_node = start_numa_node;
+	const struct cpumask *next, *prev = cpu_none_mask;
+	cpumask_var_t curr, cpus;
+
+	if (!zalloc_cpumask_var(&curr, GFP_KERNEL)) {
+		err = -ENOMEM;
+		return err;
+	}
+	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
+		err = -ENOMEM;
+		return err;
+	}
+
+	rcu_read_lock();
+	for_each_numa_hop_mask(next, next_node) {
+		cpumask_andnot(curr, next, prev);
+		for (w = cpumask_weight(curr), cnt = 0; cnt < w; ) {
+			cpumask_copy(cpus, curr);
+			for_each_cpu(cpu, cpus) {
+				irq_set_affinity_and_hint(irqs[i], topology_sibling_cpumask(cpu));
+				if (++i == nvec)
+					goto done;
+				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
+				++cnt;
+			}
+		}
+		prev = next;
+	}
+done:
+	rcu_read_unlock();
+	free_cpumask_var(curr);
+	free_cpumask_var(cpus);
+	return err;
+}
+
 static int mana_gd_setup_irqs(struct pci_dev *pdev)
 {
-	unsigned int max_queues_per_port = num_online_cpus();
 	struct gdma_context *gc = pci_get_drvdata(pdev);
+	unsigned int max_queues_per_port;
 	struct gdma_irq_context *gic;
 	unsigned int max_irqs, cpu;
-	int nvec, irq;
+	int start_irq_index = 1;
+	int nvec, *irqs, irq;
 	int err, i = 0, j;
 
+	cpus_read_lock();
+	max_queues_per_port = num_online_cpus();
 	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
 		max_queues_per_port = MANA_MAX_NUM_QUEUES;
 
@@ -1261,6 +1302,14 @@  static int mana_gd_setup_irqs(struct pci_dev *pdev)
 	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
 	if (nvec < 0)
 		return nvec;
+	if (nvec <= num_online_cpus())
+		start_irq_index = 0;
+
+	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
+	if (!irqs) {
+		err = -ENOMEM;
+		goto free_irq_vector;
+	}
 
 	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
 				   GFP_KERNEL);
@@ -1287,21 +1336,44 @@  static int mana_gd_setup_irqs(struct pci_dev *pdev)
 			goto free_irq;
 		}
 
-		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
-		if (err)
-			goto free_irq;
-
-		cpu = cpumask_local_spread(i, gc->numa_node);
-		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
+		if (!i) {
+			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+			if (err)
+				goto free_irq;
+
+			/* If number of IRQ is one extra than number of online CPUs,
+			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
+			 * same CPU.
+			 * Else we will use different CPUs for IRQ0 and IRQ1.
+			 * Also we are using cpumask_local_spread instead of
+			 * cpumask_first for the node, because the node can be
+			 * mem only.
+			 */
+			if (start_irq_index) {
+				cpu = cpumask_local_spread(i, gc->numa_node);
+				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
+			} else {
+				irqs[start_irq_index] = irq;
+			}
+		} else {
+			irqs[i - start_irq_index] = irq;
+			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
+					  gic->name, gic);
+			if (err)
+				goto free_irq;
+		}
 	}
 
+	err = irq_setup(irqs, (nvec - start_irq_index), gc->numa_node);
+	if (err)
+		goto free_irq;
 	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
 	if (err)
 		goto free_irq;
 
 	gc->max_num_msix = nvec;
 	gc->num_msix_usable = nvec;
-
+	cpus_read_unlock();
 	return 0;
 
 free_irq:
@@ -1314,8 +1386,10 @@  static int mana_gd_setup_irqs(struct pci_dev *pdev)
 	}
 
 	kfree(gc->irq_contexts);
+	kfree(irqs);
 	gc->irq_contexts = NULL;
 free_irq_vector:
+	cpus_read_unlock();
 	pci_free_irq_vectors(pdev);
 	return err;
 }