[05/39] genirq/msi: Remove filter from msi_free_descs_free_range()

Message ID 20221111122013.888850936@linutronix.de
State New
Headers
Series genirq, PCI/MSI: Support for per device MSI and PCI/IMS - Part 1 cleanups |

Commit Message

Thomas Gleixner Nov. 11, 2022, 1:54 p.m. UTC
  When a range of descriptors is freed then all of them are not associated to
a linux interrupt. Remove the filter and add a warning to the free function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/base/platform-msi.c |    2 +-
 include/linux/msi.h         |    5 ++---
 kernel/irq/msi.c            |   19 ++++++++++---------
 3 files changed, 13 insertions(+), 13 deletions(-)
  

Comments

Jason Gunthorpe Nov. 16, 2022, 5:43 p.m. UTC | #1
On Fri, Nov 11, 2022 at 02:54:22PM +0100, Thomas Gleixner wrote:
> When a range of descriptors is freed then all of them are not associated to
> a linux interrupt. Remove the filter and add a warning to the free function.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  drivers/base/platform-msi.c |    2 +-
>  include/linux/msi.h         |    5 ++---
>  kernel/irq/msi.c            |   19 ++++++++++---------
>  3 files changed, 13 insertions(+), 13 deletions(-)

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>

Jason
  
Miquel Raynal March 1, 2023, 10:55 a.m. UTC | #2
Hi Thomas,

tglx@linutronix.de wrote on Fri, 11 Nov 2022 14:54:22 +0100 (CET):

> When a range of descriptors is freed then all of them are not associated to
> a linux interrupt. Remove the filter and add a warning to the free function.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---

[...]

> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -120,7 +120,7 @@ static int msi_add_simple_msi_descs(stru
>  fail_mem:
>  	ret = -ENOMEM;
>  fail:
> -	msi_free_msi_descs_range(dev, MSI_DESC_ALL, index, last);
> +	msi_free_msi_descs_range(dev, index, last);
>  	return ret;
>  }
>  
> @@ -141,12 +141,11 @@ static bool msi_desc_match(struct msi_de
>  /**
>   * msi_free_msi_descs_range - Free MSI descriptors of a device
>   * @dev:		Device to free the descriptors
> - * @filter:		Descriptor state filter
>   * @first_index:	Index to start freeing from
>   * @last_index:		Last index to be freed
>   */
> -void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
> -			      unsigned int first_index, unsigned int last_index)
> +void msi_free_msi_descs_range(struct device *dev, unsigned int first_index,
> +			      unsigned int last_index)
>  {
>  	struct xarray *xa = &dev->msi.data->__store;
>  	struct msi_desc *desc;
> @@ -155,10 +154,12 @@ void msi_free_msi_descs_range(struct dev
>  	lockdep_assert_held(&dev->msi.data->mutex);
>  
>  	xa_for_each_range(xa, idx, desc, first_index, last_index) {
> -		if (msi_desc_match(desc, filter)) {
> -			xa_erase(xa, idx);
> -			msi_free_desc(desc);
> -		}
> +		xa_erase(xa, idx);
> +
> +		/* Leak the descriptor when it is still referenced */
> +		if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
> +			continue;
> +		msi_free_desc(desc);
>  	}
>  }

It looks like since this commit I am getting warnings upon EPROBE_DEFER
errors in the mvpp2 Marvell Ethernet driver. I looked a bit at the
internals to understand why this warning was shown, and it seems that
nothing "de-references" the descriptors, which would mean here:
resetting desc->irq to 0.

In my case I don't think the mvpp2_main.c driver nor the
irq_mvebu_icu.c driver behind do anything strange (as far as I
understand them). I believe any error during a ->probe() leading
to an irq_dispose_mapping() call with MSI behind will trigger that
warning.

I am wondering how useful thisd WARN_ON() is, or otherwise where the
desc->irq entry should be zeroed (if I understand that correctly), any
help will be appreciated.

Here is the splat:

[    2.045857] ------------[ cut here ]------------
[    2.050497] WARNING: CPU: 2 PID: 9 at kernel/irq/msi.c:197 msi_domain_free_descs+0x120/0x128
[    2.058993] Modules linked in:
[    2.062068] CPU: 2 PID: 9 Comm: kworker/u8:0 Not tainted 6.2.0-rc1+ #168
[    2.068804] Hardware name: Delta TN48M (DT)
[    2.073008] Workqueue: events_unbound deferred_probe_work_func
[    2.078878] pstate: 00000005 (nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    2.085875] pc : msi_domain_free_descs+0x120/0x128
[    2.090693] lr : msi_domain_free_descs+0xe0/0x128
[    2.095423] sp : ffff80000a82b8d0
[    2.098745] x29: ffff80000a82b8d0 x28: 00007bfdbb508ca8 x27: ffff000102e28940
[    2.105921] x26: 0000000000000004 x25: ffff000100257660 x24: ffff800009a6b8d8
[    2.113096] x23: ffff800008a1c868 x22: ffff000102e4b700 x21: ffff000101494bb0
[    2.120270] x20: ffff80000a82b958 x19: ffff800009afe248 x18: 0000000000000010
[    2.127444] x17: fffffc0000fa4008 x16: 0000000000000008 x15: 000000000000013a
[    2.134618] x14: ffff80000a82b6a0 x13: 00000000ffffffea x12: ffff80000a82b870
[    2.141794] x11: 0000000000000002 x10: 0000000000000000 x9 : 0000000000000001
[    2.148967] x8 : 0000000000000000 x7 : 0000000000000238 x6 : ffff0001005f1230
[    2.156141] x5 : 0000000000000000 x4 : 0000200000000000 x3 : 0000000000000000
[    2.163315] x2 : b586accf01c45400 x1 : ffff0001000e0000 x0 : 000000000000002d
[    2.170489] Call trace:
[    2.172948]  msi_domain_free_descs+0x120/0x128
[    2.177417]  msi_domain_free_msi_descs_range+0x64/0x9c
[    2.182586]  platform_msi_device_domain_free+0x88/0xb8
[    2.187752]  mvebu_icu_irq_domain_free+0x60/0x80
[    2.192396]  irq_domain_free_irqs_hierarchy.part.21+0x94/0xa8
[    2.198173]  irq_domain_free_irqs+0xec/0x150
[    2.202466]  irq_dispose_mapping+0x104/0x120
[    2.206758]  mvpp2_probe+0x2028/0x21f8
[    2.210530]  platform_probe+0x68/0xd8
[    2.214210]  really_probe+0xbc/0x2a8
[    2.217807]  __driver_probe_device+0x78/0xe0
[    2.222102]  driver_probe_device+0x3c/0x108
[    2.226308]  __device_attach_driver+0xb8/0xf8
[    2.230690]  bus_for_each_drv+0x7c/0xd0
[    2.234547]  __device_attach+0xec/0x188
[    2.238404]  device_initial_probe+0x14/0x20
[    2.242611]  bus_probe_device+0x9c/0xa8
[    2.246468]  deferred_probe_work_func+0x88/0xc0
[    2.251024]  process_one_work+0x1fc/0x348
[    2.255056]  worker_thread+0x228/0x420
[    2.258825]  kthread+0x10c/0x118
[    2.262075]  ret_from_fork+0x10/0x20
[    2.265672] ---[ end trace 0000000000000000 ]---

Thanks,
Miquèl
  
Thomas Gleixner March 1, 2023, 9:07 p.m. UTC | #3
Miquel!

On Wed, Mar 01 2023 at 11:55, Miquel Raynal wrote:
> tglx@linutronix.de wrote on Fri, 11 Nov 2022 14:54:22 +0100 (CET):
>
>> When a range of descriptors is freed then all of them are not associated to
>> a linux interrupt. Remove the filter and add a warning to the free function.
>> +		/* Leak the descriptor when it is still referenced */
>> +		if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
>> +			continue;
>> +		msi_free_desc(desc);
>>  	}
>>  }
>
> It looks like since this commit I am getting warnings upon EPROBE_DEFER
> errors in the mvpp2 Marvell Ethernet driver. I looked a bit at the
> internals to understand why this warning was shown, and it seems that
> nothing "de-references" the descriptors, which would mean here:
> resetting desc->irq to 0.

Correct. This platform-msi ^(*&!@&^ hack really needs to die ASAP.

Marc, where are we on that? Is this still in limbo?

> I am wondering how useful thisd WARN_ON() is, or otherwise where the

It is useful as it caught bugs already.

> desc->irq entry should be zeroed (if I understand that correctly), any
> help will be appreciated.

Untested workaround below. I hate it with a passion, but *shrug*.

Thanks,

        tglx
---
 drivers/base/platform-msi.c |    1 +
 include/linux/msi.h         |    2 ++
 kernel/irq/msi.c            |   23 ++++++++++++++++++++++-
 3 files changed, 25 insertions(+), 1 deletion(-)

--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -324,6 +324,7 @@ void platform_msi_device_domain_free(str
 	struct platform_msi_priv_data *data = domain->host_data;
 
 	msi_lock_descs(data->dev);
+	msi_domain_depopulate_descs(data->dev, virq, nr_irqs);
 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
 	msi_free_msi_descs_range(data->dev, virq, virq + nr_irqs - 1);
 	msi_unlock_descs(data->dev);
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -631,6 +631,8 @@ int msi_domain_prepare_irqs(struct irq_d
 			    int nvec, msi_alloc_info_t *args);
 int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 			     int virq, int nvec, msi_alloc_info_t *args);
+void msi_domain_depopulate_descs(struct device *dev, int virq, int nvec);
+
 struct irq_domain *
 __platform_msi_create_device_domain(struct device *dev,
 				    unsigned int nvec,
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1109,14 +1109,35 @@ int msi_domain_populate_irqs(struct irq_
 	return 0;
 
 fail:
-	for (--virq; virq >= virq_base; virq--)
+	for (--virq; virq >= virq_base; virq--) {
+		msi_domain_depopulate_descs(dev, virq, 1);
 		irq_domain_free_irqs_common(domain, virq, 1);
+	}
 	msi_domain_free_descs(dev, &ctrl);
 unlock:
 	msi_unlock_descs(dev);
 	return ret;
 }
 
+void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
+{
+	struct msi_ctrl ctrl = {
+		.domid	= MSI_DEFAULT_DOMAIN,
+		.first  = virq_base,
+		.last	= virq_base + nvec - 1,
+	};
+	struct msi_desc *desc;
+	struct xarray *xa;
+	unsigned long idx;
+
+	if (!msi_ctrl_valid(dev, &ctrl))
+		return;
+
+	xa = &dev->msi.data->__domains[ctrl.domid].store;
+	xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
+		desc->irq = 0;
+}
+
 /*
  * Carefully check whether the device can use reservation mode. If
  * reservation mode is enabled then the early activation will assign a
  
Miquel Raynal March 2, 2023, 2:43 p.m. UTC | #4
Hi Thomas,

tglx@linutronix.de wrote on Wed, 01 Mar 2023 22:07:48 +0100:

> Miquel!
> 
> On Wed, Mar 01 2023 at 11:55, Miquel Raynal wrote:
> > tglx@linutronix.de wrote on Fri, 11 Nov 2022 14:54:22 +0100 (CET):
> >  
> >> When a range of descriptors is freed then all of them are not associated to
> >> a linux interrupt. Remove the filter and add a warning to the free function.
> >> +		/* Leak the descriptor when it is still referenced */
> >> +		if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
> >> +			continue;
> >> +		msi_free_desc(desc);
> >>  	}
> >>  }  
> >
> > It looks like since this commit I am getting warnings upon EPROBE_DEFER
> > errors in the mvpp2 Marvell Ethernet driver. I looked a bit at the
> > internals to understand why this warning was shown, and it seems that
> > nothing "de-references" the descriptors, which would mean here:
> > resetting desc->irq to 0.  
> 
> Correct. This platform-msi ^(*&!@&^ hack really needs to die ASAP.

:-)

> Marc, where are we on that? Is this still in limbo?
> 
> > I am wondering how useful thisd WARN_ON() is, or otherwise where the  
> 
> It is useful as it caught bugs already.

Sure.

> > desc->irq entry should be zeroed (if I understand that correctly), any
> > help will be appreciated.  
> 
> Untested workaround below.

Excellent!

> I hate it with a passion, but *shrug*.

:'D

> Thanks,
> 
>         tglx
> ---
>  drivers/base/platform-msi.c |    1 +
>  include/linux/msi.h         |    2 ++
>  kernel/irq/msi.c            |   23 ++++++++++++++++++++++-
>  3 files changed, 25 insertions(+), 1 deletion(-)

Kudos for the diff, which indeed works perfectly in my case. I guess
you'll make a proper patch soon, if that's the case you can add my:

Tested-by: Miquel Raynal <miquel.raynal@bootlin.com>

Let me know otherwise.

Thanks a lot for the very quick fix!
Miquèl

> --- a/drivers/base/platform-msi.c
> +++ b/drivers/base/platform-msi.c
> @@ -324,6 +324,7 @@ void platform_msi_device_domain_free(str
>  	struct platform_msi_priv_data *data = domain->host_data;
>  
>  	msi_lock_descs(data->dev);
> +	msi_domain_depopulate_descs(data->dev, virq, nr_irqs);
>  	irq_domain_free_irqs_common(domain, virq, nr_irqs);
>  	msi_free_msi_descs_range(data->dev, virq, virq + nr_irqs - 1);
>  	msi_unlock_descs(data->dev);
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -631,6 +631,8 @@ int msi_domain_prepare_irqs(struct irq_d
>  			    int nvec, msi_alloc_info_t *args);
>  int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
>  			     int virq, int nvec, msi_alloc_info_t *args);
> +void msi_domain_depopulate_descs(struct device *dev, int virq, int nvec);
> +
>  struct irq_domain *
>  __platform_msi_create_device_domain(struct device *dev,
>  				    unsigned int nvec,
> --- a/kernel/irq/msi.c
> +++ b/kernel/irq/msi.c
> @@ -1109,14 +1109,35 @@ int msi_domain_populate_irqs(struct irq_
>  	return 0;
>  
>  fail:
> -	for (--virq; virq >= virq_base; virq--)
> +	for (--virq; virq >= virq_base; virq--) {
> +		msi_domain_depopulate_descs(dev, virq, 1);
>  		irq_domain_free_irqs_common(domain, virq, 1);
> +	}
>  	msi_domain_free_descs(dev, &ctrl);
>  unlock:
>  	msi_unlock_descs(dev);
>  	return ret;
>  }
>  
> +void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec)
> +{
> +	struct msi_ctrl ctrl = {
> +		.domid	= MSI_DEFAULT_DOMAIN,
> +		.first  = virq_base,
> +		.last	= virq_base + nvec - 1,
> +	};
> +	struct msi_desc *desc;
> +	struct xarray *xa;
> +	unsigned long idx;
> +
> +	if (!msi_ctrl_valid(dev, &ctrl))
> +		return;
> +
> +	xa = &dev->msi.data->__domains[ctrl.domid].store;
> +	xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last)
> +		desc->irq = 0;
> +}
> +
>  /*
>   * Carefully check whether the device can use reservation mode. If
>   * reservation mode is enabled then the early activation will assign a
  

Patch

--- a/drivers/base/platform-msi.c
+++ b/drivers/base/platform-msi.c
@@ -325,7 +325,7 @@  void platform_msi_device_domain_free(str
 
 	msi_lock_descs(data->dev);
 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
-	msi_free_msi_descs_range(data->dev, MSI_DESC_ALL, virq, virq + nr_irqs - 1);
+	msi_free_msi_descs_range(data->dev, virq, virq + nr_irqs - 1);
 	msi_unlock_descs(data->dev);
 }
 
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -247,8 +247,7 @@  static inline void pci_write_msi_msg(uns
 #endif /* CONFIG_PCI_MSI */
 
 int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc);
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
-			      unsigned int first_index, unsigned int last_index);
+void msi_free_msi_descs_range(struct device *dev, unsigned int first_index, unsigned int last_index);
 
 /**
  * msi_free_msi_descs - Free MSI descriptors of a device
@@ -256,7 +255,7 @@  void msi_free_msi_descs_range(struct dev
  */
 static inline void msi_free_msi_descs(struct device *dev)
 {
-	msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX);
+	msi_free_msi_descs_range(dev, 0, MSI_MAX_INDEX);
 }
 
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -120,7 +120,7 @@  static int msi_add_simple_msi_descs(stru
 fail_mem:
 	ret = -ENOMEM;
 fail:
-	msi_free_msi_descs_range(dev, MSI_DESC_ALL, index, last);
+	msi_free_msi_descs_range(dev, index, last);
 	return ret;
 }
 
@@ -141,12 +141,11 @@  static bool msi_desc_match(struct msi_de
 /**
  * msi_free_msi_descs_range - Free MSI descriptors of a device
  * @dev:		Device to free the descriptors
- * @filter:		Descriptor state filter
  * @first_index:	Index to start freeing from
  * @last_index:		Last index to be freed
  */
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
-			      unsigned int first_index, unsigned int last_index)
+void msi_free_msi_descs_range(struct device *dev, unsigned int first_index,
+			      unsigned int last_index)
 {
 	struct xarray *xa = &dev->msi.data->__store;
 	struct msi_desc *desc;
@@ -155,10 +154,12 @@  void msi_free_msi_descs_range(struct dev
 	lockdep_assert_held(&dev->msi.data->mutex);
 
 	xa_for_each_range(xa, idx, desc, first_index, last_index) {
-		if (msi_desc_match(desc, filter)) {
-			xa_erase(xa, idx);
-			msi_free_desc(desc);
-		}
+		xa_erase(xa, idx);
+
+		/* Leak the descriptor when it is still referenced */
+		if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+			continue;
+		msi_free_desc(desc);
 	}
 }
 
@@ -739,7 +740,7 @@  int msi_domain_populate_irqs(struct irq_
 fail:
 	for (--virq; virq >= virq_base; virq--)
 		irq_domain_free_irqs_common(domain, virq, 1);
-	msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+	msi_free_msi_descs_range(dev, virq_base, virq_base + nvec - 1);
 unlock:
 	msi_unlock_descs(dev);
 	return ret;