[01/17] PCI: Add concurrency safe clear_and_set variants for LNKCTL{,2}

Message ID 20230511131441.45704-2-ilpo.jarvinen@linux.intel.com
State New
Headers
Series PCI: Improve LNKCTL & LNKCTL2 concurrency control |

Commit Message

Ilpo Järvinen May 11, 2023, 1:14 p.m. UTC
  A few places write LNKCTL and LNKCTL2 registers without proper
concurrency control and this could result in losing the changes
one of the writers intended to make.

Add pcie_capability_clear_and_set_word_locked() and helpers to use it
with LNKCTL and LNKCTL2. The concurrency control is provided using a
spinlock in the struct pci_dev.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/pci/access.c | 14 ++++++++++++++
 drivers/pci/probe.c  |  1 +
 include/linux/pci.h  | 17 +++++++++++++++++
 3 files changed, 32 insertions(+)
  

Comments

Bjorn Helgaas May 11, 2023, 3:55 p.m. UTC | #1
On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> A few places write LNKCTL and LNKCTL2 registers without proper
> concurrency control and this could result in losing the changes
> one of the writers intended to make.
> 
> Add pcie_capability_clear_and_set_word_locked() and helpers to use it
> with LNKCTL and LNKCTL2. The concurrency control is provided using a
> spinlock in the struct pci_dev.
> 
> Suggested-by: Lukas Wunner <lukas@wunner.de>
> Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>

Thanks for raising this issue!  Definitely looks like something that
needs attention.

> ---
>  drivers/pci/access.c | 14 ++++++++++++++
>  drivers/pci/probe.c  |  1 +
>  include/linux/pci.h  | 17 +++++++++++++++++
>  3 files changed, 32 insertions(+)
> 
> diff --git a/drivers/pci/access.c b/drivers/pci/access.c
> index 3c230ca3de58..d92a3daadd0c 100644
> --- a/drivers/pci/access.c
> +++ b/drivers/pci/access.c
> @@ -531,6 +531,20 @@ int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
>  }
>  EXPORT_SYMBOL(pcie_capability_clear_and_set_dword);
>  
> +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> +					      u16 clear, u16 set)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	spin_lock_irqsave(&dev->cap_lock, flags);
> +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);

I didn't see the prior discussion with Lukas, so maybe this was
answered there, but is there any reason not to add locking to
pcie_capability_clear_and_set_word() and friends directly?  

It would be nice to avoid having to decide whether to use the locked
or unlocked versions.  It would also be nice to preserve the use of
PCI_EXP_LNKCTL directly, for grep purposes.  And it would obviate the
need for some of these patches, e.g., the use of
pcie_capability_clear_word(), where it's not obvious at the call site
why a change is needed.

Bjorn

>  int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val)
>  {
>  	if (pci_dev_is_disconnected(dev)) {
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 0b2826c4a832..0c14a283f1c7 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2318,6 +2318,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
>  		.end = -1,
>  	};
>  
> +	spin_lock_init(&dev->cap_lock);
>  #ifdef CONFIG_PCI_MSI
>  	raw_spin_lock_init(&dev->msi_lock);
>  #endif
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 60b8772b5bd4..82faea085d95 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -467,6 +467,7 @@ struct pci_dev {
>  	pci_dev_flags_t dev_flags;
>  	atomic_t	enable_cnt;	/* pci_enable_device has been called */
>  
> +	spinlock_t	cap_lock;		/* Protects RMW ops done with locked RMW capability accessors */
>  	u32		saved_config_space[16]; /* Config space saved at suspend time */
>  	struct hlist_head saved_cap_space;
>  	int		rom_attr_enabled;	/* Display of ROM attribute enabled? */
> @@ -1221,6 +1222,8 @@ int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
>  				       u16 clear, u16 set);
>  int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
>  					u32 clear, u32 set);
> +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> +					      u16 clear, u16 set);
>  
>  static inline int pcie_capability_set_word(struct pci_dev *dev, int pos,
>  					   u16 set)
> @@ -1246,6 +1249,20 @@ static inline int pcie_capability_clear_dword(struct pci_dev *dev, int pos,
>  	return pcie_capability_clear_and_set_dword(dev, pos, clear, 0);
>  }
>  
> +static inline int pcie_lnkctl_clear_and_set(struct pci_dev *dev, u16 clear,
> +					    u16 set)
> +{
> +	return pcie_capability_clear_and_set_word_locked(dev, PCI_EXP_LNKCTL,
> +							 clear, set);
> +}
> +
> +static inline int pcie_lnkctl2_clear_and_set(struct pci_dev *dev, u16 clear,
> +					    u16 set)
> +{
> +	return pcie_capability_clear_and_set_word_locked(dev, PCI_EXP_LNKCTL2,
> +							 clear, set);
> +}
> +
>  /* User-space driven config access */
>  int pci_user_read_config_byte(struct pci_dev *dev, int where, u8 *val);
>  int pci_user_read_config_word(struct pci_dev *dev, int where, u16 *val);
> -- 
> 2.30.2
>
  
Ilpo Järvinen May 11, 2023, 5:35 p.m. UTC | #2
On Thu, 11 May 2023, Bjorn Helgaas wrote:

> On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > A few places write LNKCTL and LNKCTL2 registers without proper
> > concurrency control and this could result in losing the changes
> > one of the writers intended to make.
> > 
> > Add pcie_capability_clear_and_set_word_locked() and helpers to use it
> > with LNKCTL and LNKCTL2. The concurrency control is provided using a
> > spinlock in the struct pci_dev.
> > 
> > Suggested-by: Lukas Wunner <lukas@wunner.de>
> > Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> 
> Thanks for raising this issue!  Definitely looks like something that
> needs attention.
> 
> > ---
> >  drivers/pci/access.c | 14 ++++++++++++++
> >  drivers/pci/probe.c  |  1 +
> >  include/linux/pci.h  | 17 +++++++++++++++++
> >  3 files changed, 32 insertions(+)
> > 
> > diff --git a/drivers/pci/access.c b/drivers/pci/access.c
> > index 3c230ca3de58..d92a3daadd0c 100644
> > --- a/drivers/pci/access.c
> > +++ b/drivers/pci/access.c
> > @@ -531,6 +531,20 @@ int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
> >  }
> >  EXPORT_SYMBOL(pcie_capability_clear_and_set_dword);
> >  
> > +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> > +					      u16 clear, u16 set)
> > +{
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&dev->cap_lock, flags);
> > +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> > +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
> 
> I didn't see the prior discussion with Lukas, so maybe this was
> answered there, but is there any reason not to add locking to
> pcie_capability_clear_and_set_word() and friends directly?  
>
> It would be nice to avoid having to decide whether to use the locked
> or unlocked versions.  It would also be nice to preserve the use of
> PCI_EXP_LNKCTL directly, for grep purposes.  And it would obviate the
> need for some of these patches, e.g., the use of
> pcie_capability_clear_word(), where it's not obvious at the call site
> why a change is needed.

There wasn't that big discussion about it (internally). I brought both
alternatives up and Lukas just said he didn't know what's the best 
approach (+ gave a weak nudge towards the separate accessor so I went 
with it to make forward progress). Based on that I don't think he had a 
strong opinion on it.

I'm certainly fine to just use it in the normal accessor functions that 
do RMW and add the locking there. It would certainly have to those good 
sides you mentioned.
  
Bjorn Helgaas May 11, 2023, 7:22 p.m. UTC | #3
On Thu, May 11, 2023 at 08:35:48PM +0300, Ilpo Järvinen wrote:
> On Thu, 11 May 2023, Bjorn Helgaas wrote:
> 
> > On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > > A few places write LNKCTL and LNKCTL2 registers without proper
> > > concurrency control and this could result in losing the changes
> > > one of the writers intended to make.
> > > 
> > > Add pcie_capability_clear_and_set_word_locked() and helpers to use it
> > > with LNKCTL and LNKCTL2. The concurrency control is provided using a
> > > spinlock in the struct pci_dev.
> > > 
> > > Suggested-by: Lukas Wunner <lukas@wunner.de>
> > > Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> > 
> > Thanks for raising this issue!  Definitely looks like something that
> > needs attention.
> > 
> > > ---
> > >  drivers/pci/access.c | 14 ++++++++++++++
> > >  drivers/pci/probe.c  |  1 +
> > >  include/linux/pci.h  | 17 +++++++++++++++++
> > >  3 files changed, 32 insertions(+)
> > > 
> > > diff --git a/drivers/pci/access.c b/drivers/pci/access.c
> > > index 3c230ca3de58..d92a3daadd0c 100644
> > > --- a/drivers/pci/access.c
> > > +++ b/drivers/pci/access.c
> > > @@ -531,6 +531,20 @@ int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
> > >  }
> > >  EXPORT_SYMBOL(pcie_capability_clear_and_set_dword);
> > >  
> > > +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> > > +					      u16 clear, u16 set)
> > > +{
> > > +	unsigned long flags;
> > > +	int ret;
> > > +
> > > +	spin_lock_irqsave(&dev->cap_lock, flags);
> > > +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> > > +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
> > 
> > I didn't see the prior discussion with Lukas, so maybe this was
> > answered there, but is there any reason not to add locking to
> > pcie_capability_clear_and_set_word() and friends directly?  
> >
> > It would be nice to avoid having to decide whether to use the locked
> > or unlocked versions.  It would also be nice to preserve the use of
> > PCI_EXP_LNKCTL directly, for grep purposes.  And it would obviate the
> > need for some of these patches, e.g., the use of
> > pcie_capability_clear_word(), where it's not obvious at the call site
> > why a change is needed.
> 
> There wasn't that big discussion about it (internally). I brought both
> alternatives up and Lukas just said he didn't know what's the best 
> approach (+ gave a weak nudge towards the separate accessor so I went 
> with it to make forward progress). Based on that I don't think he had a 
> strong opinion on it.
> 
> I'm certainly fine to just use it in the normal accessor functions that 
> do RMW and add the locking there. It would certainly have to those good 
> sides you mentioned.

Let's start with that, then.

Many of these are ASPM-related updates that IMHO should not be in
drivers at all.  Drivers should use PCI core interfaces so the core
doesn't get confused.

Bjorn
  
Ilpo Järvinen May 11, 2023, 7:58 p.m. UTC | #4
On Thu, 11 May 2023, Bjorn Helgaas wrote:

> On Thu, May 11, 2023 at 08:35:48PM +0300, Ilpo Järvinen wrote:
> > On Thu, 11 May 2023, Bjorn Helgaas wrote:
> > 
> > > On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > > > A few places write LNKCTL and LNKCTL2 registers without proper
> > > > concurrency control and this could result in losing the changes
> > > > one of the writers intended to make.
> > > > 
> > > > Add pcie_capability_clear_and_set_word_locked() and helpers to use it
> > > > with LNKCTL and LNKCTL2. The concurrency control is provided using a
> > > > spinlock in the struct pci_dev.
> > > > 
> > > > Suggested-by: Lukas Wunner <lukas@wunner.de>
> > > > Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
> > > 
> > > Thanks for raising this issue!  Definitely looks like something that
> > > needs attention.
> > > 
> > > > ---
> > > >  drivers/pci/access.c | 14 ++++++++++++++
> > > >  drivers/pci/probe.c  |  1 +
> > > >  include/linux/pci.h  | 17 +++++++++++++++++
> > > >  3 files changed, 32 insertions(+)
> > > > 
> > > > diff --git a/drivers/pci/access.c b/drivers/pci/access.c
> > > > index 3c230ca3de58..d92a3daadd0c 100644
> > > > --- a/drivers/pci/access.c
> > > > +++ b/drivers/pci/access.c
> > > > @@ -531,6 +531,20 @@ int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
> > > >  }
> > > >  EXPORT_SYMBOL(pcie_capability_clear_and_set_dword);
> > > >  
> > > > +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> > > > +					      u16 clear, u16 set)
> > > > +{
> > > > +	unsigned long flags;
> > > > +	int ret;
> > > > +
> > > > +	spin_lock_irqsave(&dev->cap_lock, flags);
> > > > +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> > > > +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
> > > 
> > > I didn't see the prior discussion with Lukas, so maybe this was
> > > answered there, but is there any reason not to add locking to
> > > pcie_capability_clear_and_set_word() and friends directly?  
> > >
> > > It would be nice to avoid having to decide whether to use the locked
> > > or unlocked versions.  It would also be nice to preserve the use of
> > > PCI_EXP_LNKCTL directly, for grep purposes.  And it would obviate the
> > > need for some of these patches, e.g., the use of
> > > pcie_capability_clear_word(), where it's not obvious at the call site
> > > why a change is needed.
> > 
> > There wasn't that big discussion about it (internally). I brought both
> > alternatives up and Lukas just said he didn't know what's the best 
> > approach (+ gave a weak nudge towards the separate accessor so I went 
> > with it to make forward progress). Based on that I don't think he had a 
> > strong opinion on it.
> > 
> > I'm certainly fine to just use it in the normal accessor functions that 
> > do RMW and add the locking there. It would certainly have to those good 
> > sides you mentioned.
> 
> Let's start with that, then.
> 
> Many of these are ASPM-related updates that IMHO should not be in
> drivers at all.  Drivers should use PCI core interfaces so the core
> doesn't get confused.

Ah, yes. I forgot to mention it in the cover letter but I noticed that 
some of those seem to be workarounds for the cases where core refuses to 
disable ASPM. Some sites even explicit have a comment about that after 
the call to pci_disable_link_state():

static void bcm4377_disable_aspm(struct bcm4377_data *bcm4377)
{
        pci_disable_link_state(bcm4377->pdev,
                               PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);

        /*
         * pci_disable_link_state can fail if either CONFIG_PCIEASPM is disabled
         * or if the BIOS hasn't handed over control to us. We must *always*
         * disable ASPM for this device due to hardware errata though.
         */
        pcie_capability_clear_word(bcm4377->pdev, PCI_EXP_LNKCTL,
                                   PCI_EXP_LNKCTL_ASPMC);
}

That kinda feels something that would want a force disable quirk that is 
reliable. There are quirks for some devices which try to disable it but 
could fail for reasons mentioned in that comment. (But I'd prefer to make 
another series out of it rather than putting it into this one.)

It might even be that some drivers don't even bother to make the 
pci_disable_link_state() call because it isn't reliable enough.
  
Lukas Wunner May 11, 2023, 8:07 p.m. UTC | #5
On Thu, May 11, 2023 at 10:58:40PM +0300, Ilpo Järvinen wrote:
> On Thu, 11 May 2023, Bjorn Helgaas wrote:
> > Many of these are ASPM-related updates that IMHO should not be in
> > drivers at all.  Drivers should use PCI core interfaces so the core
> > doesn't get confused.
> 
> Ah, yes. I forgot to mention it in the cover letter but I noticed that 
> some of those seem to be workarounds for the cases where core refuses to 
> disable ASPM. Some sites even explicit have a comment about that after 
> the call to pci_disable_link_state():
[...]
> That kinda feels something that would want a force disable quirk that is 
> reliable. There are quirks for some devices which try to disable it but 
> could fail for reasons mentioned in that comment. (But I'd prefer to make 
> another series out of it rather than putting it into this one.)

I'm wondering if it's worth cleaning up ASPM handling in drivers first
as the locking issue may then largely solve itself.  The locking could
probably be kept internal to ASPM core code then.

Thanks,

Lukas
  
Lukas Wunner May 11, 2023, 8:23 p.m. UTC | #6
On Thu, May 11, 2023 at 10:55:06AM -0500, Bjorn Helgaas wrote:
> On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> > +					      u16 clear, u16 set)
> > +{
> > +	unsigned long flags;
> > +	int ret;
> > +
> > +	spin_lock_irqsave(&dev->cap_lock, flags);
> > +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> > +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> > +
> > +	return ret;
> > +}
> > +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
> 
> I didn't see the prior discussion with Lukas, so maybe this was
> answered there, but is there any reason not to add locking to
> pcie_capability_clear_and_set_word() and friends directly?
> 
> It would be nice to avoid having to decide whether to use the locked
> or unlocked versions.

I think we definitely want to also offer lockless accessors which
can be used in hotpaths such as interrupt handlers if the accessed
registers don't need any locking (e.g. because there are no concurrent
accesses).

So the relatively lean approach chosen here which limits locking to
Link Control and Link Control 2, but allows future expansion to other
registers as well, seemed reasonable to me.

Thanks,

Lukas
  
Ilpo Järvinen May 11, 2023, 8:28 p.m. UTC | #7
On Thu, 11 May 2023, Lukas Wunner wrote:

> On Thu, May 11, 2023 at 10:58:40PM +0300, Ilpo Järvinen wrote:
> > On Thu, 11 May 2023, Bjorn Helgaas wrote:
> > > Many of these are ASPM-related updates that IMHO should not be in
> > > drivers at all.  Drivers should use PCI core interfaces so the core
> > > doesn't get confused.
> > 
> > Ah, yes. I forgot to mention it in the cover letter but I noticed that 
> > some of those seem to be workarounds for the cases where core refuses to 
> > disable ASPM. Some sites even explicit have a comment about that after 
> > the call to pci_disable_link_state():
> [...]
> > That kinda feels something that would want a force disable quirk that is 
> > reliable. There are quirks for some devices which try to disable it but 
> > could fail for reasons mentioned in that comment. (But I'd prefer to make 
> > another series out of it rather than putting it into this one.)
> 
> I'm wondering if it's worth cleaning up ASPM handling in drivers first
> as the locking issue may then largely solve itself.  The locking could
> probably be kept internal to ASPM core code then.

For some part yes, but at least those copy-pasted gpu setup codes did some 
other things too.

In any case, it would go against some earlier policy decision:

/**
 * pci_disable_link_state - Disable device's link state, so the link will
 * never enter specific states.  Note that if the BIOS didn't grant ASPM
 * control to the OS, this does nothing because we can't touch the LNKCTL
 * register. Returns 0 or a negative errno.

Is it fine to make core capable of violating that policy?

One question before I trying to come up something is when PCIEASPM is =n, 
should I provide some simple function that just does the LNKCTL write to 
disable it? And another thing is the existing quirks, should they be 
kept depending on the existing behavior or not?
  
Bjorn Helgaas May 11, 2023, 9:27 p.m. UTC | #8
[+cc Emmanuel, Rafael, Heiner, ancient ASPM history]

On Thu, May 11, 2023 at 10:58:40PM +0300, Ilpo Järvinen wrote:
> On Thu, 11 May 2023, Bjorn Helgaas wrote:
> > On Thu, May 11, 2023 at 08:35:48PM +0300, Ilpo Järvinen wrote:
> > > On Thu, 11 May 2023, Bjorn Helgaas wrote:
> > > > On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > > > > A few places write LNKCTL and LNKCTL2 registers without proper
> > > > > concurrency control and this could result in losing the changes
> > > > > one of the writers intended to make.
> > > > > 
> > > > > Add pcie_capability_clear_and_set_word_locked() and helpers to use it
> > > > > with LNKCTL and LNKCTL2. The concurrency control is provided using a
> > > > > spinlock in the struct pci_dev.
> ...

[beginning of thread is
https://lore.kernel.org/r/20230511131441.45704-1-ilpo.jarvinen@linux.intel.com;
context here is that several drivers clear ASPM config directly,
probably because pci_disable_link_state() doesn't always do it]

> > Many of these are ASPM-related updates that IMHO should not be in
> > drivers at all.  Drivers should use PCI core interfaces so the core
> > doesn't get confused.
> 
> Ah, yes. I forgot to mention it in the cover letter but I noticed that 
> some of those seem to be workarounds for the cases where core refuses to 
> disable ASPM. Some sites even explicit have a comment about that after 
> the call to pci_disable_link_state():
> 
> static void bcm4377_disable_aspm(struct bcm4377_data *bcm4377)
> {
>         pci_disable_link_state(bcm4377->pdev,
>                                PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1);
> 
>         /*
>          * pci_disable_link_state can fail if either CONFIG_PCIEASPM is disabled
>          * or if the BIOS hasn't handed over control to us. We must *always*
>          * disable ASPM for this device due to hardware errata though.
>          */
>         pcie_capability_clear_word(bcm4377->pdev, PCI_EXP_LNKCTL,
>                                    PCI_EXP_LNKCTL_ASPMC);
> }
> 
> That kinda feels something that would want a force disable quirk that is 
> reliable. There are quirks for some devices which try to disable it but 
> could fail for reasons mentioned in that comment. (But I'd prefer to make 
> another series out of it rather than putting it into this one.)
> 
> It might even be that some drivers don't even bother to make the 
> pci_disable_link_state() call because it isn't reliable enough.

Yeah, I noticed that this is problematic.

We went round and round about this ten years ago [1], which resulted
in https://git.kernel.org/linus/2add0ec14c25 ("PCI/ASPM: Warn when
driver asks to disable ASPM, but we can't do it").

I'm not 100% convinced by that anymore.  It's true that if firmware
retains control of the PCIe capability, the OS is technically not
allowed to write to it, and it's conceivable that even a locked OS
update could collide with some SMI or something that also writes to
it.

I can certainly imagine that firmware might know that *enabling* ASPM
might break because of signal integrity issues or something.  It seems
less likely that *disabling* ASPM would break something, but Rafael [2]
and Matthew [3] rightly pointed out that there is some risk.

But the current situation, where pci_disable_link_state() does nothing
if CONFIG_PCIEASPM is unset or if _OSC says firmware owns it, leads to
drivers doing it directly anyway.  I'm not sure that's better than
making pci_disable_link_state() work 100% of the time, regardless of
CONFIG_PCIEASPM and _OSC.  At least then the PCI core would know
what's going on.

Bjorn

[1] https://lore.kernel.org/all/CANUX_P3F5YhbZX3WGU-j1AGpbXb_T9Bis2ErhvKkFMtDvzatVQ@mail.gmail.com/
[2] https://lore.kernel.org/all/1725435.3DlCxYF2FV@vostro.rjw.lan/
[3] https://lore.kernel.org/all/1368303730.2425.47.camel@x230/
  
Bjorn Helgaas May 11, 2023, 10:21 p.m. UTC | #9
[+cc Emmanuel, Rafael, Heiner]

On Thu, May 11, 2023 at 11:28:05PM +0300, Ilpo Järvinen wrote:
> ...
> One question before I trying to come up something is when PCIEASPM is =n, 
> should I provide some simple function that just does the LNKCTL write to 
> disable it?

The current pci_disable_link_state() stub when CONFIG_PCIEASPM is
unset seems clearly wrong.  In fact, it returns *success* when it
actually did nothing.

I think it should probably clear ASPM Control, at least when the OS
has ownership via _OSC.  I kind of think it should do that independent
of _OSC, but that depends on the conversation at [1].

Bjorn

[1] https://lore.kernel.org/r/ZF1dsvJYYnl8Wv0v@bhelgaas
  
Ilpo Järvinen May 12, 2023, 8:25 a.m. UTC | #10
On Thu, 11 May 2023, Lukas Wunner wrote:

> On Thu, May 11, 2023 at 10:55:06AM -0500, Bjorn Helgaas wrote:
> > On Thu, May 11, 2023 at 04:14:25PM +0300, Ilpo Järvinen wrote:
> > > +int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
> > > +					      u16 clear, u16 set)
> > > +{
> > > +	unsigned long flags;
> > > +	int ret;
> > > +
> > > +	spin_lock_irqsave(&dev->cap_lock, flags);
> > > +	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
> > > +	spin_unlock_irqrestore(&dev->cap_lock, flags);
> > > +
> > > +	return ret;
> > > +}
> > > +EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
> > 
> > I didn't see the prior discussion with Lukas, so maybe this was
> > answered there, but is there any reason not to add locking to
> > pcie_capability_clear_and_set_word() and friends directly?
> > 
> > It would be nice to avoid having to decide whether to use the locked
> > or unlocked versions.
> 
> I think we definitely want to also offer lockless accessors which
> can be used in hotpaths such as interrupt handlers if the accessed
> registers don't need any locking (e.g. because there are no concurrent
> accesses).
> 
> So the relatively lean approach chosen here which limits locking to
> Link Control and Link Control 2, but allows future expansion to other
> registers as well, seemed reasonable to me.

Hi Lukas,

I went through every single use of these functions in the mainline tree 
excluding LNKCTL/LNKCTL2 ones which will be having the lock anyway:

- pcie_capability_clear_and_set_*
- pcie_capability_set_*
- pcie_capability_clear_*

Everything outside of drivers/pci/ is dev init or dev reset related.

Almost all uses inside drivers/pci/ are init/configure/scan/PCI_FIXUP/pci_flr 
or suspend/resume related. With these exceptions:

->set_attention_status() drivers/pci/hotplug/pnv_php.c: pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL,
spinlock + work (from pme.c) drivers/pci/pci.c: pcie_capability_set_dword(dev, PCI_EXP_RTSTA
spinlock + irq / work drivers/pci/pcie/pme.c: pcie_capability_set_word(dev, PCI_EXP_RTCTL,
spinlock + irq / work drivers/pci/pcie/pme.c: pcie_capability_clear_word(dev, PCI_EXP_RTCTL,

So the only case which seems relevant to your concern are those in
drivers/pci/pcie/pme.c which already takes a spinlock so it's not lockless 
as is.

What's more important though, isn't it possible that AER and PME RMW
PCI_EXP_RTCTL at the same time so it would need this RMW locking too 
despite the pme internal spinlock?

Do you still feel there's a need to differentiate this per capability 
given all the information above?


There could of course be open-coded capability RMW ops outside of the ones 
I checked but I suspect the conclusion would still remain pretty much the 
same.
  
Lukas Wunner May 14, 2023, 10:10 a.m. UTC | #11
On Fri, May 12, 2023 at 11:25:32AM +0300, Ilpo Järvinen wrote:
> On Thu, 11 May 2023, Lukas Wunner wrote:
> > On Thu, May 11, 2023 at 10:55:06AM -0500, Bjorn Helgaas wrote:
> > > I didn't see the prior discussion with Lukas, so maybe this was
> > > answered there, but is there any reason not to add locking to
> > > pcie_capability_clear_and_set_word() and friends directly?
> > > 
> > > It would be nice to avoid having to decide whether to use the locked
> > > or unlocked versions.
> > 
> > I think we definitely want to also offer lockless accessors which
> > can be used in hotpaths such as interrupt handlers if the accessed
> > registers don't need any locking (e.g. because there are no concurrent
> > accesses).
> > 
> > So the relatively lean approach chosen here which limits locking to
> > Link Control and Link Control 2, but allows future expansion to other
> > registers as well, seemed reasonable to me.
> 
> I went through every single use of these functions in the mainline tree 
> excluding LNKCTL/LNKCTL2 ones which will be having the lock anyway:
> 
> - pcie_capability_clear_and_set_*
> - pcie_capability_set_*
> - pcie_capability_clear_*

We're also performing RMW through pcie_capability_read_word() +
pcie_capability_write_word() combos, see drivers/pci/hotplug/pciehp_hpc.c
for examples.


> Do you still feel there's a need to differentiate this per capability 
> given all the information above?

What I think is unnecessary and counterproductive is to add wholesale
locking of any access to the PCI Express Capability Structure.

It's fine to have a single spinlock, but I'd suggest only using it
for registers which are actually accessed concurrently by multiple
places in the kernel.


> spinlock + irq / work drivers/pci/pcie/pme.c: pcie_capability_set_word(dev, PCI_EXP_RTCTL,
> spinlock + irq / work drivers/pci/pcie/pme.c: pcie_capability_clear_word(dev, PCI_EXP_RTCTL,
[...]
> What's more important though, isn't it possible that AER and PME RMW
> PCI_EXP_RTCTL at the same time so it would need this RMW locking too 
> despite the pme internal spinlock?

Yes that looks broken, so RTCTL would be another register besides
LNKCTL and LNKCTL2 that needs protection, good catch.

Thanks,

Lukas
  
Ilpo Järvinen May 15, 2023, 11:59 a.m. UTC | #12
On Sun, 14 May 2023, Lukas Wunner wrote:

> On Fri, May 12, 2023 at 11:25:32AM +0300, Ilpo Järvinen wrote:
> > On Thu, 11 May 2023, Lukas Wunner wrote:
> > > On Thu, May 11, 2023 at 10:55:06AM -0500, Bjorn Helgaas wrote:
> > > > I didn't see the prior discussion with Lukas, so maybe this was
> > > > answered there, but is there any reason not to add locking to
> > > > pcie_capability_clear_and_set_word() and friends directly?
> > > > 
> > > > It would be nice to avoid having to decide whether to use the locked
> > > > or unlocked versions.
> > > 
> > > I think we definitely want to also offer lockless accessors which
> > > can be used in hotpaths such as interrupt handlers if the accessed
> > > registers don't need any locking (e.g. because there are no concurrent
> > > accesses).
> > > 
> > > So the relatively lean approach chosen here which limits locking to
> > > Link Control and Link Control 2, but allows future expansion to other
> > > registers as well, seemed reasonable to me.
> > 
> > I went through every single use of these functions in the mainline tree 
> > excluding LNKCTL/LNKCTL2 ones which will be having the lock anyway:
> > 
> > - pcie_capability_clear_and_set_*
> > - pcie_capability_set_*
> > - pcie_capability_clear_*
> 
> We're also performing RMW through pcie_capability_read_word() +
> pcie_capability_write_word() combos, see drivers/pci/hotplug/pciehp_hpc.c
> for examples.

That's why I said there could be other RMW operations outside of what
I carefully looked at. It, however, does not mean I didn't take any look 
at those.

But since brought it up, lets go through this case with
drivers/pci/hotplug/pciehp_hpc.c, it won't change anything:

All PCI_EXP_SLTSTA ones looked not real RMW but ACK bits type of writes
(real RMW = preverse other bits vs ACK write = other bits are written as 
zeros). Using RMW accessors would need an odd construct such as this
(and pcie_capability_set/clear_word() would be plain wrong):
	pcie_capability_clear_and_set_word(dev, PCI_EXP_SLTSTA,
					   ~PCI_EXP_SLTSTA_CC,
					   PCI_EXP_SLTSTA_CC);

PCI_EXP_SLTCTL write is protected by a mutex, it doesn't look something 
that matches your initial concern about "hot paths (e.g. interrupt 
handlers)".

In general, outside of drivers/pci/hotplug there are not that many 
capability writes (beyond LNKCTL/LNKCTL2 and now also RTCTL). None of 
those seem hot paths.

> > Do you still feel there's a need to differentiate this per capability 
> > given all the information above?
> 
> What I think is unnecessary and counterproductive is to add wholesale
> locking of any access to the PCI Express Capability Structure.
> 
> It's fine to have a single spinlock, but I'd suggest only using it
> for registers which are actually accessed concurrently by multiple
> places in the kernel.

While it does feel entirely unnecessary layer of complexity to me, it would 
be possible to rename the original pcie_capability_clear_and_set_word() to 
pcie_capability_clear_and_set_word_unlocked() and add this into 
include/linux/pci.h:

static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
					int pos, u16 clear, u16 set)
{
	if (pos == PCI_EXP_LNKCTL || pos == PCI_EXP_LNKCTL2 ||
	    pos == PCI_EXP_RTCTL)
		pcie_capability_clear_and_set_word_locked(...);
	else
		pcie_capability_clear_and_set_word_unlocked(...);
}

It would keep the interface exactly the same but protect only a selectable 
set of registers. As pos is always a constant, the compiler should be able 
to optimize all the dead code away.

Would that be ok then?
  
Bjorn Helgaas May 15, 2023, 6:28 p.m. UTC | #13
On Mon, May 15, 2023 at 02:59:42PM +0300, Ilpo Järvinen wrote:
> On Sun, 14 May 2023, Lukas Wunner wrote:
> > On Fri, May 12, 2023 at 11:25:32AM +0300, Ilpo Järvinen wrote:
> > > On Thu, 11 May 2023, Lukas Wunner wrote:
> > > > On Thu, May 11, 2023 at 10:55:06AM -0500, Bjorn Helgaas wrote:
> > > > > I didn't see the prior discussion with Lukas, so maybe this was
> > > > > answered there, but is there any reason not to add locking to
> > > > > pcie_capability_clear_and_set_word() and friends directly?
> > > > > 
> > > > > It would be nice to avoid having to decide whether to use the locked
> > > > > or unlocked versions.
> > > > 
> > > > I think we definitely want to also offer lockless accessors which
> > > > can be used in hotpaths such as interrupt handlers if the accessed
> > > > registers don't need any locking (e.g. because there are no concurrent
> > > > accesses).
> > > > ...

> All PCI_EXP_SLTSTA ones looked not real RMW but ACK bits type of writes

PCI_EXP_SLTSTA, PCI_EXP_LNKSTA, etc are typically RW1C and do not need
the usual RMW locking (which I think is what you were saying).

> > ...
> > What I think is unnecessary and counterproductive is to add wholesale
> > locking of any access to the PCI Express Capability Structure.
> > 
> > It's fine to have a single spinlock, but I'd suggest only using it
> > for registers which are actually accessed concurrently by multiple
> > places in the kernel.
> 
> While it does feel entirely unnecessary layer of complexity to me, it would 
> be possible to rename the original pcie_capability_clear_and_set_word() to 
> pcie_capability_clear_and_set_word_unlocked() and add this into 
> include/linux/pci.h:
> 
> static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
> 					int pos, u16 clear, u16 set)
> {
> 	if (pos == PCI_EXP_LNKCTL || pos == PCI_EXP_LNKCTL2 ||
> 	    pos == PCI_EXP_RTCTL)
> 		pcie_capability_clear_and_set_word_locked(...);
> 	else
> 		pcie_capability_clear_and_set_word_unlocked(...);
> }
> 
> It would keep the interface exactly the same but protect only a selectable 
> set of registers. As pos is always a constant, the compiler should be able 
> to optimize all the dead code away.
> 
> Would that be ok then?

Sounds like you have a pretty strong opinion, Lukas, but I guess I
don't really understand the value of having locked and unlocked
variants of RMW accessors.  Config accesses are relatively slow and I
don't think they're used in performance-sensitive paths.  I would
expect the lock to be uncontended and cheap relative to the config
access itself, but I have no actual numbers to back up my speculation.
Is the performance win worth the extra complexity in callers?

Bjorn
  
Lukas Wunner May 15, 2023, 10:12 p.m. UTC | #14
On Mon, May 15, 2023 at 02:59:42PM +0300, Ilpo Järvinen wrote:
> While it does feel entirely unnecessary layer of complexity to me, it would 
> be possible to rename the original pcie_capability_clear_and_set_word() to 
> pcie_capability_clear_and_set_word_unlocked() and add this into 
> include/linux/pci.h:
> 
> static inline int pcie_capability_clear_and_set_word(struct pci_dev *dev,
> 					int pos, u16 clear, u16 set)
> {
> 	if (pos == PCI_EXP_LNKCTL || pos == PCI_EXP_LNKCTL2 ||
> 	    pos == PCI_EXP_RTCTL)
> 		pcie_capability_clear_and_set_word_locked(...);
> 	else
> 		pcie_capability_clear_and_set_word_unlocked(...);
> }
> 
> It would keep the interface exactly the same but protect only a selectable 
> set of registers. As pos is always a constant, the compiler should be able 
> to optimize all the dead code away.

That's actually quite neat, I like it.  It documents clearly which
registers need protection because of concurrent RMWs and callers can't
do anything wrong.

Though I'd use a switch/case statement such that future additions
of registers that need protection are always just a clean, one-line
change.

Plus some kernel-doc or code comment to explain that certain
registers in the PCI Express Capability Structure are accessed
concurrently in a RMW fashion, hence require locking.

Since this protects specifically registers in the PCI Express
Capability, whose location is cached in struct pci_dev->pcie_cap,
I'm wondering if pcie_cap_lock is a clearer name.


> PCI_EXP_SLTCTL write is protected by a mutex, it doesn't look something 
> that matches your initial concern about "hot paths (e.g. interrupt 
> handlers)".

PCI_EXP_SLTCTL is definitely modified from the interrupt handler
pciehp_ist(), but one could argue that hotplug interrupts don't
usually occur *that* often.  (We've had interrupt storms though
from broken devices or ones with a shared interrupt etc.)

I guess I'm just generally worried about acquiring a lock that's
not necessary.  E.g. on boot, numerous config space accesses are
performed to enumerate and initialize devices and reducing concurrency
might slow down boot times.  It's just a risk that I'd recommend
to avoid if possible.

Thanks,

Lukas
  

Patch

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 3c230ca3de58..d92a3daadd0c 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -531,6 +531,20 @@  int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
 }
 EXPORT_SYMBOL(pcie_capability_clear_and_set_dword);
 
+int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
+					      u16 clear, u16 set)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&dev->cap_lock, flags);
+	ret = pcie_capability_clear_and_set_word(dev, pos, clear, set);
+	spin_unlock_irqrestore(&dev->cap_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked);
+
 int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val)
 {
 	if (pci_dev_is_disconnected(dev)) {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 0b2826c4a832..0c14a283f1c7 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2318,6 +2318,7 @@  struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
 		.end = -1,
 	};
 
+	spin_lock_init(&dev->cap_lock);
 #ifdef CONFIG_PCI_MSI
 	raw_spin_lock_init(&dev->msi_lock);
 #endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 60b8772b5bd4..82faea085d95 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -467,6 +467,7 @@  struct pci_dev {
 	pci_dev_flags_t dev_flags;
 	atomic_t	enable_cnt;	/* pci_enable_device has been called */
 
+	spinlock_t	cap_lock;		/* Protects RMW ops done with locked RMW capability accessors */
 	u32		saved_config_space[16]; /* Config space saved at suspend time */
 	struct hlist_head saved_cap_space;
 	int		rom_attr_enabled;	/* Display of ROM attribute enabled? */
@@ -1221,6 +1222,8 @@  int pcie_capability_clear_and_set_word(struct pci_dev *dev, int pos,
 				       u16 clear, u16 set);
 int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos,
 					u32 clear, u32 set);
+int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos,
+					      u16 clear, u16 set);
 
 static inline int pcie_capability_set_word(struct pci_dev *dev, int pos,
 					   u16 set)
@@ -1246,6 +1249,20 @@  static inline int pcie_capability_clear_dword(struct pci_dev *dev, int pos,
 	return pcie_capability_clear_and_set_dword(dev, pos, clear, 0);
 }
 
+static inline int pcie_lnkctl_clear_and_set(struct pci_dev *dev, u16 clear,
+					    u16 set)
+{
+	return pcie_capability_clear_and_set_word_locked(dev, PCI_EXP_LNKCTL,
+							 clear, set);
+}
+
+static inline int pcie_lnkctl2_clear_and_set(struct pci_dev *dev, u16 clear,
+					    u16 set)
+{
+	return pcie_capability_clear_and_set_word_locked(dev, PCI_EXP_LNKCTL2,
+							 clear, set);
+}
+
 /* User-space driven config access */
 int pci_user_read_config_byte(struct pci_dev *dev, int where, u8 *val);
 int pci_user_read_config_word(struct pci_dev *dev, int where, u16 *val);