[1/2] cxl/pci: Add generic MSI-X/MSI irq support

Message ID 20221018030010.20913-2-dave@stgolabs.net
State New
Headers
Series cxl: Add general MSI-X/MSI irq support |

Commit Message

Davidlohr Bueso Oct. 18, 2022, 3 a.m. UTC
  Introduce a generic irq table for CXL components/features that can have
standard irq support - DOE requires dynamic vector sizing and is not
considered here. For now the table is empty.

Create an infrastructure to query the max vectors required for the CXL
device. Upon successful allocation, users can plug in their respective isr
at any point thereafter, which is supported by a new cxlds->has_irq flag,
for example, if the irq setup is not done in the PCI driver, such as
the case of the CXL-PMU.

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
---
 drivers/cxl/cxlmem.h |  3 ++
 drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
  

Comments

Jonathan Cameron Oct. 18, 2022, 9:36 a.m. UTC | #1
On Mon, 17 Oct 2022 20:00:09 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Introduce a generic irq table for CXL components/features that can have
> standard irq support - DOE requires dynamic vector sizing and is not
> considered here. For now the table is empty.
> 
> Create an infrastructure to query the max vectors required for the CXL
> device. Upon successful allocation, users can plug in their respective isr
> at any point thereafter, which is supported by a new cxlds->has_irq flag,
> for example, if the irq setup is not done in the PCI driver, such as
> the case of the CXL-PMU.
> 
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>

A few nitpicks inline.

With the comment one tidied up (other one optional)
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

I'll rebase my cpmu code on top of this shortly.

Jonathan


> ---
>  drivers/cxl/cxlmem.h |  3 ++
>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 75 insertions(+)
> 
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 88e3a8e54b6a..72b69b003302 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
>   * @doe_mbs: PCI DOE mailbox array
> + * @has_irq: PCIe MSI-X/MSI support
>   * @mbox_send: @dev specific transport for transmitting mailbox commands
>   *
>   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> @@ -247,6 +248,8 @@ struct cxl_dev_state {
>  
>  	struct xarray doe_mbs;
>  
> +	bool has_irq;
> +
>  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
>  };
>  
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index faeb5d9d7a7a..9c3e95ebaa26 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
>  	}
>  }
>  
> +/**
> + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> + *
> + * @name: Name of the device/component generating this interrupt.
> + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> + *		    feature does not have the Interrupt Supported bit set, then
> + *		    return -1.
> + */
> +struct cxl_irq_cap {
> +	const char *name;
> +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> +};
> +
> +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> +	NULL
> +};
> +
> +static void cxl_pci_free_irq_vectors(void *data)
> +{
> +	pci_free_irq_vectors(data);
> +}
> +
> +/*
> + * Attempt to allocate the largest amount of necessary vectors.
> + *
> + * Returns 0 upon a successful allocation of *all* vectors, or a

Technically not all vectors.  If we wanted to do that we could
just directly query that via pci_msix_vec_count() etc that gets
it from the MSIX capability. That's frowned upon because it's common
to stick lots of extra vectors on the end for stuff that linux never
cares about (debug etc, or optional features).

All vectors up to the maximum one the code uses would be more accurate.

> + * negative value otherwise.
> + */
> +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	int rc, i, vectors = -1;
> +
> +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> +		int irq;
> +
> +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> +			continue;
> +
> +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> +		vectors = max_t(int, irq, vectors);
> +	}
> +
> +	/*
> +	 * Semantically lack of irq support is not an error, but we
> +	 * still fail to allocate, so return negative.
> +	 */
> +	if (vectors == -1)
> +		return -1;
> +
> +	vectors++;
> +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> +	if (rc < 0)
> +		return rc;
> +
> +	if (rc != vectors) {
> +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> +		/* some got allocated, clean them up */
> +		cxl_pci_free_irq_vectors(pdev);
> +		return -ENOSPC;
> +	}
> +
> +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> +}
> +
>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct cxl_register_map map;
> @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> +		cxlds->has_irq = true;
> +	} else
> +		cxlds->has_irq = false;
> +
	cxlds->has_irq = !(cxl_pci_alloc_irq_vectors(cxlds) < 0);

maybe...

>  	cxlmd = devm_cxl_add_memdev(cxlds);
>  	if (IS_ERR(cxlmd))
>  		return PTR_ERR(cxlmd);
  
Jonathan Cameron Oct. 18, 2022, 10:52 a.m. UTC | #2
On Tue, 18 Oct 2022 10:36:19 +0100
Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:

> On Mon, 17 Oct 2022 20:00:09 -0700
> Davidlohr Bueso <dave@stgolabs.net> wrote:
> 
> > Introduce a generic irq table for CXL components/features that can have
> > standard irq support - DOE requires dynamic vector sizing and is not
> > considered here. For now the table is empty.
> > 
> > Create an infrastructure to query the max vectors required for the CXL
> > device. Upon successful allocation, users can plug in their respective isr
> > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > for example, if the irq setup is not done in the PCI driver, such as
> > the case of the CXL-PMU.
> > 
> > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>  
> 
> A few nitpicks inline.
> 
> With the comment one tidied up (other one optional)
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> 
> I'll rebase my cpmu code on top of this shortly.
Hi Davidlohr,

Doing the CPMU rebase has shown up that using this generic infrastructure
ends up rather ugly.

Previously I had a local array to manage the required register maps
that was then freed.  Now I have to move that into the cxl device state
just so I can get at it from the irq finding callback.

So I have an extra step to be able to use this generic framework.

1. Query how many CPMU devices there are.  Stash that and register map
   info in cxlds.  I could do this in the callback but that's a really, really
   horrible layering issue, as most of what is done has nothing to do
   with finding the vector numbers.
2. The callback below to find those numbers 
3. Registration of the cpmu devices.

Reality is that it is cleaner to more or less ignore the infrastructure
proposed in this patch.

1. Query how many CPMU devices there are. Whilst there stash the maximum
   cpmu vector number in the cxlds.
2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
3. Carry on as before.

Thus destroying the point of this infrastructure for that usecase at least
and leaving an extra bit of state in the cxl_dev_state that is just
to squirt a value into the callback...

So with that in mind I'm withdrawing the RB above.  This looks to be
an idea that with hindsight doesn't necessarily pan out.
Long hand equivalent with the specific handling needed for each case
is probably going to be neater than walking a table of much more
restricted callbacks.  Maybe there is a nice way to fit the CPMU
registration into this infrastructure, but I'm not immediately seeing it.

One other note inline via a compiler warning.

Jonathan

> 
> Jonathan
> 
> 
> > ---
> >  drivers/cxl/cxlmem.h |  3 ++
> >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 75 insertions(+)
> > 
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 88e3a8e54b6a..72b69b003302 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> >   * @info: Cached DVSEC information about the device.
> >   * @serial: PCIe Device Serial Number
> >   * @doe_mbs: PCI DOE mailbox array
> > + * @has_irq: PCIe MSI-X/MSI support
> >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> >   *
> >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> >  
> >  	struct xarray doe_mbs;
> >  
> > +	bool has_irq;
> > +
> >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> >  };
> >  
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> >  	}
> >  }
> >  
> > +/**
> > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > + *
> > + * @name: Name of the device/component generating this interrupt.
> > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > + *		    feature does not have the Interrupt Supported bit set, then
> > + *		    return -1.
> > + */
> > +struct cxl_irq_cap {
> > +	const char *name;
> > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> > +};
> > +
> > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > +	NULL

That's not valid, just make it empty instead.


> > +};
> > +
> > +static void cxl_pci_free_irq_vectors(void *data)
> > +{
> > +	pci_free_irq_vectors(data);
> > +}
> > +
> > +/*
> > + * Attempt to allocate the largest amount of necessary vectors.
> > + *
> > + * Returns 0 upon a successful allocation of *all* vectors, or a  
> 
> Technically not all vectors.  If we wanted to do that we could
> just directly query that via pci_msix_vec_count() etc that gets
> it from the MSIX capability. That's frowned upon because it's common
> to stick lots of extra vectors on the end for stuff that linux never
> cares about (debug etc, or optional features).
> 
> All vectors up to the maximum one the code uses would be more accurate.
> 
> > + * negative value otherwise.
> > + */
> > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > +{
> > +	struct device *dev = cxlds->dev;
> > +	struct pci_dev *pdev = to_pci_dev(dev);
> > +	int rc, i, vectors = -1;
> > +
> > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > +		int irq;
> > +
> > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > +			continue;
> > +
> > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > +		vectors = max_t(int, irq, vectors);
> > +	}
> > +
> > +	/*
> > +	 * Semantically lack of irq support is not an error, but we
> > +	 * still fail to allocate, so return negative.
> > +	 */
> > +	if (vectors == -1)
> > +		return -1;
> > +
> > +	vectors++;
> > +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> > +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > +	if (rc < 0)
> > +		return rc;
> > +
> > +	if (rc != vectors) {
> > +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> > +		/* some got allocated, clean them up */
> > +		cxl_pci_free_irq_vectors(pdev);
> > +		return -ENOSPC;
> > +	}
> > +
> > +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> > +}
> > +
> >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct cxl_register_map map;
> > @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> > +		cxlds->has_irq = true;
> > +	} else
> > +		cxlds->has_irq = false;
> > +  
> 	cxlds->has_irq = !(cxl_pci_aloc_irq_vectors(cxlds) < 0);
> 
> maybe...
> 
> >  	cxlmd = devm_cxl_add_memdev(cxlds);
> >  	if (IS_ERR(cxlmd))
> >  		return PTR_ERR(cxlmd);  
>
  
Jonathan Cameron Oct. 18, 2022, 11:17 a.m. UTC | #3
On Mon, 17 Oct 2022 20:00:09 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> Introduce a generic irq table for CXL components/features that can have
> standard irq support - DOE requires dynamic vector sizing and is not
> considered here. For now the table is empty.
> 
> Create an infrastructure to query the max vectors required for the CXL
> device. Upon successful allocation, users can plug in their respective isr
> at any point thereafter, which is supported by a new cxlds->has_irq flag,
> for example, if the irq setup is not done in the PCI driver, such as
> the case of the CXL-PMU.
> 
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>

> +	vectors++;
> +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> +	if (rc < 0)
> +		return rc;
> +
> +	if (rc != vectors) {

Just catching up with Dave Jiang's review of the CPMU driver.
He correctly points out that we won't hit this condition because
we pass the same value as both minvec and maxvec to the
pci_alloc_irq_vectors() call:

> I don't think you'll hit here since you passed in vectors for min and 
> max. You'll get -ENOSPC and return from the earlier check.
> 
> https://elixir.bootlin.com/linux/v6.0-rc3/source/drivers/pci/msi/msi.c#L1005
> 
> DJ


> +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> +		/* some got allocated, clean them up */
> +		cxl_pci_free_irq_vectors(pdev);
> +		return -ENOSPC;
> +	}
> +
> +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> +}
  
Davidlohr Bueso Oct. 20, 2022, 10:31 p.m. UTC | #4
On Tue, 18 Oct 2022, Jonathan Cameron wrote:

>Reality is that it is cleaner to more or less ignore the infrastructure
>proposed in this patch.
>
>1. Query how many CPMU devices there are. Whilst there stash the maximim
>   cpmu vector number in the cxlds.
>2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
>3. Carry on as before.
>
>Thus destroying the point of this infrastructure for that usecase at least
>and leaving an extra bit of state in the cxl_dev_state that is just
>to squirt a value into the callback...

If it doesn't fit, then it doesn't fit.

However, while I was expecting pass one to be in the callback, I wasn't
expecting that both pass 1 and 2 shared the cpmu_regs_array. If the array
could be reconstructed during pass 2, then it would fit a bit better;
albeit the extra allocation, cycles etc., but this is probing phase, so
overhead isn't that important (and cpmu_count isn't big enough to matter).

But if we're going to go with a free-for-all approach, can we establish
who goes for the initial pci_alloc_irq_vectors()? I think perhaps mbox,
since it's the most straightforward and has the fewest requirements. I'm
also unsure of the merge status of events and pmu, but regardless
they are still larger patchsets. If folks agree I can send a new mbox-only
patch.

Thanks,
Davidlohr
  
Ira Weiny Oct. 21, 2022, 4:14 a.m. UTC | #5
On Tue, Oct 18, 2022 at 11:52:27AM +0100, Jonathan Cameron wrote:
> On Tue, 18 Oct 2022 10:36:19 +0100
> Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
> 
> > On Mon, 17 Oct 2022 20:00:09 -0700
> > Davidlohr Bueso <dave@stgolabs.net> wrote:
> > 
> > > Introduce a generic irq table for CXL components/features that can have
> > > standard irq support - DOE requires dynamic vector sizing and is not
> > > considered here. For now the table is empty.
> > > 
> > > Create an infrastructure to query the max vectors required for the CXL
> > > device. Upon successful allocation, users can plug in their respective isr
> > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > for example, if the irq setup is not done in the PCI driver, such as
> > > the case of the CXL-PMU.
> > > 
> > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>  
> > 
> > A few nitpicks inline.
> > 
> > With the comment one tidied up (other one optional)
> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > 
> > I'll rebase my cpmu code on top of this shortly.
> Hi Davidlohr,
> 
> Doing the CPMU rebase has shown up that using this generic infrastructure
> ends up rather ugly.
> 
> Previously I had a local array to manage the required register maps
> that was then freed.  Now I have to move that into the cxl device state
> just so I can get at it from the irq finding callback.
> 
> So I have an extra step to be able to use this generic framework.
> 
> 1. Query how many CPMU devices there are.  Stash that and register map
>    info in cxlds.  I could do this in the callback but that's really really
>    horrible layering issue as most of what is done has nothing to do
>    with finding the vector numbers.

FWIW I did this for the event stuff and did not find it so distasteful...  :-/

However the information I am stashing in the cxlds is all interrupt
information.  So I think it is different from what I see in the CPMU stuff.

> 2. The callback below to find those numbers 
> 3. Registration of the cpmu devices.
> 
> Reality is that it is cleaner to more or less ignore the infrastructure
> proposed in this patch.
> 
> 1. Query how many CPMU devices there are. Whilst there stash the maximim
>    cpmu vector number in the cxlds.
> 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> 3. Carry on as before.
> 
> Thus destroying the point of this infrastructure for that usecase at least
> and leaving an extra bit of state in the cxl_dev_state that is just
> to squirt a value into the callback...

I'm not sure I follow?  Do you mean this?

static int cxl_cpmu_get_max_msgnum(struct cxl_dev_state *cxlds)
{
	return cxlds->cpmu_max_vector;
}  

> 
> So with that in mind I'm withdrawing the RB above.  This looks to be
> an idea that with hindsight doesn't necessarily pan out.
> Long hand equivalent with the specific handling needed for each case
> is probably going to be neater than walking a table of much more
> restricted callbacks.

I'm not married to the idea of the array of callbacks but I'm not sure how this
solves having to iterate on the CPMU devices twice?

Ira

> Maybe there is a nice way to fit the CPMU
> registration into this infrastructure, but I'm not immediately seeing it.
> 
> One other note inline via a compiler warning.
> 
> Jonathan
> 
> > 
> > Jonathan
> > 
> > 
> > > ---
> > >  drivers/cxl/cxlmem.h |  3 ++
> > >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 75 insertions(+)
> > > 
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index 88e3a8e54b6a..72b69b003302 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> > >   * @info: Cached DVSEC information about the device.
> > >   * @serial: PCIe Device Serial Number
> > >   * @doe_mbs: PCI DOE mailbox array
> > > + * @has_irq: PCIe MSI-X/MSI support
> > >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > >   *
> > >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> > >  
> > >  	struct xarray doe_mbs;
> > >  
> > > +	bool has_irq;
> > > +
> > >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> > >  };
> > >  
> > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > > --- a/drivers/cxl/pci.c
> > > +++ b/drivers/cxl/pci.c
> > > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> > >  	}
> > >  }
> > >  
> > > +/**
> > > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > > + *
> > > + * @name: Name of the device/component generating this interrupt.
> > > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > > + *		    feature does not have the Interrupt Supported bit set, then
> > > + *		    return -1.
> > > + */
> > > +struct cxl_irq_cap {
> > > +	const char *name;
> > > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> > > +};
> > > +
> > > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > > +	NULL
> 
> That's not valid, just make it empty instead.
> 
> 
> > > +};
> > > +
> > > +static void cxl_pci_free_irq_vectors(void *data)
> > > +{
> > > +	pci_free_irq_vectors(data);
> > > +}
> > > +
> > > +/*
> > > + * Attempt to allocate the largest amount of necessary vectors.
> > > + *
> > > + * Returns 0 upon a successful allocation of *all* vectors, or a  
> > 
> > Technically not all vectors.  If we wanted to do that we could
> > just directly query that via pci_msix_vec_count() etc that gets
> > it from the MSIX capability. That's frowned upon because it's common
> > to stick lots of extra vectors on the end for stuff that linux never
> > cares about (debug etc, or optional features).
> > 
> > All vectors up to the maximum one the code uses would be more accurate.
> > 
> > > + * negative value otherwise.
> > > + */
> > > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > > +{
> > > +	struct device *dev = cxlds->dev;
> > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > +	int rc, i, vectors = -1;
> > > +
> > > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > > +		int irq;
> > > +
> > > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > > +			continue;
> > > +
> > > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > > +		vectors = max_t(int, irq, vectors);
> > > +	}
> > > +
> > > +	/*
> > > +	 * Semantically lack of irq support is not an error, but we
> > > +	 * still fail to allocate, so return negative.
> > > +	 */
> > > +	if (vectors == -1)
> > > +		return -1;
> > > +
> > > +	vectors++;
> > > +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> > > +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > > +	if (rc < 0)
> > > +		return rc;
> > > +
> > > +	if (rc != vectors) {
> > > +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> > > +		/* some got allocated, clean them up */
> > > +		cxl_pci_free_irq_vectors(pdev);
> > > +		return -ENOSPC;
> > > +	}
> > > +
> > > +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> > > +}
> > > +
> > >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > >  {
> > >  	struct cxl_register_map map;
> > > @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > >  	if (rc)
> > >  		return rc;
> > >  
> > > +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> > > +		cxlds->has_irq = true;
> > > +	} else
> > > +		cxlds->has_irq = false;
> > > +  
> > 	cxlds->has_irq = !(cxl_pci_aloc_irq_vectors(cxlds) < 0);
> > 
> > maybe...
> > 
> > >  	cxlmd = devm_cxl_add_memdev(cxlds);
> > >  	if (IS_ERR(cxlmd))
> > >  		return PTR_ERR(cxlmd);  
> > 
>
  
Ira Weiny Oct. 21, 2022, 4:18 a.m. UTC | #6
On Thu, Oct 20, 2022 at 03:31:25PM -0700, Davidlohr Bueso wrote:
> On Tue, 18 Oct 2022, Jonathan Cameron wrote:
> 
> > Reality is that it is cleaner to more or less ignore the infrastructure
> > proposed in this patch.
> > 
> > 1. Query how many CPMU devices there are. Whilst there stash the maximim
> >   cpmu vector number in the cxlds.
> > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> > 3. Carry on as before.
> > 
> > Thus destroying the point of this infrastructure for that usecase at least
> > and leaving an extra bit of state in the cxl_dev_state that is just
> > to squirt a value into the callback...
> 
> If it doesn't fit, then it doesn't fit.
> 
> However, while I was expecting pass one to be in the callback, I wasn't
> expecting that both pass 1 and 2 shared the cpmu_regs_array. If the array
> could be reconstructed during pass 2, then it would fit a bit better;
> albeit the extra allocation, cycles etc., but this is probing phase, so
> overhead isn't that important (and cpmu_count isn't big enough to matter).
> 
> But if we're going to go with a free-for-all approach, can we establish
> who goes for the initial pci_alloc_irq_vectors()? I think perhaps mbox
> since it's the most straightforward and with least requirements, I'm
> also unsure of the status yet to merge events and pmu, but regardless
> they are still larger patchsets. If folks agree I can send a new mbox-only
> patch.

I think there needs to be some mechanism for all of the sub-device-functions to
report their max required vectors.

I don't think that the mbox code is necessarily the code which should need to
know about all those other sub-device-thingys.  But it could certainly take
some 'max vectors' value that probe passed to it.

I'm still not sure how dropping this infrastructure makes Jonathan's code
cleaner.  I still think there will need to be 2 passes over the number of
CPMU's.

Ira

> 
> Thanks,
> Davidlohr
  
Jonathan Cameron Oct. 21, 2022, 8:49 a.m. UTC | #7
On Thu, 20 Oct 2022 21:18:58 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> On Thu, Oct 20, 2022 at 03:31:25PM -0700, Davidlohr Bueso wrote:
> > On Tue, 18 Oct 2022, Jonathan Cameron wrote:
> >   
> > > Reality is that it is cleaner to more or less ignore the infrastructure
> > > proposed in this patch.
> > > 
> > > 1. Query how many CPMU devices there are. Whilst there stash the maximim
> > >   cpmu vector number in the cxlds.
> > > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> > > 3. Carry on as before.
> > > 
> > > Thus destroying the point of this infrastructure for that usecase at least
> > > and leaving an extra bit of state in the cxl_dev_state that is just
> > > to squirt a value into the callback...  
> > 
> > If it doesn't fit, then it doesn't fit.
> > 
> > However, while I was expecting pass one to be in the callback, I wasn't
> > expecting that both pass 1 and 2 shared the cpmu_regs_array. If the array
> > could be reconstructed during pass 2, then it would fit a bit better;
> > albeit the extra allocation, cycles etc., but this is probing phase, so
> > overhead isn't that important (and cpmu_count isn't big enough to matter).

I thought about that approach, but it's really ugly to have to do

1) For the IRQ number gathering.
  a) Parse 1 to count CPMUs
  b) Parse 2 to get the register maps - grab the irq numbers and unmap them again
2) For the CPMU registration
  a) Parse 3 to count CPMUs (we could stash the number of CPMUs from 1a) but
     that's no advantage over stashing the max irq in current proposal.
     Both are putting state where it's not relevant or wanted just to make it
     available in a callback.  This way is even worse because it's getting
     stashed as a side effect of a parse in a function doing something different.
  b) Parse 4 to get the register maps and actually create the devices. Could have
     stashed this earlier as well, but same 'side effects' argument applies.

Sure, we can move to this, but it will need appropriate comments on why we are
playing these games, because otherwise I suspect a future 'cleanup' would remove
the double, double pass.

All of this just to allow for an irq registration wrapper that turns a series
of straight-line calls into callbacks in an array.  The straight-line calls
aren't exactly complex in the first place:
//find cpmu filling in cxl_cpmu_reg_maps.

max_irq = -1
rc = cxl_mailbox_get_irq()
if (rc < 0)
	return rc;
max_irq = max(max_irq, rc);

rc = cxl_events_get_irq()
if (rc < 0)
	return rc;
max_irq = max(max_irq, rc);

rc = cxl_cpmus_get_irq(cxl_cpmu_reg_maps);
if (rc < 0)
	return rc;
max_irq = max(max_irq, rc);

...

if (max_irq > 0) {

	pci_get...
}

//create all the devices...


> > 
> > But if we're going to go with a free-for-all approach, can we establish
> > who goes for the initial pci_alloc_irq_vectors()? I think perhaps mbox
> > since it's the most straightforward and with least requirements, I'm
> > also unsure of the status yet to merge events and pmu, but regardless
> > they are still larger patchsets. If folks agree I can send a new mbox-only
> > patch.  
> 
> I think there needs to be some mechanism for all of the sub-device-functions to
> report their max required vectors.
> 
> I don't think that the mbox code is necessarily the code which should need to
> know about all those other sub-device-thingys.  But it could certainly take
> some 'max vectors' value that probe passed to it.
> 
> I'm still not sure how dropping this infrastructure makes Jonathan's code
> cleaner.  I still think there will need to be 2 passes over the number of
> CPMU's.
> 

Primarily that there is no need to stash anything about the CPMUs in the
cxl_device_state (option 1) or repeat all the counting and discovery logic twice
(option 2).

I can live with it (it's what we have to do in pcie port for the equivalent)
but the wrapped up version feels like a false optimization.

Saves a few lines of code and adds a bunch of complexity elsewhere that looks to
me to outweigh that saving.

If people are convinced this is the way to go then fair enough, but be prepared
for the ugly corners!

Jonathan

> Ira
> 
> > 
> > Thanks,
> > Davidlohr  
>
  
Jonathan Cameron Oct. 21, 2022, 8:58 a.m. UTC | #8
On Thu, 20 Oct 2022 21:14:29 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> On Tue, Oct 18, 2022 at 11:52:27AM +0100, Jonathan Cameron wrote:
> > On Tue, 18 Oct 2022 10:36:19 +0100
> > Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
> >   
> > > On Mon, 17 Oct 2022 20:00:09 -0700
> > > Davidlohr Bueso <dave@stgolabs.net> wrote:
> > >   
> > > > Introduce a generic irq table for CXL components/features that can have
> > > > standard irq support - DOE requires dynamic vector sizing and is not
> > > > considered here. For now the table is empty.
> > > > 
> > > > Create an infrastructure to query the max vectors required for the CXL
> > > > device. Upon successful allocation, users can plug in their respective isr
> > > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > > for example, if the irq setup is not done in the PCI driver, such as
> > > > the case of the CXL-PMU.
> > > > 
> > > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>    
> > > 
> > > A few nitpicks inline.
> > > 
> > > With the comment one tidied up (other one optional)
> > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > 
> > > I'll rebase my cpmu code on top of this shortly.  
> > Hi Davidlohr,
> > 
> > Doing the CPMU rebase has shown up that using this generic infrastructure
> > ends up rather ugly.
> > 
> > Previously I had a local array to manage the required register maps
> > that was then freed.  Now I have to move that into the cxl device state
> > just so I can get at it from the irq finding callback.
> > 
> > So I have an extra step to be able to use this generic framework.
> > 
> > 1. Query how many CPMU devices there are.  Stash that and register map
> >    info in cxlds.  I could do this in the callback but that's really really
> >    horrible layering issue as most of what is done has nothing to do
> >    with finding the vector numbers.  
> 
> FWIW I did this for the event stuff and did not find it so distasteful...  :-/
> 
> However the information I am stashing in the cxlds is all interrupt
> information.  So I think it is different from what I see in the CPMU stuff.

Right now I'm just stashing the max interrupt number to squirt into a callback
a few lines later. That feels like a hack to get around parsing the structures
4 times.  If it's an acceptable hack then fair enough.

> 
> > 2. The callback below to find those numbers 
> > 3. Registration of the cpmu devices.
> > 
> > Reality is that it is cleaner to more or less ignore the infrastructure
> > proposed in this patch.
> > 
> > 1. Query how many CPMU devices there are. Whilst there stash the maximim
> >    cpmu vector number in the cxlds.
> > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> > 3. Carry on as before.
> > 
> > Thus destroying the point of this infrastructure for that usecase at least
> > and leaving an extra bit of state in the cxl_dev_state that is just
> > to squirt a value into the callback...  
> 
> I'm not sure I follow?  Do you mean this?
> 
> static int cxl_cpmu_get_max_msgnum(struct cxl_dev_state *cxlds)
> {
> 	return cxlds->cpmu_max_vector;
> }

Yup. That state is of no relevance to the cxl_dev_state outside of this tiny
block of code.  Hence I really don't like putting it in there.
  
> 
> > 
> > So with that in mind I'm withdrawing the RB above.  This looks to be
> > an idea that with hindsight doesn't necessarily pan out.
> > Long hand equivalent with the specific handling needed for each case
> > is probably going to be neater than walking a table of much more
> > restricted callbacks.  
> 
> I'm not married to the idea of the array of callbacks but I'm not sure how this
> solves having to iterate on the CPMU devices twice?

Laid that out in the other branch of the thread but basically either
1) We stash irrelevant information in cxl_dev_state just to get it into the callback
   It's not used for anything else and this makes a fiddly and non obvious tie
   up between different registration steps that appear somewhat independent.

2) We do the whole double parse twice (so 4 times in total) which is the right
   option to keep the layering if using this array of callbacks approach, but
   really ugly.  If we flatten it to straight line code there is no implication
   of layering and the state being parsed on is right there in a local variable.

I can live with it either way, but it's definitely not as pretty as it looks
for the mailbox case.

Jonathan

> 
> Ira
> 
> > Maybe there is a nice way to fit the CPMU
> > registration into this infrastructure, but I'm not immediately seeing it.
> > 
> > One other note inline via a compiler warning.
> > 
> > Jonathan
> >   
> > > 
> > > Jonathan
> > > 
> > >   
> > > > ---
> > > >  drivers/cxl/cxlmem.h |  3 ++
> > > >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 75 insertions(+)
> > > > 
> > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > index 88e3a8e54b6a..72b69b003302 100644
> > > > --- a/drivers/cxl/cxlmem.h
> > > > +++ b/drivers/cxl/cxlmem.h
> > > > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> > > >   * @info: Cached DVSEC information about the device.
> > > >   * @serial: PCIe Device Serial Number
> > > >   * @doe_mbs: PCI DOE mailbox array
> > > > + * @has_irq: PCIe MSI-X/MSI support
> > > >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > > >   *
> > > >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > > > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> > > >  
> > > >  	struct xarray doe_mbs;
> > > >  
> > > > +	bool has_irq;
> > > > +
> > > >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> > > >  };
> > > >  
> > > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > > > --- a/drivers/cxl/pci.c
> > > > +++ b/drivers/cxl/pci.c
> > > > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> > > >  	}
> > > >  }
> > > >  
> > > > +/**
> > > > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > > > + *
> > > > + * @name: Name of the device/component generating this interrupt.
> > > > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > > > + *		    feature does not have the Interrupt Supported bit set, then
> > > > + *		    return -1.
> > > > + */
> > > > +struct cxl_irq_cap {
> > > > +	const char *name;
> > > > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> > > > +};
> > > > +
> > > > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > > > +	NULL  
> > 
> > That's not valid, just make it empty instead.
> > 
> >   
> > > > +};
> > > > +
> > > > +static void cxl_pci_free_irq_vectors(void *data)
> > > > +{
> > > > +	pci_free_irq_vectors(data);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Attempt to allocate the largest amount of necessary vectors.
> > > > + *
> > > > + * Returns 0 upon a successful allocation of *all* vectors, or a    
> > > 
> > > Technically not all vectors.  If we wanted to do that we could
> > > just directly query that via pci_msix_vec_count() etc that gets
> > > it from the MSIX capability. That's frowned upon because it's common
> > > to stick lots of extra vectors on the end for stuff that linux never
> > > cares about (debug etc, or optional features).
> > > 
> > > All vectors up to the maximum one the code uses would be more accurate.
> > >   
> > > > + * negative value otherwise.
> > > > + */
> > > > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > > > +{
> > > > +	struct device *dev = cxlds->dev;
> > > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > > +	int rc, i, vectors = -1;
> > > > +
> > > > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > > > +		int irq;
> > > > +
> > > > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > > > +			continue;
> > > > +
> > > > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > > > +		vectors = max_t(int, irq, vectors);
> > > > +	}
> > > > +
> > > > +	/*
> > > > +	 * Semantically lack of irq support is not an error, but we
> > > > +	 * still fail to allocate, so return negative.
> > > > +	 */
> > > > +	if (vectors == -1)
> > > > +		return -1;
> > > > +
> > > > +	vectors++;
> > > > +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> > > > +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > > > +	if (rc < 0)
> > > > +		return rc;
> > > > +
> > > > +	if (rc != vectors) {
> > > > +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> > > > +		/* some got allocated, clean them up */
> > > > +		cxl_pci_free_irq_vectors(pdev);
> > > > +		return -ENOSPC;
> > > > +	}
> > > > +
> > > > +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> > > > +}
> > > > +
> > > >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > > >  {
> > > >  	struct cxl_register_map map;
> > > > @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> > > >  	if (rc)
> > > >  		return rc;
> > > >  
> > > > +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> > > > +		cxlds->has_irq = true;
> > > > +	} else
> > > > +		cxlds->has_irq = false;
> > > > +    
> > > 	cxlds->has_irq = !(cxl_pci_aloc_irq_vectors(cxlds) < 0);
> > > 
> > > maybe...
> > >   
> > > >  	cxlmd = devm_cxl_add_memdev(cxlds);
> > > >  	if (IS_ERR(cxlmd))
> > > >  		return PTR_ERR(cxlmd);    
> > >   
> >   
>
  
Davidlohr Bueso Oct. 21, 2022, 3:58 p.m. UTC | #9
On Fri, 21 Oct 2022, Jonathan Cameron wrote:

>> FWIW I did this for the event stuff and did not find it so distasteful...  :-/
>>
>> However the information I am stashing in the cxlds is all interrupt
>> information.  So I think it is different from what I see in the CPMU stuff.
>
>Right now I'm just stashing the max interrupt number to squirt into a callback
>a few lines later. That feels like a hack to get around parsing the structures
>4 times.  If it's an acceptable hack then fair enough.
>
>>
>> > 2. The callback below to find those numbers
>> > 3. Registration of the cpmu devices.
>> >
>> > Reality is that it is cleaner to more or less ignore the infrastructure
>> > proposed in this patch.
>> >
>> > 1. Query how many CPMU devices there are. Whilst there stash the maximim
>> >    cpmu vector number in the cxlds.
>> > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
>> > 3. Carry on as before.
>> >
>> > Thus destroying the point of this infrastructure for that usecase at least
>> > and leaving an extra bit of state in the cxl_dev_state that is just
>> > to squirt a value into the callback...
>>
>> I'm not sure I follow?  Do you mean this?
>>
>> static int cxl_cpmu_get_max_msgnum(struct cxl_dev_state *cxlds)
>> {
>>	return cxlds->cpmu_max_vector;
>> }
>
>Yup. That state is no relevance to the cxl_dev_state outside of this tiny
>block of code.  Hence I really don't like putting it in there.

Oh absolutely, this is ugly as sin. And if there is anything even worth stashing
the max would only be mbox, as Ira suggested earlier in v1, iirc. So no,
we should not be doing this sort of thing. And if pass one were done in the
callback the need for this would disappear.

>>
>> >
>> > So with that in mind I'm withdrawing the RB above.  This looks to be
>> > an idea that with hindsight doesn't necessarily pan out.
>> > Long hand equivalent with the specific handling needed for each case
>> > is probably going to be neater than walking a table of much more
>> > restricted callbacks.
>>
>> I'm not married to the idea of the array of callbacks but I'm not sure how this
>> solves having to iterate on the CPMU devices twice?
>
>Laid that out in the other branch of the thread but basically either
>1) We stash irrelevant information in cxl_dev_state just to get it into the callback
>   It's not used for anything else and this makes a fiddly and non obvious tie
>   up between different registration steps that appear somewhat independent.

Yeah anything _but_ this.

>
>2) We do the whole double parse twice (so 4 times in total) which is the right
>   option to keep the layering if using this array of callbacks approach, but
>   really ugly.  If we flatten it to straight line code there is no implication
>   of layering and the state being parsed on is right there in a local variable.

If we are keeping this patch, then as mentioned before, I would prefer this. imo
this is better than both 1 above and the open-coding approach.

>I can live with it either way, but it's definitely not as pretty as it looks
>for the mailbox case.

Agreed.

Thanks,
Davidlohr
  
Davidlohr Bueso Oct. 21, 2022, 4:20 p.m. UTC | #10
On Fri, 21 Oct 2022, Jonathan Cameron wrote:

>On Thu, 20 Oct 2022 21:18:58 -0700
>Ira Weiny <ira.weiny@intel.com> wrote:
>
>> On Thu, Oct 20, 2022 at 03:31:25PM -0700, Davidlohr Bueso wrote:
>> > On Tue, 18 Oct 2022, Jonathan Cameron wrote:
>> >
>> > > Reality is that it is cleaner to more or less ignore the infrastructure
>> > > proposed in this patch.
>> > >
>> > > 1. Query how many CPMU devices there are. Whilst there stash the maximim
>> > >   cpmu vector number in the cxlds.
>> > > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
>> > > 3. Carry on as before.
>> > >
>> > > Thus destroying the point of this infrastructure for that usecase at least
>> > > and leaving an extra bit of state in the cxl_dev_state that is just
>> > > to squirt a value into the callback...
>> >
>> > If it doesn't fit, then it doesn't fit.
>> >
>> > However, while I was expecting pass one to be in the callback, I wasn't
>> > expecting that both pass 1 and 2 shared the cpmu_regs_array. If the array
>> > could be reconstructed during pass 2, then it would fit a bit better;
>> > albeit the extra allocation, cycles etc., but this is probing phase, so
>> > overhead isn't that important (and cpmu_count isn't big enough to matter).
>
>I thought about that approach, but it's really ugly to have to do
>
>1) For the IRQ number gathering.
>  a) Parse 1 to count CPMUs
>  b) Parse 2 to get the register maps - grab the irq numbers and unmap them again
>2) For the CPMU registration
>  a) Parse 3 to count CPMUs (we could stash the number of CPMUS form 1a) but
>     that's no advantage over stashing the max irq in current proposal.
>     Both are putting state where it's not relevant or wanted just to make it
>     available in a callback.  This way is even worse because it's getting
>     stashed as a side effect of a parse in a function doing something different.
>  b) Parse 4 to get the register maps and actually create the devices. Could have
>     stashed this earlier as well, but same 'side effects' argument applies.
>
>Sure, can move to this however with appropriate comments on why we are playing
>these games because otherwise I suspect a future 'cleanup' would remove double, double
>pass.
>
>To allow for an irq registration wrapper that turns a series of straight
>line calls into callbacks in an array.  The straight line calls aren't exactly
>complex in the first place.
>//find cpmu filling in cxl_cpmu_reg_maps.
>
>max_irq = -1
>rc = cxl_mailbox_get_irq()
>if (rc < 0)
>	return rc;
>max_irq = max(max_irq, rc);
>
>rc = cxl_events_get_irq()
>if (rc < 0)
>	return rc;
>max_irq = max(max_irq, rc);
>
>rc = cxl_cpmus_get_irq(cxl_cpmu_reg_maps);
>if (rc < 0)
>	return rc;
>max_irq = max(max_irq, rC);
>
>...
>
>if (irq > 0) {
>
>	pci_get...
>}
>
>//create all the devices...

Yes, this was sort of what I pictured if we go this way. It doesn't make
my eyes sore.

>
>> >
>> > But if we're going to go with a free-for-all approach, can we establish
>> > who goes for the initial pci_alloc_irq_vectors()? I think perhaps mbox
>> > since it's the most straightforward and with least requirements, I'm
>> > also unsure of the status yet to merge events and pmu, but regardless
>> > they are still larger patchsets. If folks agree I can send a new mbox-only
>> > patch.
>>
>> I think there needs to be some mechanism for all of the sub-device-functions to
>> report their max required vectors.
>>
>> I don't think that the mbox code is necessarily the code which should need to
>> know about all those other sub-device-thingys.  But it could certainly take
>> some 'max vectors' value that probe passed to it.
>>
>> I'm still not sure how dropping this infrastructure makes Jonathan's code
>> cleaner.  I still think there will need to be 2 passes over the number of
>> CPMU's.
>>
>
>Primarily that there is no need to stash anything about the CPMUs in the
>cxl_device_state (option 1) or repeat all the counting and discovery logic twice
>(option 2).
>
>I can live with it (it's what we have to do in pcie port for the equivalent)
>but the wrapped up version feels like a false optimization.
>
>Saves a few lines of code and adds a bunch of complexity elsewhere that looks to
>me to outweigh that saving.

Yeah it's hard to justify the extra complexity here when the alternative isn't
even that bad.

Thanks,
Davidlohr
  
Ira Weiny Oct. 21, 2022, 9:05 p.m. UTC | #11
On Fri, Oct 21, 2022 at 09:20:55AM -0700, Davidlohr Bueso wrote:
> On Fri, 21 Oct 2022, Jonathan Cameron wrote:
> 
> > On Thu, 20 Oct 2022 21:18:58 -0700
> > Ira Weiny <ira.weiny@intel.com> wrote:
> > 
> > > On Thu, Oct 20, 2022 at 03:31:25PM -0700, Davidlohr Bueso wrote:
> > > > On Tue, 18 Oct 2022, Jonathan Cameron wrote:
> > > >
> > > > > Reality is that it is cleaner to more or less ignore the infrastructure
> > > > > proposed in this patch.
> > > > >
> > > > > 1. Query how many CPMU devices there are. Whilst there stash the maximim
> > > > >   cpmu vector number in the cxlds.
> > > > > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> > > > > 3. Carry on as before.
> > > > >
> > > > > Thus destroying the point of this infrastructure for that usecase at least
> > > > > and leaving an extra bit of state in the cxl_dev_state that is just
> > > > > to squirt a value into the callback...
> > > >
> > > > If it doesn't fit, then it doesn't fit.
> > > >
> > > > However, while I was expecting pass one to be in the callback, I wasn't
> > > > expecting that both pass 1 and 2 shared the cpmu_regs_array. If the array
> > > > could be reconstructed during pass 2, then it would fit a bit better;
> > > > albeit the extra allocation, cycles etc., but this is probing phase, so
> > > > overhead isn't that important (and cpmu_count isn't big enough to matter).
> > 
> > I thought about that approach, but it's really ugly to have to do
> > 
> > 1) For the IRQ number gathering.
> >  a) Parse 1 to count CPMUs
> >  b) Parse 2 to get the register maps - grab the irq numbers and unmap them again
> > 2) For the CPMU registration
> >  a) Parse 3 to count CPMUs (we could stash the number of CPMUS form 1a) but
> >     that's no advantage over stashing the max irq in current proposal.
> >     Both are putting state where it's not relevant or wanted just to make it
> >     available in a callback.  This way is even worse because it's getting
> >     stashed as a side effect of a parse in a function doing something different.
> >  b) Parse 4 to get the register maps and actually create the devices. Could have
> >     stashed this earlier as well, but same 'side effects' argument applies.
> > 
> > Sure, can move to this however with appropriate comments on why we are playing
> > these games because otherwise I suspect a future 'cleanup' would remove double, double
> > pass.
> > 
> > To allow for an irq registration wrapper that turns a series of straight
> > line calls into callbacks in an array.  The straight line calls aren't exactly
> > complex in the first place.
> > //find cpmu filling in cxl_cpmu_reg_maps.
> > 
> > max_irq = -1
> > rc = cxl_mailbox_get_irq()
> > if (rc < 0)
> > 	return rc;
> > max_irq = max(max_irq, rc);
> > 
> > rc = cxl_events_get_irq()
> > if (rc < 0)
> > 	return rc;
> > max_irq = max(max_irq, rc);
> > 
> > rc = cxl_cpmus_get_irq(cxl_cpmu_reg_maps);
> > if (rc < 0)
> > 	return rc;
> > max_irq = max(max_irq, rC);
> > 
> > ...
> > 
> > if (irq > 0) {
> > 
> > 	pci_get...
> > }
> > 
> > //create all the devices...
> 
> Yes, this was sort of what I pictured if we go this way. It doesn't make
> my eyes sore.

Ok

> 
> > 
> > > >
> > > > But if we're going to go with a free-for-all approach, can we establish
> > > > who goes for the initial pci_alloc_irq_vectors()? I think perhaps mbox
> > > > since it's the most straightforward and with least requirements, I'm
> > > > also unsure of the status yet to merge events and pmu, but regardless
> > > > they are still larger patchsets. If folks agree I can send a new mbox-only
> > > > patch.
> > > 
> > > I think there needs to be some mechanism for all of the sub-device-functions to
> > > report their max required vectors.
> > > 
> > > I don't think that the mbox code is necessarily the code which should need to
> > > know about all those other sub-device-thingys.  But it could certainly take
> > > some 'max vectors' value that probe passed to it.
> > > 
> > > I'm still not sure how dropping this infrastructure makes Jonathan's code
> > > cleaner.  I still think there will need to be 2 passes over the number of
> > > CPMU's.
> > > 
> > 
> > Primarily that there is no need to stash anything about the CPMUs in the
> > cxl_device_state (option 1) or repeat all the counting and discovery logic twice
> > (option 2).

I see what you are driving at now.  But I don't think having irq information in
cxlds is a layering violation.

> > 
> > I can live with it (it's what we have to do in pcie port for the equivalent)
> > but the wrapped up version feels like a false optimization.
> > 
> > Saves a few lines of code and adds a bunch of complexity elsewhere that looks to
> > me to outweigh that saving.
> 
> Yeah it's hard to justify the extra complexity here when the alternative isn't
> even that bad.

Let's just do this.  I don't think it matters much either and I need something
to land before the event stuff.

Ira

> 
> Thanks,
> Davidlohr
  
Dan Williams Oct. 22, 2022, 10:05 p.m. UTC | #12
Davidlohr Bueso wrote:
> Introduce a generic irq table for CXL components/features that can have
> standard irq support - DOE requires dynamic vector sizing and is not
> considered here. For now the table is empty.
> 
> Create an infrastructure to query the max vectors required for the CXL
> device. Upon successful allocation, users can plug in their respective isr
> at any point thereafter, which is supported by a new cxlds->has_irq flag,
> for example, if the irq setup is not done in the PCI driver, such as
> the case of the CXL-PMU.
> 
> Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> ---
>  drivers/cxl/cxlmem.h |  3 ++
>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 75 insertions(+)
> 
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 88e3a8e54b6a..72b69b003302 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
>   * @info: Cached DVSEC information about the device.
>   * @serial: PCIe Device Serial Number
>   * @doe_mbs: PCI DOE mailbox array
> + * @has_irq: PCIe MSI-X/MSI support
>   * @mbox_send: @dev specific transport for transmitting mailbox commands
>   *
>   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> @@ -247,6 +248,8 @@ struct cxl_dev_state {
>  
>  	struct xarray doe_mbs;
>  
> +	bool has_irq;
> +
>  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
>  };
>  
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index faeb5d9d7a7a..9c3e95ebaa26 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
>  	}
>  }
>  
> +/**
> + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> + *
> + * @name: Name of the device/component generating this interrupt.
> + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> + *		    feature does not have the Interrupt Supported bit set, then
> + *		    return -1.
> + */
> +struct cxl_irq_cap {
> +	const char *name;
> +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);

Why is this a callback, why not just have the features populate their
irq numbers?

> +};
> +
> +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> +	NULL
> +};
> +
> +static void cxl_pci_free_irq_vectors(void *data)
> +{
> +	pci_free_irq_vectors(data);
> +}
> +
> +/*
> + * Attempt to allocate the largest amount of necessary vectors.
> + *
> + * Returns 0 upon a successful allocation of *all* vectors, or a
> + * negative value otherwise.
> + */
> +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> +{
> +	struct device *dev = cxlds->dev;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	int rc, i, vectors = -1;
> +
> +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> +		int irq;
> +
> +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> +			continue;
> +
> +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> +		vectors = max_t(int, irq, vectors);
> +	}

Forgive me if I have missed something, I only look at interrupt enable
code once every few years, and the APIs are always a bit different, but
is this not too early to read the message number? The number is not
stable until either MSI or MSI-X has been selected below at
pci_alloc_irq_vectors() time?

> +
> +	/*
> +	 * Semantically lack of irq support is not an error, but we
> +	 * still fail to allocate, so return negative.
> +	 */
> +	if (vectors == -1)
> +		return -1;
> +
> +	vectors++;
> +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> +	if (rc < 0)
> +		return rc;
> +
> +	if (rc != vectors) {
> +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> +		/* some got allocated, clean them up */
> +		cxl_pci_free_irq_vectors(pdev);
> +		return -ENOSPC;
> +	}
> +
> +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> +}
> +
>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct cxl_register_map map;
> @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (rc)
>  		return rc;
>  
> +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> +		cxlds->has_irq = true;
> +	} else
> +		cxlds->has_irq = false;
> +
>  	cxlmd = devm_cxl_add_memdev(cxlds);
>  	if (IS_ERR(cxlmd))
>  		return PTR_ERR(cxlmd);
> -- 
> 2.38.0
>
  
Dan Williams Oct. 22, 2022, 10:17 p.m. UTC | #13
Jonathan Cameron wrote:
> On Thu, 20 Oct 2022 21:14:29 -0700
> Ira Weiny <ira.weiny@intel.com> wrote:
> 
> > On Tue, Oct 18, 2022 at 11:52:27AM +0100, Jonathan Cameron wrote:
> > > On Tue, 18 Oct 2022 10:36:19 +0100
> > > Jonathan Cameron <Jonathan.Cameron@huawei.com> wrote:
> > >   
> > > > On Mon, 17 Oct 2022 20:00:09 -0700
> > > > Davidlohr Bueso <dave@stgolabs.net> wrote:
> > > >   
> > > > > Introduce a generic irq table for CXL components/features that can have
> > > > > standard irq support - DOE requires dynamic vector sizing and is not
> > > > > considered here. For now the table is empty.
> > > > > 
> > > > > Create an infrastructure to query the max vectors required for the CXL
> > > > > device. Upon successful allocation, users can plug in their respective isr
> > > > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > > > for example, if the irq setup is not done in the PCI driver, such as
> > > > > the case of the CXL-PMU.
> > > > > 
> > > > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>    
> > > > 
> > > > A few nitpicks inline.
> > > > 
> > > > With the comment one tidied up (other one optional)
> > > > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > 
> > > > I'll rebase my cpmu code on top of this shortly.  
> > > Hi Davidlohr,
> > > 
> > > Doing the CPMU rebase has shown up that using this generic infrastructure
> > > ends up rather ugly.
> > > 
> > > Previously I had a local array to manage the required register maps
> > > that was then freed.  Now I have to move that into the cxl device state
> > > just so I can get at it from the irq finding callback.
> > > 
> > > So I have an extra step to be able to use this generic framework.
> > > 
> > > 1. Query how many CPMU devices there are.  Stash that and register map
> > >    info in cxlds.  I could do this in the callback but that's really really
> > >    horrible layering issue as most of what is done has nothing to do
> > >    with finding the vector numbers.  
> > 
> > FWIW I did this for the event stuff and did not find it so distasteful...  :-/
> > 
> > However the information I am stashing in the cxlds is all interrupt
> > information.  So I think it is different from what I see in the CPMU stuff.
> 
> Right now I'm just stashing the max interrupt number to squirt into a callback
> a few lines later. That feels like a hack to get around parsing the structures
> 4 times.  If it's an acceptable hack then fair enough.
> 
> > 
> > > 2. The callback below to find those numbers 
> > > 3. Registration of the cpmu devices.
> > > 
> > > Reality is that it is cleaner to more or less ignore the infrastructure
> > > proposed in this patch.
> > > 
> > > 1. Query how many CPMU devices there are. Whilst there stash the maximim
> > >    cpmu vector number in the cxlds.
> > > 2. Run a stub in this infrastructure that does max(irq, cxlds->irq_num);
> > > 3. Carry on as before.
> > > 
> > > Thus destroying the point of this infrastructure for that usecase at least
> > > and leaving an extra bit of state in the cxl_dev_state that is just
> > > to squirt a value into the callback...  
> > 
> > I'm not sure I follow?  Do you mean this?
> > 
> > static int cxl_cpmu_get_max_msgnum(struct cxl_dev_state *cxlds)
> > {
> > 	return cxlds->cpmu_max_vector;
> > }
> 
> Yup. That state is no relevance to the cxl_dev_state outside of this tiny
> block of code.  Hence I really don't like putting it in there.

Yeah, I tend to agree. cxl_dev_state is the catch-all of last resort,
but if there is a more appropriate / less-abstract object to carry a
given property it should.
  
Ira Weiny Oct. 24, 2022, 12:09 a.m. UTC | #14
On Sat, Oct 22, 2022 at 03:05:45PM -0700, Dan Williams wrote:
> Davidlohr Bueso wrote:
> > Introduce a generic irq table for CXL components/features that can have
> > standard irq support - DOE requires dynamic vector sizing and is not
> > considered here. For now the table is empty.
> > 
> > Create an infrastructure to query the max vectors required for the CXL
> > device. Upon successful allocation, users can plug in their respective isr
> > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > for example, if the irq setup is not done in the PCI driver, such as
> > the case of the CXL-PMU.
> > 
> > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> > ---
> >  drivers/cxl/cxlmem.h |  3 ++
> >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 75 insertions(+)
> > 
> > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > index 88e3a8e54b6a..72b69b003302 100644
> > --- a/drivers/cxl/cxlmem.h
> > +++ b/drivers/cxl/cxlmem.h
> > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> >   * @info: Cached DVSEC information about the device.
> >   * @serial: PCIe Device Serial Number
> >   * @doe_mbs: PCI DOE mailbox array
> > + * @has_irq: PCIe MSI-X/MSI support
> >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> >   *
> >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> >  
> >  	struct xarray doe_mbs;
> >  
> > +	bool has_irq;
> > +
> >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> >  };
> >  
> > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > --- a/drivers/cxl/pci.c
> > +++ b/drivers/cxl/pci.c
> > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> >  	}
> >  }
> >  
> > +/**
> > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > + *
> > + * @name: Name of the device/component generating this interrupt.
> > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > + *		    feature does not have the Interrupt Supported bit set, then
> > + *		    return -1.
> > + */
> > +struct cxl_irq_cap {
> > +	const char *name;
> > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> 
> Why is this a callback, why not just have the features populate their
> irq numbers?

I think we have decided to forgo the callback but I'm not sure what you mean by
'populate their irq numbers'?

> 
> > +};
> > +
> > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > +	NULL
> > +};
> > +
> > +static void cxl_pci_free_irq_vectors(void *data)
> > +{
> > +	pci_free_irq_vectors(data);
> > +}
> > +
> > +/*
> > + * Attempt to allocate the largest amount of necessary vectors.
> > + *
> > + * Returns 0 upon a successful allocation of *all* vectors, or a
> > + * negative value otherwise.
> > + */
> > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > +{
> > +	struct device *dev = cxlds->dev;
> > +	struct pci_dev *pdev = to_pci_dev(dev);
> > +	int rc, i, vectors = -1;
> > +
> > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > +		int irq;
> > +
> > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > +			continue;
> > +
> > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > +		vectors = max_t(int, irq, vectors);
> > +	}
> 
> Forgive me if I have missed something, I only look at interrupt enable
> code once every few years, and the APIs are always a bit different, but
> is this not too early to read the message number? The number is not
> stable until either MSI or MSI-X has been selected below at
> pci_alloc_irq_vectors() time?
 
Well I keep getting wrapped around the axle on this one too.

This all started back when Jonathan originally attempted to allocate the
maximum number of vectors a device _could_ allocate.  But it was recommended that
we determine the max number first then allocate that number.

This seems like a chicken and egg issue.  How is the number not stable before
calling pci_alloc_irq_vectors() when you need the max msg number in that call?

The Event code seems pretty simple because we use a mailbox command to
configure which logs to use irq's and the device reports the message number for
each.  Thus we can determine the max message number Events need without
enabling anything.  But your comment about them not being stable now has me
more worried...  :-(

Confused,
Ira

> > +
> > +	/*
> > +	 * Semantically lack of irq support is not an error, but we
> > +	 * still fail to allocate, so return negative.
> > +	 */
> > +	if (vectors == -1)
> > +		return -1;
> > +
> > +	vectors++;
> > +	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
> > +				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > +	if (rc < 0)
> > +		return rc;
> > +
> > +	if (rc != vectors) {
> > +		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
> > +		/* some got allocated, clean them up */
> > +		cxl_pci_free_irq_vectors(pdev);
> > +		return -ENOSPC;
> > +	}
> > +
> > +	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
> > +}
> > +
> >  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  {
> >  	struct cxl_register_map map;
> > @@ -494,6 +561,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >  	if (rc)
> >  		return rc;
> >  
> > +	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
> > +		cxlds->has_irq = true;
> > +	} else
> > +		cxlds->has_irq = false;
> > +
> >  	cxlmd = devm_cxl_add_memdev(cxlds);
> >  	if (IS_ERR(cxlmd))
> >  		return PTR_ERR(cxlmd);
> > -- 
> > 2.38.0
> > 
> 
>
  
Dan Williams Oct. 24, 2022, 2:08 a.m. UTC | #15
Ira Weiny wrote:
> On Sat, Oct 22, 2022 at 03:05:45PM -0700, Dan Williams wrote:
> > Davidlohr Bueso wrote:
> > > Introduce a generic irq table for CXL components/features that can have
> > > standard irq support - DOE requires dynamic vector sizing and is not
> > > considered here. For now the table is empty.
> > > 
> > > Create an infrastructure to query the max vectors required for the CXL
> > > device. Upon successful allocation, users can plug in their respective isr
> > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > for example, if the irq setup is not done in the PCI driver, such as
> > > the case of the CXL-PMU.
> > > 
> > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> > > ---
> > >  drivers/cxl/cxlmem.h |  3 ++
> > >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 75 insertions(+)
> > > 
> > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > index 88e3a8e54b6a..72b69b003302 100644
> > > --- a/drivers/cxl/cxlmem.h
> > > +++ b/drivers/cxl/cxlmem.h
> > > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> > >   * @info: Cached DVSEC information about the device.
> > >   * @serial: PCIe Device Serial Number
> > >   * @doe_mbs: PCI DOE mailbox array
> > > + * @has_irq: PCIe MSI-X/MSI support
> > >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > >   *
> > >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> > >  
> > >  	struct xarray doe_mbs;
> > >  
> > > +	bool has_irq;
> > > +
> > >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> > >  };
> > >  
> > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > > --- a/drivers/cxl/pci.c
> > > +++ b/drivers/cxl/pci.c
> > > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> > >  	}
> > >  }
> > >  
> > > +/**
> > > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > > + *
> > > + * @name: Name of the device/component generating this interrupt.
> > > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > > + *		    feature does not have the Interrupt Supported bit set, then
> > > + *		    return -1.
> > > + */
> > > +struct cxl_irq_cap {
> > > +	const char *name;
> > > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
> > 
> > Why is this a callback, why not just have the features populate their
> > irq numbers?
> 
> I think we have decided to forgo the callback but I'm not sure what you mean by
> 'populate their irq numbers'?
> 
> > 
> > > +};
> > > +
> > > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > > +	NULL
> > > +};
> > > +
> > > +static void cxl_pci_free_irq_vectors(void *data)
> > > +{
> > > +	pci_free_irq_vectors(data);
> > > +}
> > > +
> > > +/*
> > > + * Attempt to allocate the largest amount of necessary vectors.
> > > + *
> > > + * Returns 0 upon a successful allocation of *all* vectors, or a
> > > + * negative value otherwise.
> > > + */
> > > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > > +{
> > > +	struct device *dev = cxlds->dev;
> > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > +	int rc, i, vectors = -1;
> > > +
> > > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > > +		int irq;
> > > +
> > > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > > +			continue;
> > > +
> > > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > > +		vectors = max_t(int, irq, vectors);
> > > +	}
> > 
> > Forgive me if I have missed something, I only look at interrupt enable
> > code once every few years, and the APIs are always a bit different, but
> > is this not too early to read the message number? The number is not
> > stable until either MSI or MSI-X has been selected below at
> > pci_alloc_irq_vectors() time?
>  
> Well I keep getting wrapped around the axle on this one too.
> 
> This all started back when Jonathan originally attempted to allocate the
> maximum number of vectors a device _could_ allocate.  But it was recommended that
> we determine the max number first then allocate that number.
> 
> This seems like a chicken and egg issue.  How is the number not stable before
> calling pci_alloc_irq_vectors() when you need the max msg number in that call?

Are we talking about the same thing? I am talking about the value in the
"Interrupt Message Number" field. That depends on whether MSI or MSI-X
gets enabled. The number of vectors the device can support is static.

Since CXL is such an a la carte spec I think this is a situation to just
specify a large number of max vectors to pci_alloc_irq_vectors() and
then find out after the fact if all of the interrupt generators that
today's cxl_pci knows about in the device each got their own vector.
  
Jonathan Cameron Oct. 24, 2022, 12:36 p.m. UTC | #16
On Sun, 23 Oct 2022 19:08:57 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Ira Weiny wrote:
> > On Sat, Oct 22, 2022 at 03:05:45PM -0700, Dan Williams wrote:  
> > > Davidlohr Bueso wrote:  
> > > > Introduce a generic irq table for CXL components/features that can have
> > > > standard irq support - DOE requires dynamic vector sizing and is not
> > > > considered here. For now the table is empty.
> > > > 
> > > > Create an infrastructure to query the max vectors required for the CXL
> > > > device. Upon successful allocation, users can plug in their respective isr
> > > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > > for example, if the irq setup is not done in the PCI driver, such as
> > > > the case of the CXL-PMU.
> > > > 
> > > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> > > > ---
> > > >  drivers/cxl/cxlmem.h |  3 ++
> > > >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> > > >  2 files changed, 75 insertions(+)
> > > > 
> > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > index 88e3a8e54b6a..72b69b003302 100644
> > > > --- a/drivers/cxl/cxlmem.h
> > > > +++ b/drivers/cxl/cxlmem.h
> > > > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> > > >   * @info: Cached DVSEC information about the device.
> > > >   * @serial: PCIe Device Serial Number
> > > >   * @doe_mbs: PCI DOE mailbox array
> > > > + * @has_irq: PCIe MSI-X/MSI support
> > > >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > > >   *
> > > >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > > > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> > > >  
> > > >  	struct xarray doe_mbs;
> > > >  
> > > > +	bool has_irq;
> > > > +
> > > >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> > > >  };
> > > >  
> > > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > > > --- a/drivers/cxl/pci.c
> > > > +++ b/drivers/cxl/pci.c
> > > > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> > > >  	}
> > > >  }
> > > >  
> > > > +/**
> > > > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > > > + *
> > > > + * @name: Name of the device/component generating this interrupt.
> > > > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > > > + *		    feature does not have the Interrupt Supported bit set, then
> > > > + *		    return -1.
> > > > + */
> > > > +struct cxl_irq_cap {
> > > > +	const char *name;
> > > > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);  
> > > 
> > > Why is this a callback, why not just have the features populate their
> > > irq numbers?  
> > 
> > I think we have decided to forgo the callback but I'm not sure what you mean by
> > 'populate their irq numbers'?
> >   
> > >   
> > > > +};
> > > > +
> > > > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > > > +	NULL
> > > > +};
> > > > +
> > > > +static void cxl_pci_free_irq_vectors(void *data)
> > > > +{
> > > > +	pci_free_irq_vectors(data);
> > > > +}
> > > > +
> > > > +/*
> > > > + * Attempt to allocate the largest amount of necessary vectors.
> > > > + *
> > > > + * Returns 0 upon a successful allocation of *all* vectors, or a
> > > > + * negative value otherwise.
> > > > + */
> > > > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > > > +{
> > > > +	struct device *dev = cxlds->dev;
> > > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > > +	int rc, i, vectors = -1;
> > > > +
> > > > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > > > +		int irq;
> > > > +
> > > > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > > > +			continue;
> > > > +
> > > > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > > > +		vectors = max_t(int, irq, vectors);
> > > > +	}  
> > > 
> > > Forgive me if I have missed something, I only look at interrupt enable
> > > code once every few years, and the APIs are always a bit different, but
> > > is this not too early to read the message number? The number is not
> > > stable until either MSI or MSI-X has been selected below at
> > > pci_alloc_irq_vectors() time?  
> >  
> > Well I keep getting wrapped around the axle on this one too.
> > 
> > This all started back when Jonathan originally attempted to allocate the
> > maximum number of vectors a device _could_ allocate.  But it was recommended that
> > we determine the max number first then allocate that number.
> > 
> > This seems like a chicken and egg issue.  How is the number not stable before
> > calling pci_alloc_irq_vectors() when you need the max msg number in that call?  
> 
> Are we talking about the same thing? I am talking about the value in the
> "Interrupt Message Number" field. That depends on whether MSI or MSI-X
> gets enabled. The number of vectors the device can support is static.
> 
> Since CXL is such an a la carte spec I think this is a situation to just
> specify a large number of max vectors to pci_alloc_irq_vectors() and
> then find out after the fact if all of the interrupt generators that
> today's cxl_pci knows about in the device each got their own vector.

I'd misunderstood how this worked and not read the spec :( I wrongly thought portdrv
did the query first and allocated the vectors after, but that's not
the case.  It first allocates max entries, then frees them all and then
allocates the ones that we find present.
We should probably look to do something similar to that though I'm not sure
even that code is always optimal.

https://elixir.bootlin.com/linux/v6.1-rc2/source/drivers/pci/pcie/portdrv_core.c#L101

In short that calls:
/* Allocate the maximum possible number of MSI/MSI-X vectors */
nr_entries = pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,
			PCI_IRQ_MSIX | PCI_IRQ_MSI);

/* See how many and which Interrupt Message Numbers we actually use */
nvec = pcie_message_numbers(dev, mask, &pme, &aer, &dpc);

if (nvec != nr_entries) {
	pci_free_irq_vectors(dev);

	nr_entries = pci_alloc_irq_vectors(dev, nvec, nvec,
			PCI_IRQ_MSIX | PCI_IRQ_MSI);
}

My worry here is that the implicit assumption is that the vectors won't
move if we reduce the overall number of vectors we are asking for...

However, imagine the case that we have a feature the driver doesn't know
about that was previously at a higher vector.  After reducing the vectors
allocated the hardware might decide that feature needs its own vector whereas
some others can be combined.  Hence we'd end up with a less than ideal packing
for the features we actually support.

Could do something iterative to solve this if it actually matters (increase
number of vectors until the layout matches what we get with max possible vectors).

+CC linux-pci and Bjorn for their take on this.  Maybe I'm over thinking things
and in reality this never happens.

Jonathan
  
Bjorn Helgaas Oct. 25, 2022, 11:25 p.m. UTC | #17
[+cc Christoph, beginning of thread https://lore.kernel.org/r/20221018030010.20913-2-dave@stgolabs.net]

On Mon, Oct 24, 2022 at 01:36:33PM +0100, Jonathan Cameron wrote:
> On Sun, 23 Oct 2022 19:08:57 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
> > Ira Weiny wrote:
> > > On Sat, Oct 22, 2022 at 03:05:45PM -0700, Dan Williams wrote:  
> > > > Davidlohr Bueso wrote:  
> > > > > Introduce a generic irq table for CXL components/features that can have
> > > > > standard irq support - DOE requires dynamic vector sizing and is not
> > > > > considered here. For now the table is empty.
> > > > > 
> > > > > Create an infrastructure to query the max vectors required for the CXL
> > > > > device. Upon successful allocation, users can plug in their respective isr
> > > > > at any point thereafter, which is supported by a new cxlds->has_irq flag,
> > > > > for example, if the irq setup is not done in the PCI driver, such as
> > > > > the case of the CXL-PMU.
> > > > > 
> > > > > Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> > > > > Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
> > > > > ---
> > > > >  drivers/cxl/cxlmem.h |  3 ++
> > > > >  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> > > > >  2 files changed, 75 insertions(+)
> > > > > 
> > > > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> > > > > index 88e3a8e54b6a..72b69b003302 100644
> > > > > --- a/drivers/cxl/cxlmem.h
> > > > > +++ b/drivers/cxl/cxlmem.h
> > > > > @@ -211,6 +211,7 @@ struct cxl_endpoint_dvsec_info {
> > > > >   * @info: Cached DVSEC information about the device.
> > > > >   * @serial: PCIe Device Serial Number
> > > > >   * @doe_mbs: PCI DOE mailbox array
> > > > > + * @has_irq: PCIe MSI-X/MSI support
> > > > >   * @mbox_send: @dev specific transport for transmitting mailbox commands
> > > > >   *
> > > > >   * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
> > > > > @@ -247,6 +248,8 @@ struct cxl_dev_state {
> > > > >  
> > > > >  	struct xarray doe_mbs;
> > > > >  
> > > > > +	bool has_irq;
> > > > > +
> > > > >  	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
> > > > >  };
> > > > >  
> > > > > diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> > > > > index faeb5d9d7a7a..9c3e95ebaa26 100644
> > > > > --- a/drivers/cxl/pci.c
> > > > > +++ b/drivers/cxl/pci.c
> > > > > @@ -428,6 +428,73 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
> > > > >  	}
> > > > >  }
> > > > >  
> > > > > +/**
> > > > > + * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
> > > > > + *
> > > > > + * @name: Name of the device/component generating this interrupt.
> > > > > + * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
> > > > > + *		    feature does not have the Interrupt Supported bit set, then
> > > > > + *		    return -1.
> > > > > + */
> > > > > +struct cxl_irq_cap {
> > > > > +	const char *name;
> > > > > +	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);  
> > > > 
> > > > Why is this a callback, why not just have the features populate their
> > > > irq numbers?  
> > > 
> > > I think we have decided to forgo the callback but I'm not sure what you mean by
> > > 'populate their irq numbers'?
> > >   
> > > >   
> > > > > +};
> > > > > +
> > > > > +static const struct cxl_irq_cap cxl_irq_cap_table[] = {
> > > > > +	NULL
> > > > > +};
> > > > > +
> > > > > +static void cxl_pci_free_irq_vectors(void *data)
> > > > > +{
> > > > > +	pci_free_irq_vectors(data);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > + * Attempt to allocate the largest amount of necessary vectors.
> > > > > + *
> > > > > + * Returns 0 upon a successful allocation of *all* vectors, or a
> > > > > + * negative value otherwise.
> > > > > + */
> > > > > +static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
> > > > > +{
> > > > > +	struct device *dev = cxlds->dev;
> > > > > +	struct pci_dev *pdev = to_pci_dev(dev);
> > > > > +	int rc, i, vectors = -1;
> > > > > +
> > > > > +	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
> > > > > +		int irq;
> > > > > +
> > > > > +		if (!cxl_irq_cap_table[i].get_max_msgnum)
> > > > > +			continue;
> > > > > +
> > > > > +		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
> > > > > +		vectors = max_t(int, irq, vectors);
> > > > > +	}  
> > > > 
> > > > Forgive me if I have missed something, I only look at interrupt enable
> > > > code once every few years, and the APIs are always a bit different, but
> > > > is this not too early to read the message number? The number is not
> > > > stable until either MSI or MSI-X has been selected below at
> > > > pci_alloc_irq_vectors() time?  
> > >  
> > > Well I keep getting wrapped around the axle on this one too.
> > > 
> > > This all started back when Jonathan originally attempted to
> > > allocate the maximum number of vectors a device _could_
> > > allocate.  But it was recommended that we determine the max
> > > number first then allocate that number.
> > > 
> > > This seems like a chicken and egg issue.  How is the number not
> > > stable before calling pci_alloc_irq_vectors() when you need the
> > > max msg number in that call?  
> > 
> > Are we talking about the same thing? I am talking about the value in the
> > "Interrupt Message Number" field. That depends on whether MSI or MSI-X
> > gets enabled. The number of vectors the device can support is static.
> > 
> > > Since CXL is such an a la carte spec I think this is a situation to just
> > > specify a large number of max vectors to pci_alloc_irq_vectors() and
> > then find out after the fact if all of the interrupt generators that
> > today's cxl_pci knows about in the device each got their own vector.
> 
> I'd misunderstood how this worked and not read the spec :( I wrongly
> thought portdrv did the query first and allocated the vectors after,
> but that's not the case.  It first allocates max entries, then frees
> them all and then allocates the ones that we find present.  We
> should probably look to do something similar to that though I'm not
> sure even that code is always optimal.
> 
> https://elixir.bootlin.com/linux/v6.1-rc2/source/drivers/pci/pcie/portdrv_core.c#L101
> 
> In short that calls:
> /* Allocate the maximum possible number of MSI/MSI-X vectors */
> nr_entries = pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,
> 			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> 
> /* See how many and which Interrupt Message Numbers we actually use */
> nvec = pcie_message_numbers(dev, mask, &pme, &aer, &dpc);
> 
> if (nvec != nr_entries) {
> 	pci_free_irq_vectors(dev);
> 
> 	nr_entries = pci_alloc_irq_vectors(dev, nvec, nvec,
> 			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> }
> 
> My worry here is that the implicit assumption is that the vectors
> won't move if we reduce the overall number of vectors we are asking
> for...
> 
> However, imagine the case that we have a feature the driver doesn't
> know about that was previously at a higher vector.  After reducing
> the vectors allocated the hardware might decide that feature needs
> its own vector whereas some others can be combined.  Hence we'd end
> up with a less than ideal packing for the features we actually
> support.
> 
> Could do something iterative to solve this if it actually matters
> (increase number of vectors until the layout matches what we get
> with max possible vectors).

Is this cxl code allocating vectors for devices that might also be
claimed by portdrv?  I assume not because that sounds like a problem.

Ugh.  I always feel like the portdrv design must be sub-optimal
because this seems so hard to do cleanly.

pci_alloc_irq_vectors() has a lot of magic inside it and is great for
most drivers, but the PCIe service IRQs are definitely unusual and
maybe it's not the best fit for this situation.

If I understand correctly, Interrupt Message Numbers for all these
PCIe services (hotplug, AER, DPC, etc) are restricted to 0-31 for both
MSI and MSI-X, and the reason we don't just allocate 32 vectors all
the time is to avoid consuming too many IRQs.

The MSI case is ugly because the Interrupt Message Number can change
when we set Multiple Message Enable.  Maybe we can separate it out and
have a less than optimal solution for this case, like allocating one
or two vectors and polling if that's not enough.  I expect most
devices will support MSI-X.

For MSI-X, the vector table is in a BAR, so IIUC there's no resource
pressure until we actually assign an IRQ to a vector.  Would it be
feasible to always allocate vectors 0-31 of the MSI-X Table, and then
selectively allocate IRQs for the services we need, so we might
consume 32 MSI-X vectors but only a handful of IRQs?

Bjorn
  
Christoph Hellwig Oct. 30, 2022, 8:38 a.m. UTC | #18
On Tue, Oct 25, 2022 at 06:25:35PM -0500, Bjorn Helgaas wrote:
> Is this cxl code allocating vectors for devices that might also be
> claimed by portdrv?  I assume not because that sounds like a problem.
> 
> Ugh.  I always feel like the portdrv design must be sub-optimal
> because this seems so hard to do cleanly.

Yes, portdrv is a mess.  And I fear we really need to bite the bullet
rather sooner than later to sort much of this out by lifting all the
logic to the core and just keep the "drivers" around for sysfs
pretence.

And I think CXL is trying to run into a similar (but not quite as bad)
mess with its overly modular approach.  In either case the right
thing would be to do enough early setup to find the required number of
interrupts and highest interrupt number and just request that once.
  
Davidlohr Bueso Nov. 2, 2022, 5:15 p.m. UTC | #19
On Tue, 25 Oct 2022, Bjorn Helgaas wrote:

>> In short that calls:
>> /* Allocate the maximum possible number of MSI/MSI-X vectors */
>> nr_entries = pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,
>>			PCI_IRQ_MSIX | PCI_IRQ_MSI);
>>
>> /* See how many and which Interrupt Message Numbers we actually use */
>> nvec = pcie_message_numbers(dev, mask, &pme, &aer, &dpc);
>>
>> if (nvec != nr_entries) {
>>	pci_free_irq_vectors(dev);
>>
>>	nr_entries = pci_alloc_irq_vectors(dev, nvec, nvec,
>>			PCI_IRQ_MSIX | PCI_IRQ_MSI);
>> }
>>
>> My worry here is that the implicit assumption is that the vectors
>> won't move if we reduce the overall number of vectors we are asking
>> for...

This would also apply to what is currently in portdrv machinery, no?

>>
>> However, imagine the case that we have a feature the driver doesn't
>> know about that was previously at a higher vector.  After reducing
>> the vectors allocated the hardware might decide that feature needs
>> its own vector whereas some others can be combined.  Hence we'd end
>> up with a less than ideal packing for the features we actually
>> support.
>>
>> Could do something iterative to solve this if it actually matters
>> (increase number of vectors until the layout matches what we get
>> with max possible vectors).

Maybe do a bounded retry loop until we get stable value?

retry = 1;
do {
	pci_alloc_irq_vectors(1, 32);
	nvecs = get_max_msgnum(); // max(pmu, events, mbox, isolation)
	pci_free_irq_vectors();

	pci_alloc_irq_vectors(nvecs, nvecs);
	new_nvecs = get_max_msgnum();

	if (likely(new_nvecs == nvecs))
		return 0;

	pci_free_irq_vectors();
}  while (retry--);

return -1; // no irq support

But yeah I'm not sure how much we actually care about this. But if so,
it also might be worth re-visiting the generic table thing, as if
nothing else it can standalone co-exist and avoid allocating any irqs
altogether if we know a-priori that there is no irq support.

>
>Is this cxl code allocating vectors for devices that might also be
>claimed by portdrv?  I assume not because that sounds like a problem.
>
>Ugh.  I always feel like the portdrv design must be sub-optimal
>because this seems so hard to do cleanly.
>
>pci_alloc_irq_vectors() has a lot of magic inside it and is great for
>most drivers, but the PCIe service IRQs are definitely unusual and
>maybe it's not the best fit for this situation.
>
>If I understand correctly, Interrupt Message Numbers for all these
>PCIe services (hotplug, AER, DPC, etc) are restricted to 0-31 for both
>MSI and MSI-X, and the reason we don't just allocate 32 vectors all
>the time is to avoid consuming too many IRQs.

Most CXL features that can have irqs will normally use only the first 16,
with the exception of isolation (cxl 3.0), which per the spec is up to 32.

>The MSI case is ugly because the Interrupt Message Number can change
>when we set Multiple Message Enable.  Maybe we can separate it out and
>have a less than optimal solution for this case, like allocating one
>or two vectors and polling if that's not enough.  I expect most
>devices will support MSI-X.

Would only supporting MSI-X be so terrible?

Thanks,
Davidlohr
  
Bjorn Helgaas Nov. 2, 2022, 10:54 p.m. UTC | #20
On Wed, Nov 02, 2022 at 10:15:24AM -0700, Davidlohr Bueso wrote:
> On Tue, 25 Oct 2022, Bjorn Helgaas wrote:
> ...

> > The MSI case is ugly because the Interrupt Message Number can change
> > when we set Multiple Message Enable.  Maybe we can separate it out and
> > have a less than optimal solution for this case, like allocating one
> > or two vectors and polling if that's not enough.  I expect most
> > devices will support MSI-X.
> 
> Would only supporting MSI-X be so terrible?

My gut feeling is that polling when MSI-X isn't supported wouldn't be
terrible, but I have no data on how common devices that only support
MSI are.

Bjorn
  
Ira Weiny Nov. 2, 2022, 11:42 p.m. UTC | #21
On Wed, Nov 02, 2022 at 10:15:24AM -0700, Davidlohr Bueso wrote:
> On Tue, 25 Oct 2022, Bjorn Helgaas wrote:
> 
> > > In short that calls:
> > > /* Allocate the maximum possible number of MSI/MSI-X vectors */
> > > nr_entries = pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,
> > > 			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > > 
> > > /* See how many and which Interrupt Message Numbers we actually use */
> > > nvec = pcie_message_numbers(dev, mask, &pme, &aer, &dpc);
> > > 
> > > if (nvec != nr_entries) {
> > > 	pci_free_irq_vectors(dev);
> > > 
> > > 	nr_entries = pci_alloc_irq_vectors(dev, nvec, nvec,
> > > 			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> > > }
> > > 
> > > My worry here is that the implicit assumption is that the vectors
> > > won't move if we reduce the overall number of vectors we are asking
> > > for...
> 
> This would also apply to what is currently in portdrv machinery, no?
> 
> > > 
> > > However, imagine the case that we have a feature the driver doesn't
> > > know about that was previously at a higher vector.  After reducing
> > > the vectors allocated the hardware might decide that feature needs
> > > its own vector whereas some others can be combined.  Hence we'd end
> > > up with a less than ideal packing for the features we actually
> > > support.
> > > 
> > > Could do something iterative to solve this if it actually matters
> > > (increase number of vectors until the layout matches what we get
> > > with max possible vectors).
> 
> Maybe do a bounded retry loop until we get stable value?
> 
> retry = 1;
> do {
> 	pci_alloc_irq_vectors(1, 32);
> 	nvecs = get_max_msgnum(); // max(pmu, events, mbox, isolation)
> 	pci_free_irq_vectors();
> 
> 	pci_alloc_irq_vectors(nvecs, nvecs);
> 	new_nvecs = get_max_msgnum();
> 
> 	if (likely(new_nvecs == nvecs))
> 		return 0;
> 
> 	pci_free_irq_vectors();
> }  while (retry--);
> 
> return -1; // no irq support
> 
> But yeah I'm not sure how much we actually care about this. But if so,
> it  also might be worth re-visiting the generic table thing, as if
> nothing else it can standalone co-exist and avoid allocating any irqs
> altogether if we know a-priori that there is no irq support.
> 
> > 
> > Is this cxl code allocating vectors for devices that might also be
> > claimed by portdrv?  I assume not because that sounds like a problem.
> > 
> > Ugh.  I always feel like the portdrv design must be sub-optimal
> > because this seems so hard to do cleanly.
> > 
> > pci_alloc_irq_vectors() has a lot of magic inside it and is great for
> > most drivers, but the PCIe service IRQs are definitely unusual and
> > maybe it's not the best fit for this situation.
> > 
> > If I understand correctly, Interrupt Message Numbers for all these
> > PCIe services (hotplug, AER, DPC, etc) are restricted to 0-31 for both
> > MSI and MSI-X, and the reason we don't just allocate 32 vectors all
> > the time is to avoid consuming too many IRQs.
> 
> Most CXL features that can have irqs will normally use only the first 16,
> with the exception of isolation (cxl 3.0), which per the spec is up to 32.

Dan, Dave, and I were discussing this and we agree.  For now the only things
people are working on are within the first 16 so why not just request 16 as the
max for now?

Ira

> 
> > The MSI case is ugly because the Interrupt Message Number can change
> > when we set Multiple Message Enable.  Maybe we can separate it out and
> > have a less than optimal solution for this case, like allocating one
> > or two vectors and polling if that's not enough.  I expect most
> > devices will support MSI-X.
> 
> Would only supporting MSI-X be so terrible?
> 
> Thanks,
> Davidlohr
  
Davidlohr Bueso Nov. 3, 2022, 12:18 a.m. UTC | #22
On Wed, 02 Nov 2022, Ira Weiny wrote:

>On Wed, Nov 02, 2022 at 10:15:24AM -0700, Davidlohr Bueso wrote:
>> Most CXL features that can have irqs will normally use only the first 16,
>> with the exception of isolation (cxl 3.0), which per the spec is up to 32.
>
>Dan, Dave, and I were discussing this and we agree.  For now the only things
>people are working on are within the first 16 so why not just request 16 as the
>max for now?

It is a fair compromise, yes.
  
Jonathan Cameron Nov. 3, 2022, 6:08 p.m. UTC | #23
On Wed, 2 Nov 2022 10:15:24 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> On Tue, 25 Oct 2022, Bjorn Helgaas wrote:
> 
> >> In short that calls:
> >> /* Allocate the maximum possible number of MSI/MSI-X vectors */
> >> nr_entries = pci_alloc_irq_vectors(dev, 1, PCIE_PORT_MAX_MSI_ENTRIES,
> >>			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> >>
> >> /* See how many and which Interrupt Message Numbers we actually use */
> >> nvec = pcie_message_numbers(dev, mask, &pme, &aer, &dpc);
> >>
> >> if (nvec != nr_entries) {
> >>	pci_free_irq_vectors(dev);
> >>
> >>	nr_entries = pci_alloc_irq_vectors(dev, nvec, nvec,
> >>			PCI_IRQ_MSIX | PCI_IRQ_MSI);
> >> }
> >>
> >> My worry here is that the implicit assumption is that the vectors
> >> won't move if we reduce the overall number of vectors we are asking
> >> for...  
> 
> This would also apply to what is currently in portdrv machinery, no?
> 
> >>
> >> However, imagine the case that we have a feature the driver doesn't
> >> know about that was previously at a higher vector.  After reducing
> >> the vectors allocated the hardware might decide that feature needs
> >> its own vector whereas some others can be combined.  Hence we'd end
> >> up with a less than ideal packing for the features we actually
> >> support.
> >>
> >> Could do something iterative to solve this if it actually matters
> >> (increase number of vectors until the layout matches what we get
> >> with max possible vectors).  
> 
> Maybe do a bounded retry loop until we get stable value?
> 
> retry = 1;
> do {
> 	pci_alloc_irq_vectors(1, 32);
> 	nvecs = get_max_msgnum(); // max(pmu, events, mbox, isolation)
> 	pci_free_irq_vectors();
> 
> 	pci_alloc_irq_vectors(nvecs, nvecs);
> 	new_nvecs = get_max_msgnum();
> 
> 	if (likely(new_nvecs == nvecs))
> 		return 0;
> 
> 	pci_free_irq_vectors();
> }  while (retry--);
> 
> return -1; // no irq support

Yup. That's pretty much what I was thinking - if we care :)

> 
> But yeah I'm not sure how much we actually care about this. 

That was my feeling. This might be worth a comment to say that
it's not guaranteed to be optimal (in portdrv), but it's probably
a won't-fix.

Jonathan
  
Jonathan Cameron Nov. 3, 2022, 6:09 p.m. UTC | #24
On Wed, 2 Nov 2022 17:18:33 -0700
Davidlohr Bueso <dave@stgolabs.net> wrote:

> On Wed, 02 Nov 2022, Ira Weiny wrote:
> 
> >On Wed, Nov 02, 2022 at 10:15:24AM -0700, Davidlohr Bueso wrote:  
> >> Most CXL features that can have irqs will normally use only the first 16,
> >> with the exception of isolation (cxl 3.0), which per the spec is up to 32.  
> >
> >Dan, Dave, and I were discussing this and we agree.  For now the only things
> >people are working on are within the first 16 so why not just request 16 as the
> >max for now?  
> 
> It is a fair compromise, yes.

works for me.
  
Ira Weiny Nov. 10, 2022, 3:30 a.m. UTC | #25
On Thu, Nov 03, 2022 at 06:09:16PM +0000, Jonathan Cameron wrote:
> On Wed, 2 Nov 2022 17:18:33 -0700
> Davidlohr Bueso <dave@stgolabs.net> wrote:
> 
> > On Wed, 02 Nov 2022, Ira Weiny wrote:
> > 
> > >On Wed, Nov 02, 2022 at 10:15:24AM -0700, Davidlohr Bueso wrote:  
> > >> Most CXL features that can have irqs will normally use only the first 16,
> > >> with the exception of isolation (cxl 3.0), which per the spec is up to 32.  
> > >
> > >Dan, Dave, and I were discussing this and we agree.  For now the only things
> > >people are working on are within the first 16 so why not just request 16 as the
> > >max for now?  
> > 
> > It is a fair compromise, yes.
> 
> works for me.

I made what I thought would be a simple change to your patch and built this
into my series.

Unfortunately the following does not work with the current Qemu.

/*
 * NOTE: Currently all the functions which are enabled for CXL require their
 * vectors to be in the first 16.  Allocate this number as the min/max.
 */
#define CXL_PCI_REQUIRED_VECTORS 16

...

        rc = pci_alloc_irq_vectors(pdev, CXL_PCI_REQUIRED_VECTORS,            
                                   CXL_PCI_REQUIRED_VECTORS,
                                   PCI_IRQ_MSIX | PCI_IRQ_MSI);

This is because Qemu CXL devices (with the event changes I have made) only
support 8 msg numbers, so the code fails to allocate any vectors.

I guess I should have known better.  Allocating fewer than 16 vectors needs to
be allowed.

But that also means that, beyond knowing _if_ irqs have been enabled, I think
each CXL feature needs to know the number of vectors allocated so it can
ensure its msg numbers are going to work.

So how about the following as a diff to this patch?

In the event code I have then used the nr_irq_vecs field to determine if I
should enable the irq for each log.

If you are ok with it I'm going to squash it into your patch and send out a new
version of the event log series.

Thanks,
Ira


From 105561243c800442a2b7ff39b931e73b0a89bc34 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 9 Nov 2022 12:35:07 -0800
Subject: [PATCH] squash: Allocate up to a static 16 vectors.

This covers the current desired features which CXL needs now.
---
 drivers/cxl/cxlmem.h |  4 +--
 drivers/cxl/cxlpci.h |  6 ++++
 drivers/cxl/pci.c    | 68 +++++++++-----------------------------------
 3 files changed, 22 insertions(+), 56 deletions(-)

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 78ff6dca3c4b..03da4f8f74d3 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -212,7 +212,7 @@ struct cxl_endpoint_dvsec_info {
  * @info: Cached DVSEC information about the device.
  * @serial: PCIe Device Serial Number
  * @doe_mbs: PCI DOE mailbox array
- * @has_irq: PCIe MSI-X/MSI support
+ * @nr_irq_vecs: Number of MSI-X/MSI vectors available
  * @mbox_send: @dev specific transport for transmitting mailbox commands
  *
  * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
@@ -249,7 +249,7 @@ struct cxl_dev_state {
 
 	struct xarray doe_mbs;
 
-	bool has_irq;
+	int nr_irq_vecs;
 
 	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
 };
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index eec597dbe763..b7f4e2f417d3 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -53,6 +53,12 @@
 #define	    CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK			GENMASK(15, 8)
 #define     CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK		GENMASK(31, 16)
 
+/*
+ * NOTE: Currently all the functions which are enabled for CXL require their
+ * vectors to be in the first 16.  Use this as the max.
+ */
+#define CXL_PCI_REQUIRED_VECTORS 16
+
 /* Register Block Identifier (RBI) */
 enum cxl_regloc_type {
 	CXL_REGLOC_RBI_EMPTY = 0,
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 9dc32b802594..e0d511575b45 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -428,71 +428,34 @@ static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
 	}
 }
 
-/**
- * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
- *
- * @name: Name of the device/component generating this interrupt.
- * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
- *		    feature does not have the Interrupt Supported bit set, then
- *		    return -1.
- */
-struct cxl_irq_cap {
-	const char *name;
-	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
-};
-
-static const struct cxl_irq_cap cxl_irq_cap_table[] = {
-	NULL
-};
-
 static void cxl_pci_free_irq_vectors(void *data)
 {
 	pci_free_irq_vectors(data);
 }
 
-/*
- * Attempt to allocate the largest amount of necessary vectors.
- *
- * Returns 0 upon a successful allocation of *all* vectors, or a
- * negative value otherwise.
- */
-static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
+static void cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
 {
 	struct device *dev = cxlds->dev;
 	struct pci_dev *pdev = to_pci_dev(dev);
-	int rc, i, vectors = -1;
-
-	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
-		int irq;
-
-		if (!cxl_irq_cap_table[i].get_max_msgnum)
-			continue;
-
-		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
-		vectors = max_t(int, irq, vectors);
-	}
-
-	/*
-	 * Semantically lack of irq support is not an error, but we
-	 * still fail to allocate, so return negative.
-	 */
-	if (vectors == -1)
-		return -1;
+	int nvecs;
+	int rc;
 
-	vectors++;
-	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
+	nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_REQUIRED_VECTORS,
 				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
-	if (rc < 0)
-		return rc;
-
-	if (rc != vectors) {
+	if (nvecs < 0) {
 		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
+		return;
+	}
+
+	rc = devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
+	if (rc) {
+		dev_dbg(dev, "Device managed call failed; interrupts disabled.\n");
 		/* some got allocated, clean them up */
 		cxl_pci_free_irq_vectors(pdev);
-		return -ENOSPC;
+		return;
 	}
 
-	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
+	cxlds->nr_irq_vecs = nvecs;
 }
 
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
@@ -561,10 +524,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
-		cxlds->has_irq = true;
-	} else
-		cxlds->has_irq = false;
+	cxl_pci_alloc_irq_vectors(cxlds);
 
 	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))

base-commit: aae703b02f92bde9264366c545e87cec451de471
prerequisite-patch-id: e3c882f3fce0872c2259538d00ad798236ec251f
prerequisite-patch-id: a8faa71e6d79cb30eae9b863349f0cb5ffa55b05
prerequisite-patch-id: f8e6edeb4a1d8bc4b34a509cb3d4a625becdf1b3
prerequisite-patch-id: 665a2b5af761a3f50e20da2fa8e5fdc9df13969d
prerequisite-patch-id: 5cd9f56597f9c8637201193849f68811a94d2309
prerequisite-patch-id: 89be9f2bd84118682b9b37e4f3a6e057fd4ad0d6
prerequisite-patch-id: 3de731a18a28c32572bf65f38e8eb2fb927e4f56
  
Davidlohr Bueso Nov. 11, 2022, 9:18 p.m. UTC | #26
On Wed, 09 Nov 2022, Ira Weiny wrote:

>Unfortunately the following does not work with the current Qemu.
>
>/*
> * NOTE: Currently all the functions which are enabled for CXL require their
> * vectors to be in the first 16.  Allocate this number as the min/max.
> */
>#define CXL_PCI_REQUIRED_VECTORS 16
>
>...
>
>        rc = pci_alloc_irq_vectors(pdev, CXL_PCI_REQUIRED_VECTORS,
>                                   CXL_PCI_REQUIRED_VECTORS,
>                                   PCI_IRQ_MSIX | PCI_IRQ_MSI);
>
>This is because Qemu CXL devices only support (with the event changes I have
>made) 8 msg numbers.  So the code fails to allocate any vectors.
>
>I guess I should have known better.  But allocating something less than 16 I
>guess needs to be allowed.
>
>But that also means that beyond knowing _if_ irq's have been enabled I think
>each CXL feature needs to know the number of vectors allocated so they can
>ensure their msg numbers are going to work.
>
>So how about the following as a diff to this patch?
>
>In the event code I have then used the nr_irq_vecs field to determine if I
>should enable the irq for each log.
>
>If you are ok with it I'm going to squash it into your patch and send out a new
>version of the event log series.

LGTM, thanks.
  

Patch

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 88e3a8e54b6a..72b69b003302 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -211,6 +211,7 @@  struct cxl_endpoint_dvsec_info {
  * @info: Cached DVSEC information about the device.
  * @serial: PCIe Device Serial Number
  * @doe_mbs: PCI DOE mailbox array
+ * @has_irq: PCIe MSI-X/MSI support
  * @mbox_send: @dev specific transport for transmitting mailbox commands
  *
  * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
@@ -247,6 +248,8 @@  struct cxl_dev_state {
 
 	struct xarray doe_mbs;
 
+	bool has_irq;
+
 	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
 };
 
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index faeb5d9d7a7a..9c3e95ebaa26 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -428,6 +428,73 @@  static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
 	}
 }
 
+/**
+ * struct cxl_irq_cap - CXL feature that is capable of receiving MSI-X/MSI irqs.
+ *
+ * @name: Name of the device/component generating this interrupt.
+ * @get_max_msgnum: Get the feature's largest interrupt message number.  If the
+ *		    feature does not have the Interrupt Supported bit set, then
+ *		    return -1.
+ */
+struct cxl_irq_cap {
+	const char *name;
+	int (*get_max_msgnum)(struct cxl_dev_state *cxlds);
+};
+
+static const struct cxl_irq_cap cxl_irq_cap_table[] = {
+	NULL
+};
+
+static void cxl_pci_free_irq_vectors(void *data)
+{
+	pci_free_irq_vectors(data);
+}
+
+/*
+ * Attempt to allocate the largest amount of necessary vectors.
+ *
+ * Returns 0 upon a successful allocation of *all* vectors, or a
+ * negative value otherwise.
+ */
+static int cxl_pci_alloc_irq_vectors(struct cxl_dev_state *cxlds)
+{
+	struct device *dev = cxlds->dev;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int rc, i, vectors = -1;
+
+	for (i = 0; i < ARRAY_SIZE(cxl_irq_cap_table); i++) {
+		int irq;
+
+		if (!cxl_irq_cap_table[i].get_max_msgnum)
+			continue;
+
+		irq = cxl_irq_cap_table[i].get_max_msgnum(cxlds);
+		vectors = max_t(int, irq, vectors);
+	}
+
+	/*
+	 * Semantically lack of irq support is not an error, but we
+	 * still fail to allocate, so return negative.
+	 */
+	if (vectors == -1)
+		return -1;
+
+	vectors++;
+	rc = pci_alloc_irq_vectors(pdev, vectors, vectors,
+				   PCI_IRQ_MSIX | PCI_IRQ_MSI);
+	if (rc < 0)
+		return rc;
+
+	if (rc != vectors) {
+		dev_dbg(dev, "Not enough interrupts; use polling instead.\n");
+		/* some got allocated, clean them up */
+		cxl_pci_free_irq_vectors(pdev);
+		return -ENOSPC;
+	}
+
+	return devm_add_action_or_reset(dev, cxl_pci_free_irq_vectors, pdev);
+}
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;
@@ -494,6 +561,11 @@  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
+	if (!cxl_pci_alloc_irq_vectors(cxlds)) {
+		cxlds->has_irq = true;
+	} else
+		cxlds->has_irq = false;
+
 	cxlmd = devm_cxl_add_memdev(cxlds);
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);