[3/5] cxl/pci: Discover and cache pointer to RCD dport's CXL RAS registers

Message ID 20221021185615.605233-4-terry.bowman@amd.com
State New
Headers
Series cxl: Log downport PCIe AER and CXL RAS error information |

Commit Message

Terry Bowman Oct. 21, 2022, 6:56 p.m. UTC
  CXL RAS information resides in a RAS capability structure located in
CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
specific error information that can be helpful in debugging. This
information is not currently logged but needs to be logged during PCIe AER
error handling.

Update the CXL driver to find and cache a pointer to the CXL RAS
capability. The RAS registers resides in the downport's component register
block. Note:RAS registers are not in the upport. The component registers
can be found by first using the RCRB to goto the downport. Next, the
downport's 64-bit BAR[0] will point to the component register block.

[1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/cxl.h    |  4 +++
 drivers/cxl/cxlmem.h |  1 +
 drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+)
  

Comments

Dan Williams Oct. 22, 2022, 10:44 p.m. UTC | #1
Terry Bowman wrote:
> CXL RAS information resides in a RAS capability structure located in
> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
> specific error information that can be helpful in debugging. This
> information is not currently logged but needs to be logged during PCIe AER
> error handling.
> 
> Update the CXL driver to find and cache a pointer to the CXL RAS
> capability. The RAS registers resides in the downport's component register
> block. Note:RAS registers are not in the upport. The component registers
> can be found by first using the RCRB to goto the downport. Next, the
> downport's 64-bit BAR[0] will point to the component register block.
> 
> [1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/cxl.h    |  4 +++
>  drivers/cxl/cxlmem.h |  1 +
>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 77 insertions(+)
> 
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 7d507ab80a78..69b50131ad86 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -36,6 +36,10 @@
>  #define   CXL_CM_CAP_CAP_ID_HDM 0x5
>  #define   CXL_CM_CAP_CAP_HDM_VERSION 1
>  
> +/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
> +#define CXL_CM_CAP_ID_RAS 0x2
> +#define CXL_CM_CAP_SIZE_RAS 0x5C
> +
>  /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
>  #define CXL_HDM_DECODER_CAP_OFFSET 0x0
>  #define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 079db2e15acc..515273e224ea 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -243,6 +243,7 @@ struct cxl_dev_state {
>  	u64 next_persistent_bytes;
>  
>  	struct cxl_register_map aer_map;
> +	struct cxl_register_map ras_map;
>  
>  	resource_size_t component_reg_phys;
>  	u64 serial;
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 2287b5225862..7f717fb47a36 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -586,6 +586,78 @@ void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
>  
> +static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
> +{
> +	resource_size_t component_reg_phys, offset = 0;
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	void *cap_hdr_addr, *comp_reg_mapped;
> +	u32 cap_hdr, ras_cap_hdr;
> +	int cap_ndx;
> +
> +	comp_reg_mapped = ioremap(cxlds->component_reg_phys +
> +				  CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
> +	if (!comp_reg_mapped)
> +		return 0;
> +
> +	cap_hdr_addr = comp_reg_mapped;
> +	cap_hdr = readl(cap_hdr_addr);
> +	for (cap_ndx = 0;
> +	     cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
> +	     cap_ndx++) {
> +		ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
> +
> +		if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
> +			pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
> +				 ras_cap_hdr, cap_hdr_addr, cap_ndx);
> +			break;
> +		}
> +	}
> +
> +	offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
> +
> +	iounmap(comp_reg_mapped);
> +
> +	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
> +		return 0;
> +
> +	pr_debug("Found RAS capability @ %llX (%X)\n",
> +		 component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
> +
> +	return component_reg_phys + offset;

For the RAS capability in the cxl_pci device this patch needs to be
reconciled with this effort:

https://lore.kernel.org/linux-cxl/166336972295.3803215.1047199449525031921.stgit@djiang5-desk3.ch.intel.com/

I think we will want RCD and VH RAS capability reporting to happen in
the same place, and that can not be cxl_pci because cxl_pci has no way
to find the RAS registers on its own. It needs the help from cxl_mem to
do the upstream cxl_port associtation first.

Given CXL switches will have their own RAS capabilities to report it
feels like the cxl_port driver is where all of this should be
centralized.


> +}
> +
> +static int cxl_setup_dport_ras(struct cxl_memdev *cxlmd, resource_size_t resource)
> +{
> +	struct cxl_register_map *map = &cxlmd->cxlds->ras_map;
> +	struct pci_dev *pdev = to_pci_dev(&cxlmd->dev);
> +
> +	if (!resource) {
> +		pr_err("%s():%d: RAS resource ptr is NULL\n", __func__, __LINE__);
> +		return  -EINVAL;
> +	}
> +
> +	map->base = devm_cxl_iomap_block(&pdev->dev, resource, CXL_CM_CAP_SIZE_RAS);
> +	if (!map->base)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +void cxl_pci_ras_init(struct cxl_memdev *cxlmd)
> +{
> +	resource_size_t cap;
> +
> +	/*
> +	 * TODO - CXL2.0 will need change to support PCI config space.
> +	 */
> +	if (!is_rcd(cxlmd))
> +		return;
> +
> +	cap = cxl_get_dport_ras_base(cxlmd);
> +	cxl_setup_dport_ras(cxlmd, cap);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_pci_ras_init, CXL);
> +
>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	struct cxl_register_map map;
> -- 
> 2.34.1
>
  
Terry Bowman Oct. 26, 2022, 7:01 p.m. UTC | #2
On 10/22/22 17:44, Dan Williams wrote:
> Terry Bowman wrote:
>> CXL RAS information resides in a RAS capability structure located in
>> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
>> specific error information that can be helpful in debugging. This
>> information is not currently logged but needs to be logged during PCIe AER
>> error handling.
>>
>> Update the CXL driver to find and cache a pointer to the CXL RAS
>> capability. The RAS registers resides in the downport's component register
>> block. Note:RAS registers are not in the upport. The component registers
>> can be found by first using the RCRB to goto the downport. Next, the
>> downport's 64-bit BAR[0] will point to the component register block.
>>
>> [1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/cxl.h    |  4 +++
>>  drivers/cxl/cxlmem.h |  1 +
>>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 77 insertions(+)
>>
>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>> index 7d507ab80a78..69b50131ad86 100644
>> --- a/drivers/cxl/cxl.h
>> +++ b/drivers/cxl/cxl.h
>> @@ -36,6 +36,10 @@
>>  #define   CXL_CM_CAP_CAP_ID_HDM 0x5
>>  #define   CXL_CM_CAP_CAP_HDM_VERSION 1
>>  
>> +/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
>> +#define CXL_CM_CAP_ID_RAS 0x2
>> +#define CXL_CM_CAP_SIZE_RAS 0x5C
>> +
>>  /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
>>  #define CXL_HDM_DECODER_CAP_OFFSET 0x0
>>  #define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>> index 079db2e15acc..515273e224ea 100644
>> --- a/drivers/cxl/cxlmem.h
>> +++ b/drivers/cxl/cxlmem.h
>> @@ -243,6 +243,7 @@ struct cxl_dev_state {
>>  	u64 next_persistent_bytes;
>>  
>>  	struct cxl_register_map aer_map;
>> +	struct cxl_register_map ras_map;
>>  
>>  	resource_size_t component_reg_phys;
>>  	u64 serial;
>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>> index 2287b5225862..7f717fb47a36 100644
>> --- a/drivers/cxl/pci.c
>> +++ b/drivers/cxl/pci.c
>> @@ -586,6 +586,78 @@ void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
>>  }
>>  EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
>>  
>> +static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
>> +{
>> +	resource_size_t component_reg_phys, offset = 0;
>> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>> +	void *cap_hdr_addr, *comp_reg_mapped;
>> +	u32 cap_hdr, ras_cap_hdr;
>> +	int cap_ndx;
>> +
>> +	comp_reg_mapped = ioremap(cxlds->component_reg_phys +
>> +				  CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
>> +	if (!comp_reg_mapped)
>> +		return 0;
>> +
>> +	cap_hdr_addr = comp_reg_mapped;
>> +	cap_hdr = readl(cap_hdr_addr);
>> +	for (cap_ndx = 0;
>> +	     cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
>> +	     cap_ndx++) {
>> +		ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
>> +
>> +		if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
>> +			pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
>> +				 ras_cap_hdr, cap_hdr_addr, cap_ndx);
>> +			break;
>> +		}
>> +	}
>> +
>> +	offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
>> +
>> +	iounmap(comp_reg_mapped);
>> +
>> +	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
>> +		return 0;
>> +
>> +	pr_debug("Found RAS capability @ %llX (%X)\n",
>> +		 component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
>> +
>> +	return component_reg_phys + offset;
> 
> For the RAS capability in the cxl_pci device this patch needs to be
> reconciled with this effort:
> 
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flinux-cxl%2F166336972295.3803215.1047199449525031921.stgit%40djiang5-desk3.ch.intel.com%2F&amp;data=05%7C01%7Cterry.bowman%40amd.com%7C33092f731a854d7a9a5b08dab47f1075%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638020755070581692%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=84oOACCwh4XtndFKOUV%2FltpJjFnp4lsFQgw75UMphHQ%3D&amp;reserved=0
> 
> I think we will want RCD and VH RAS capability reporting to happen in
> the same place, and that can not be cxl_pci because cxl_pci has no way
> to find the RAS registers on its own. It needs the help from cxl_mem to
> do the upstream cxl_port associtation first.
> 
> Given CXL switches will have their own RAS capabilities to report it
> feels like the cxl_port driver is where all of this should be
> centralized.
> 
> 

I'm working on merging the patchsets now.

I'm merging the following:
  Dave Jiang's onto 6.1.0-rc1+, provides RAS mapping.
  Roberts series ontop of Dave's, provides RCD discovery.
  And this patchset ontop of Robert's, provides AER and RAS logging

Regards,
Terry

>> +}
>> +
>> +static int cxl_setup_dport_ras(struct cxl_memdev *cxlmd, resource_size_t resource)
>> +{
>> +	struct cxl_register_map *map = &cxlmd->cxlds->ras_map;
>> +	struct pci_dev *pdev = to_pci_dev(&cxlmd->dev);
>> +
>> +	if (!resource) {
>> +		pr_err("%s():%d: RAS resource ptr is NULL\n", __func__, __LINE__);
>> +		return  -EINVAL;
>> +	}
>> +
>> +	map->base = devm_cxl_iomap_block(&pdev->dev, resource, CXL_CM_CAP_SIZE_RAS);
>> +	if (!map->base)
>> +		return -ENOMEM;
>> +
>> +	return 0;
>> +}
>> +
>> +void cxl_pci_ras_init(struct cxl_memdev *cxlmd)
>> +{
>> +	resource_size_t cap;
>> +
>> +	/*
>> +	 * TODO - CXL2.0 will need change to support PCI config space.
>> +	 */
>> +	if (!is_rcd(cxlmd))
>> +		return;
>> +
>> +	cap = cxl_get_dport_ras_base(cxlmd);
>> +	cxl_setup_dport_ras(cxlmd, cap);
>> +}
>> +EXPORT_SYMBOL_NS_GPL(cxl_pci_ras_init, CXL);
>> +
>>  static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>  {
>>  	struct cxl_register_map map;
>> -- 
>> 2.34.1
>>
> 
>
  
Dan Williams Oct. 27, 2022, 8:32 p.m. UTC | #3
Terry Bowman wrote:
> 
> 
> On 10/22/22 17:44, Dan Williams wrote:
> > Terry Bowman wrote:
> >> CXL RAS information resides in a RAS capability structure located in
> >> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
> >> specific error information that can be helpful in debugging. This
> >> information is not currently logged but needs to be logged during PCIe AER
> >> error handling.
> >>
> >> Update the CXL driver to find and cache a pointer to the CXL RAS
> >> capability. The RAS registers resides in the downport's component register
> >> block. Note:RAS registers are not in the upport. The component registers
> >> can be found by first using the RCRB to goto the downport. Next, the
> >> downport's 64-bit BAR[0] will point to the component register block.
> >>
> >> [1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'
> >>
> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> >> ---
> >>  drivers/cxl/cxl.h    |  4 +++
> >>  drivers/cxl/cxlmem.h |  1 +
> >>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 77 insertions(+)
> >>
> >> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> >> index 7d507ab80a78..69b50131ad86 100644
> >> --- a/drivers/cxl/cxl.h
> >> +++ b/drivers/cxl/cxl.h
> >> @@ -36,6 +36,10 @@
> >>  #define   CXL_CM_CAP_CAP_ID_HDM 0x5
> >>  #define   CXL_CM_CAP_CAP_HDM_VERSION 1
> >>  
> >> +/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
> >> +#define CXL_CM_CAP_ID_RAS 0x2
> >> +#define CXL_CM_CAP_SIZE_RAS 0x5C
> >> +
> >>  /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
> >>  #define CXL_HDM_DECODER_CAP_OFFSET 0x0
> >>  #define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
> >> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> >> index 079db2e15acc..515273e224ea 100644
> >> --- a/drivers/cxl/cxlmem.h
> >> +++ b/drivers/cxl/cxlmem.h
> >> @@ -243,6 +243,7 @@ struct cxl_dev_state {
> >>  	u64 next_persistent_bytes;
> >>  
> >>  	struct cxl_register_map aer_map;
> >> +	struct cxl_register_map ras_map;
> >>  
> >>  	resource_size_t component_reg_phys;
> >>  	u64 serial;
> >> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> >> index 2287b5225862..7f717fb47a36 100644
> >> --- a/drivers/cxl/pci.c
> >> +++ b/drivers/cxl/pci.c
> >> @@ -586,6 +586,78 @@ void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
> >>  }
> >>  EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
> >>  
> >> +static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
> >> +{
> >> +	resource_size_t component_reg_phys, offset = 0;
> >> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> >> +	void *cap_hdr_addr, *comp_reg_mapped;
> >> +	u32 cap_hdr, ras_cap_hdr;
> >> +	int cap_ndx;
> >> +
> >> +	comp_reg_mapped = ioremap(cxlds->component_reg_phys +
> >> +				  CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
> >> +	if (!comp_reg_mapped)
> >> +		return 0;
> >> +
> >> +	cap_hdr_addr = comp_reg_mapped;
> >> +	cap_hdr = readl(cap_hdr_addr);
> >> +	for (cap_ndx = 0;
> >> +	     cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
> >> +	     cap_ndx++) {
> >> +		ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
> >> +
> >> +		if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
> >> +			pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
> >> +				 ras_cap_hdr, cap_hdr_addr, cap_ndx);
> >> +			break;
> >> +		}
> >> +	}
> >> +
> >> +	offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
> >> +
> >> +	iounmap(comp_reg_mapped);
> >> +
> >> +	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
> >> +		return 0;
> >> +
> >> +	pr_debug("Found RAS capability @ %llX (%X)\n",
> >> +		 component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
> >> +
> >> +	return component_reg_phys + offset;
> > 
> > For the RAS capability in the cxl_pci device this patch needs to be
> > reconciled with this effort:
> > 
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flinux-cxl%2F166336972295.3803215.1047199449525031921.stgit%40djiang5-desk3.ch.intel.com%2F&amp;data=05%7C01%7Cterry.bowman%40amd.com%7C33092f731a854d7a9a5b08dab47f1075%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638020755070581692%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=84oOACCwh4XtndFKOUV%2FltpJjFnp4lsFQgw75UMphHQ%3D&amp;reserved=0
> > 
> > I think we will want RCD and VH RAS capability reporting to happen in
> > the same place, and that can not be cxl_pci because cxl_pci has no way
> > to find the RAS registers on its own. It needs the help from cxl_mem to
> > do the upstream cxl_port associtation first.
> > 
> > Given CXL switches will have their own RAS capabilities to report it
> > feels like the cxl_port driver is where all of this should be
> > centralized.
> > 
> > 
> 
> I'm working on merging the patchsets now.
> 
> I'm merging the following:
>   Dave Jiang's onto 6.1.0-rc1+, provides RAS mapping.

Sounds like I should add this to the RCH branch so you can build on it.

>   Roberts series ontop of Dave's, provides RCD discovery.

Robert's series is still pending the rework to drop the
devm_cxl_enumerate_ports() changes, not sure it's at a state where you
can reliably build on it.

>   And this patchset ontop of Robert's, provides AER and RAS logging

As long as you are expecting to do one more rebase on the final form of
Robert's series, sounds good.
  
Ariel.Sibley@microchip.com Oct. 28, 2022, 12:53 p.m. UTC | #4
> -----Original Message-----
> From: Terry Bowman <terry.bowman@amd.com>
> Sent: Friday, October 21, 2022 3:56 PM
> To: alison.schofield@intel.com; vishal.l.verma@intel.com; dave.jiang@intel.com; ira.weiny@intel.com;
> bwidawsk@kernel.org; dan.j.williams@intel.com
> Cc: terry.bowman@amd.com; linux-cxl@vger.kernel.org; linux-kernel@vger.kernel.org; bhelgaas@google.com;
> rafael@kernel.org; lenb@kernel.org; Jonathan.Cameron@huawei.com; dave@stgolabs.net; rrichter@amd.com
> Subject: [PATCH 3/5] cxl/pci: Discover and cache pointer to RCD dport's CXL RAS registers
> 
> CXL RAS information resides in a RAS capability structure located in
> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
> specific error information that can be helpful in debugging. This
> information is not currently logged but needs to be logged during PCIe AER
> error handling.
> 
> Update the CXL driver to find and cache a pointer to the CXL RAS
> capability. The RAS registers resides in the downport's component register
> block. Note:RAS registers are not in the upport. The component registers
> can be found by first using the RCRB to goto the downport. Next, the
> downport's 64-bit BAR[0] will point to the component register block.

I realize this patch is for dport only, but regarding "Note:RAS registers
are not in the upport.", the upstream port also has RAS registers.

Per CXL 3.0 Section 12.2.1.2 RCD Upstream Port-detected Errors:
"1. If a CXL.cache or CXL.mem logic block in UPZ detects a protocol or link
error, the block shall log the error in the CXL RAS Capability (see Section
8.2.4.16)."
  
Terry Bowman Oct. 28, 2022, 2:46 p.m. UTC | #5
Hi Ariel,


On 10/28/22 07:53, Ariel.Sibley@microchip.com wrote:
>> -----Original Message-----
>> From: Terry Bowman <terry.bowman@amd.com>
>> Sent: Friday, October 21, 2022 3:56 PM
>> To: alison.schofield@intel.com; vishal.l.verma@intel.com; dave.jiang@intel.com; ira.weiny@intel.com;
>> bwidawsk@kernel.org; dan.j.williams@intel.com
>> Cc: terry.bowman@amd.com; linux-cxl@vger.kernel.org; linux-kernel@vger.kernel.org; bhelgaas@google.com;
>> rafael@kernel.org; lenb@kernel.org; Jonathan.Cameron@huawei.com; dave@stgolabs.net; rrichter@amd.com
>> Subject: [PATCH 3/5] cxl/pci: Discover and cache pointer to RCD dport's CXL RAS registers
>>
>> CXL RAS information resides in a RAS capability structure located in
>> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
>> specific error information that can be helpful in debugging. This
>> information is not currently logged but needs to be logged during PCIe AER
>> error handling.
>>
>> Update the CXL driver to find and cache a pointer to the CXL RAS
>> capability. The RAS registers resides in the downport's component register
>> block. Note:RAS registers are not in the upport. The component registers
>> can be found by first using the RCRB to goto the downport. Next, the
>> downport's 64-bit BAR[0] will point to the component register block.
> 
> I realize this patch is for dport only, but regarding "Note:RAS registers
> are not in the upport.", the upstream port also has RAS registers.
> 

Correct. Thanks for pointing this out.

> Per CXL 3.0 Section 12.2.1.2 RCD Upstream Port-detected Errors:
> "1. If a CXL.cache or CXL.mem logic block in UPZ detects a protocol or link
> error, the block shall log the error in the CXL RAS Capability (see Section
> 8.2.4.16)."
  
Terry Bowman Oct. 31, 2022, 4:17 p.m. UTC | #6
On 10/27/22 15:32, Dan Williams wrote:
> Terry Bowman wrote:
>>
>>
>> On 10/22/22 17:44, Dan Williams wrote:
>>> Terry Bowman wrote:
>>>> CXL RAS information resides in a RAS capability structure located in
>>>> CXL.cache and CXL.mem registers.[1] The RAS capability provides CXL
>>>> specific error information that can be helpful in debugging. This
>>>> information is not currently logged but needs to be logged during PCIe AER
>>>> error handling.
>>>>
>>>> Update the CXL driver to find and cache a pointer to the CXL RAS
>>>> capability. The RAS registers resides in the downport's component register
>>>> block. Note:RAS registers are not in the upport. The component registers
>>>> can be found by first using the RCRB to goto the downport. Next, the
>>>> downport's 64-bit BAR[0] will point to the component register block.
>>>>
>>>> [1] CXL3.0 Spec, '8.2.5 CXL.cache and CXL.mem Registers'
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>> ---
>>>>  drivers/cxl/cxl.h    |  4 +++
>>>>  drivers/cxl/cxlmem.h |  1 +
>>>>  drivers/cxl/pci.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++
>>>>  3 files changed, 77 insertions(+)
>>>>
>>>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>>>> index 7d507ab80a78..69b50131ad86 100644
>>>> --- a/drivers/cxl/cxl.h
>>>> +++ b/drivers/cxl/cxl.h
>>>> @@ -36,6 +36,10 @@
>>>>  #define   CXL_CM_CAP_CAP_ID_HDM 0x5
>>>>  #define   CXL_CM_CAP_CAP_HDM_VERSION 1
>>>>  
>>>> +/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
>>>> +#define CXL_CM_CAP_ID_RAS 0x2
>>>> +#define CXL_CM_CAP_SIZE_RAS 0x5C
>>>> +
>>>>  /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
>>>>  #define CXL_HDM_DECODER_CAP_OFFSET 0x0
>>>>  #define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
>>>> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>>>> index 079db2e15acc..515273e224ea 100644
>>>> --- a/drivers/cxl/cxlmem.h
>>>> +++ b/drivers/cxl/cxlmem.h
>>>> @@ -243,6 +243,7 @@ struct cxl_dev_state {
>>>>  	u64 next_persistent_bytes;
>>>>  
>>>>  	struct cxl_register_map aer_map;
>>>> +	struct cxl_register_map ras_map;
>>>>  
>>>>  	resource_size_t component_reg_phys;
>>>>  	u64 serial;
>>>> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
>>>> index 2287b5225862..7f717fb47a36 100644
>>>> --- a/drivers/cxl/pci.c
>>>> +++ b/drivers/cxl/pci.c
>>>> @@ -586,6 +586,78 @@ void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
>>>>  }
>>>>  EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
>>>>  
>>>> +static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
>>>> +{
>>>> +	resource_size_t component_reg_phys, offset = 0;
>>>> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>>>> +	void *cap_hdr_addr, *comp_reg_mapped;
>>>> +	u32 cap_hdr, ras_cap_hdr;
>>>> +	int cap_ndx;
>>>> +
>>>> +	comp_reg_mapped = ioremap(cxlds->component_reg_phys +
>>>> +				  CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
>>>> +	if (!comp_reg_mapped)
>>>> +		return 0;
>>>> +
>>>> +	cap_hdr_addr = comp_reg_mapped;
>>>> +	cap_hdr = readl(cap_hdr_addr);
>>>> +	for (cap_ndx = 0;
>>>> +	     cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
>>>> +	     cap_ndx++) {
>>>> +		ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
>>>> +
>>>> +		if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
>>>> +			pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
>>>> +				 ras_cap_hdr, cap_hdr_addr, cap_ndx);
>>>> +			break;
>>>> +		}
>>>> +	}
>>>> +
>>>> +	offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
>>>> +
>>>> +	iounmap(comp_reg_mapped);
>>>> +
>>>> +	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
>>>> +		return 0;
>>>> +
>>>> +	pr_debug("Found RAS capability @ %llX (%X)\n",
>>>> +		 component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
>>>> +
>>>> +	return component_reg_phys + offset;
>>>
>>> For the RAS capability in the cxl_pci device this patch needs to be
>>> reconciled with this effort:
>>>
>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flinux-cxl%2F166336972295.3803215.1047199449525031921.stgit%40djiang5-desk3.ch.intel.com%2F&amp;data=05%7C01%7CTerry.Bowman%40amd.com%7C17f2392d8abc4fcc3d0608dab85a586e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638024995411582774%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=d64IsDCPTyPaBmsUYXlvn0rBMyaG8AxYHMuKHTSTOs8%3D&amp;reserved=0
>>>
>>> I think we will want RCD and VH RAS capability reporting to happen in
>>> the same place, and that can not be cxl_pci because cxl_pci has no way
>>> to find the RAS registers on its own. It needs the help from cxl_mem to
>>> do the upstream cxl_port associtation first.
>>>
>>> Given CXL switches will have their own RAS capabilities to report it
>>> feels like the cxl_port driver is where all of this should be
>>> centralized.
>>>
>>>
>>
>> I'm working on merging the patchsets now.
>>
>> I'm merging the following:
>>   Dave Jiang's onto 6.1.0-rc1+, provides RAS mapping.
> 
> Sounds like I should add this to the RCH branch so you can build on it.
> 
>>   Roberts series ontop of Dave's, provides RCD discovery.
> 
> Robert's series is still pending the rework to drop the
> devm_cxl_enumerate_ports() changes, not sure it's at a state where you
> can reliably build on it.
> 
>>   And this patchset ontop of Robert's, provides AER and RAS logging
> 
> As long as you are expecting to do one more rebase on the final form of
> Robert's series, sounds good.

I found there is some work to manually resolve merge conflicts between the 2 base 
patchsets. I will change directions and submit for review using Dave Jiang's 
patchset (using the URL you provided above). 

Regards,
Terry
  

Patch

diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 7d507ab80a78..69b50131ad86 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -36,6 +36,10 @@ 
 #define   CXL_CM_CAP_CAP_ID_HDM 0x5
 #define   CXL_CM_CAP_CAP_HDM_VERSION 1
 
+/* CXL 3.0 8.2.4.2 CXL RAS Capability Header */
+#define CXL_CM_CAP_ID_RAS 0x2
+#define CXL_CM_CAP_SIZE_RAS 0x5C
+
 /* HDM decoders CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure */
 #define CXL_HDM_DECODER_CAP_OFFSET 0x0
 #define   CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 079db2e15acc..515273e224ea 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -243,6 +243,7 @@  struct cxl_dev_state {
 	u64 next_persistent_bytes;
 
 	struct cxl_register_map aer_map;
+	struct cxl_register_map ras_map;
 
 	resource_size_t component_reg_phys;
 	u64 serial;
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 2287b5225862..7f717fb47a36 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -586,6 +586,78 @@  void cxl_pci_aer_init(struct cxl_memdev *cxlmd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_pci_aer_init, CXL);
 
+static resource_size_t cxl_get_dport_ras_base(struct cxl_memdev *cxlmd)
+{
+	resource_size_t component_reg_phys, offset = 0;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	void *cap_hdr_addr, *comp_reg_mapped;
+	u32 cap_hdr, ras_cap_hdr;
+	int cap_ndx;
+
+	comp_reg_mapped = ioremap(cxlds->component_reg_phys +
+				  CXL_CM_OFFSET, CXL_COMPONENT_REG_BLOCK_SIZE);
+	if (!comp_reg_mapped)
+		return 0;
+
+	cap_hdr_addr = comp_reg_mapped;
+	cap_hdr = readl(cap_hdr_addr);
+	for (cap_ndx = 0;
+	     cap_ndx < FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_hdr);
+	     cap_ndx++) {
+		ras_cap_hdr = readl(cap_hdr_addr + cap_ndx*sizeof(u32));
+
+		if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) == CXL_CM_CAP_ID_RAS) {
+			pr_debug("RAS cap header = %X @ %pa, cap_ndx = %d\n",
+				 ras_cap_hdr, cap_hdr_addr, cap_ndx);
+			break;
+		}
+	}
+
+	offset = CXL_CM_OFFSET + PCI_EXT_CAP_NEXT(ras_cap_hdr);
+
+	iounmap(comp_reg_mapped);
+
+	if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, ras_cap_hdr) != CXL_CM_CAP_ID_RAS)
+		return 0;
+
+	pr_debug("Found RAS capability @ %llX (%X)\n",
+		 component_reg_phys + offset, *((u32 *)(comp_reg_mapped + offset)));
+
+	return component_reg_phys + offset;
+}
+
+static int cxl_setup_dport_ras(struct cxl_memdev *cxlmd, resource_size_t resource)
+{
+	struct cxl_register_map *map = &cxlmd->cxlds->ras_map;
+	struct pci_dev *pdev = to_pci_dev(&cxlmd->dev);
+
+	if (!resource) {
+		pr_err("%s():%d: RAS resource ptr is NULL\n", __func__, __LINE__);
+		return  -EINVAL;
+	}
+
+	map->base = devm_cxl_iomap_block(&pdev->dev, resource, CXL_CM_CAP_SIZE_RAS);
+	if (!map->base)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void cxl_pci_ras_init(struct cxl_memdev *cxlmd)
+{
+	resource_size_t cap;
+
+	/*
+	 * TODO - CXL2.0 will need change to support PCI config space.
+	 */
+	if (!is_rcd(cxlmd))
+		return;
+
+	cap = cxl_get_dport_ras_base(cxlmd);
+	cxl_setup_dport_ras(cxlmd, cap);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_pci_ras_init, CXL);
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;