[v2,3/4] acpi/ghes, efi/cper: Recognize and process CXL Protocol Errors.

Message ID 20240109034755.100555-4-Smita.KoralahalliChannabasappa@amd.com
State New
Headers
Series acpi/ghes, cper, cxl: Trace FW-First CXL Protocol Errors |

Commit Message

Smita Koralahalli Jan. 9, 2024, 3:47 a.m. UTC
  UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.

Add GHES support to detect CXL CPER Protocol record and cache error
severity, device_id, serial number and CXL RAS capability struct in
struct cxl_cper_event_info.

Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
---
v2:
	Change to sub-struct for protocol error specific elements.
	Set serial number unconditionally.
	Copy entire cxl_ras_capability_regs struct rather than pointer.
	Calculate error severity in efi/cper and change to enum.
---
 drivers/acpi/apei/ghes.c        | 11 ++++++
 drivers/firmware/efi/cper_cxl.c | 68 +++++++++++++++++++++++++++++++++
 include/linux/cxl-event.h       | 13 +++++++
 3 files changed, 92 insertions(+)
  

Comments

Jonathan Cameron Feb. 15, 2024, 12:17 p.m. UTC | #1
On Tue, 9 Jan 2024 03:47:54 +0000
Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com> wrote:

> UEFI v2.10 section N.2.13 defines a CPER record for CXL Protocol errors.
> 
> Add GHES support to detect CXL CPER Protocol record and cache error
> severity, device_id, serial number and CXL RAS capability struct in
> struct cxl_cper_event_info.
> 
> Signed-off-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
> ---
> v2:
> 	Change to sub-struct for protocol error specific elements.
> 	Set serial number unconditionally.
> 	Copy entire cxl_ras_capability_regs struct rather than pointer.
> 	Calculate error severity in efi/cper and change to enum.
> ---
>  drivers/acpi/apei/ghes.c        | 11 ++++++
>  drivers/firmware/efi/cper_cxl.c | 68 +++++++++++++++++++++++++++++++++
>  include/linux/cxl-event.h       | 13 +++++++
>  3 files changed, 92 insertions(+)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 60b615d361d3..1d4f3d68a0bc 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -714,6 +714,14 @@ static void cxl_cper_post_event(enum cxl_event_type event_type,
>  		cper_callback(event_type, &info);
>  }
>  
> +void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
> +{
> +	struct cxl_cper_event_info info;
> +
> +	if (cxl_cper_handle_prot_err_info(gdata, &info))
> +		return;
> +}
> +
>  int cxl_cper_register_callback(cxl_cper_callback callback)
>  {
>  	guard(rwsem_write)(&cxl_cper_rw_sem);
> @@ -768,6 +776,9 @@ static bool ghes_do_proc(struct ghes *ghes,
>  		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
>  			queued = ghes_handle_arm_hw_error(gdata, sev);
>  		}
> +		else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
> +			cxl_cper_handle_prot_err(gdata);
> +		}
>  		else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
>  			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
>  
> diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
> index 4fd8d783993e..9b9b8c8f1157 100644
> --- a/drivers/firmware/efi/cper_cxl.c
> +++ b/drivers/firmware/efi/cper_cxl.c
> @@ -8,6 +8,7 @@
>   */
>  
>  #include <linux/cper.h>
> +#include <acpi/ghes.h>
>  #include "cper_cxl.h"
>  
>  #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
> @@ -44,6 +45,17 @@ enum {
>  	USP,	/* CXL Upstream Switch Port */
>  };
>  
> +static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
> +{
> +	switch (cper_severity) {
> +	case CPER_SEV_RECOVERABLE:
> +	case CPER_SEV_FATAL:
> +		return CXL_AER_UNCORRECTABLE;
> +	default:
> +		return CXL_AER_CORRECTABLE;
> +	}
> +}
> +
>  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
>  {
>  	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
> @@ -176,3 +188,59 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
>  			       sizeof(cxl_ras->header_log), 0);
>  	}
>  }
> +
> +int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
> +				  struct cxl_cper_event_info *info)
> +{
> +	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
> +	struct cper_cxl_event_devid *device_id = &info->rec.hdr.device_id;
> +	struct cper_cxl_event_sn *dev_serial_num =  &info->rec.hdr.dev_serial_num;
> +	size_t size = sizeof(*prot_err) + prot_err->dvsec_len;

It is not obvious what this is the size of.  I'd rename it to reflect that it
is only the offset to the end of the DVSEC copy, or just compute the pointer
below directly by putting this maths inline.

> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
> +		pr_err(FW_WARN "Not a valid protocol error log\n");
> +		return -EINVAL;
> +	}
> +
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
> +		pr_err(FW_WARN "Not a valid Device ID\n");
"No device ID\n"
would be a more accurate description.

I'd move this down to next to where we check that the data is valid, so
that each validity check sits next to the code it matters for rather than
in a bunch of checks up here.  (Mostly because I started writing that you
didn't check it was valid down there, before remembering this earlier
code :)

> +		return -EINVAL;
> +	}
> +
> +	/*
> +	 * Set device serial number unconditionally.
> +	 *
> +	 * Print a warning message if it is not valid. The device serial
> +	 * number is considered valid for CXL 1.1 device, CXL 2.0 device,
"is required for", perhaps?  These all got renamed in the CXL spec.  We should
use that naming because it deliberately avoids limiting to particular spec
versions:
	CXL RCD, CXL SLD, CXL LD, ...
> +	 * CXL 2.0 Logical device, or CXL 2.0 Fabric Manager Managed
> +	 * Logical Device.

Not sure what this is now called.. :(


> +	 */
> +	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
> +	      prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
> +		pr_warn(FW_WARN "Not a valid serial number\n");
> +
> +	dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
> +	dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
> +
> +	/*
> +	 * The device ID or agent address is only valid for CXL 1.1 device,
> +	 * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
> +	 * Managed Logical Device, CXL Root Port, CXL Downstream Switch
> +	 * Port, or CXL Upstream Switch Port.
> +	 */
> +	if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {

> +		device_id->segment_num = prot_err->agent_addr.segment;
> +		device_id->bus_num = prot_err->agent_addr.bus;
> +		device_id->device_num = prot_err->agent_addr.device;
> +		device_id->func_num = prot_err->agent_addr.function;
> +	} else {
> +		pr_err(FW_WARN "Not a valid agent type\n");
> +		return -EINVAL;
> +	}
> +
> +	info->p_err.cxl_ras = *(struct cxl_ras_capability_regs *)((long)prot_err + size);

Casting to a long isn't nice. Keep it as a pointer for this maths;
a u8 * or void * would work, particularly if you did it as something
a bit more self-documenting, like:

u8 *dvsec_start = (u8 *)(prot_err + 1);
u8 *cap_start = dvsec_start + prot_err->dvsec_len;

info->p_err.cxl_ras = *(struct cxl_ras_capability_regs *)cap_start;

> +
> +	info->p_err.severity = cper_severity_cxl_aer(gdata->error_severity);
> +
> +	return 0;
> +}
  

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 60b615d361d3..1d4f3d68a0bc 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -714,6 +714,14 @@  static void cxl_cper_post_event(enum cxl_event_type event_type,
 		cper_callback(event_type, &info);
 }
 
+void cxl_cper_handle_prot_err(struct acpi_hest_generic_data *gdata)
+{
+	struct cxl_cper_event_info info;
+
+	if (cxl_cper_handle_prot_err_info(gdata, &info))
+		return;
+}
+
 int cxl_cper_register_callback(cxl_cper_callback callback)
 {
 	guard(rwsem_write)(&cxl_cper_rw_sem);
@@ -768,6 +776,9 @@  static bool ghes_do_proc(struct ghes *ghes,
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev);
 		}
+		else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR)) {
+			cxl_cper_handle_prot_err(gdata);
+		}
 		else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
 			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
 
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 4fd8d783993e..9b9b8c8f1157 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -8,6 +8,7 @@ 
  */
 
 #include <linux/cper.h>
+#include <acpi/ghes.h>
 #include "cper_cxl.h"
 
 #define PROT_ERR_VALID_AGENT_TYPE		BIT_ULL(0)
@@ -44,6 +45,17 @@  enum {
 	USP,	/* CXL Upstream Switch Port */
 };
 
+static enum cxl_aer_err_type cper_severity_cxl_aer(int cper_severity)
+{
+	switch (cper_severity) {
+	case CPER_SEV_RECOVERABLE:
+	case CPER_SEV_FATAL:
+		return CXL_AER_UNCORRECTABLE;
+	default:
+		return CXL_AER_CORRECTABLE;
+	}
+}
+
 void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
 {
 	if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -176,3 +188,59 @@  void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
 			       sizeof(cxl_ras->header_log), 0);
 	}
 }
+
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+				  struct cxl_cper_event_info *info)
+{
+	struct cper_sec_prot_err *prot_err = acpi_hest_get_payload(gdata);
+	struct cper_cxl_event_devid *device_id = &info->rec.hdr.device_id;
+	struct cper_cxl_event_sn *dev_serial_num =  &info->rec.hdr.dev_serial_num;
+	size_t size = sizeof(*prot_err) + prot_err->dvsec_len;
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_ERROR_LOG)) {
+		pr_err(FW_WARN "Not a valid protocol error log\n");
+		return -EINVAL;
+	}
+
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_DEVICE_ID)) {
+		pr_err(FW_WARN "Not a valid Device ID\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Set device serial number unconditionally.
+	 *
+	 * Print a warning message if it is not valid. The device serial
+	 * number is considered valid for CXL 1.1 device, CXL 2.0 device,
+	 * CXL 2.0 Logical device, or CXL 2.0 Fabric Manager Managed
+	 * Logical Device.
+	 */
+	if (!(prot_err->valid_bits & PROT_ERR_VALID_SERIAL_NUMBER) ||
+	      prot_err->agent_type > 0x4 || prot_err->agent_type == RCH_DP)
+		pr_warn(FW_WARN "Not a valid serial number\n");
+
+	dev_serial_num->lower_dw = prot_err->dev_serial_num.lower_dw;
+	dev_serial_num->upper_dw = prot_err->dev_serial_num.upper_dw;
+
+	/*
+	 * The device ID or agent address is only valid for CXL 1.1 device,
+	 * CXL 2.0 device, CXL 2.0 Logical device, CXL 2.0 Fabric Manager
+	 * Managed Logical Device, CXL Root Port, CXL Downstream Switch
+	 * Port, or CXL Upstream Switch Port.
+	 */
+	if (prot_err->agent_type <= 0x7 && prot_err->agent_type != RCH_DP) {
+		device_id->segment_num = prot_err->agent_addr.segment;
+		device_id->bus_num = prot_err->agent_addr.bus;
+		device_id->device_num = prot_err->agent_addr.device;
+		device_id->func_num = prot_err->agent_addr.function;
+	} else {
+		pr_err(FW_WARN "Not a valid agent type\n");
+		return -EINVAL;
+	}
+
+	info->p_err.cxl_ras = *(struct cxl_ras_capability_regs *)((long)prot_err + size);
+
+	info->p_err.severity = cper_severity_cxl_aer(gdata->error_severity);
+
+	return 0;
+}
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 3a41dd5723e8..08e3979de9a3 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -152,13 +152,26 @@  struct cxl_ras_capability_regs {
 	u32 header_log[16];
 };
 
+enum cxl_aer_err_type {
+	CXL_AER_UNCORRECTABLE,
+	CXL_AER_CORRECTABLE,
+};
+
 struct cxl_cper_event_info {
 	struct cxl_cper_event_rec rec;
+	struct cxl_cper_prot_err {
+		struct cxl_ras_capability_regs cxl_ras;
+		int severity;
+	} p_err;
 };
 
 typedef void (*cxl_cper_callback)(enum cxl_event_type type,
 				  struct cxl_cper_event_info *info);
 
+struct acpi_hest_generic_data;
+int cxl_cper_handle_prot_err_info(struct acpi_hest_generic_data *gdata,
+				  struct cxl_cper_event_info *info);
+
 #ifdef CONFIG_ACPI_APEI_GHES
 int cxl_cper_register_callback(cxl_cper_callback callback);
 int cxl_cper_unregister_callback(cxl_cper_callback callback);