[RFC,v2,3/3] ACPI: APEI: EINJ: Add support for vendor defined error types

Message ID 20230525204422.4754-4-Avadhut.Naik@amd.com
State New
Headers
Series Add support for Vendor Defined Error Types in Einj Module |

Commit Message

Avadhut Naik May 25, 2023, 8:44 p.m. UTC
  Vendor-Defined Error types are supported by the platform apart from
standard error types if bit 31 is set in the output of GET_ERROR_TYPE
Error Injection Action.[1] While the errors themselves and the length
of their associated "OEM Defined data structure" might vary between
vendors, the physical address of this structure can be computed through
vendor_extension and length fields of "SET_ERROR_TYPE_WITH_ADDRESS" and
"Vendor Error Type Extension" Structures respectively.[2][3]

Currently, however, the einj module only computes the physical address of
Vendor Error Type Extension Structure. Neither does it compute the physical
address of OEM Defined structure nor does it establish the memory mapping
required for injecting Vendor-defined errors. Consequently, userspace
tools have to establish the very mapping through /dev/mem, nopat kernel
parameter and system calls like mmap/munmap initially before injecting
Vendor-defined errors.

Circumvent the issue by computing the physical address of OEM Defined data
structure and establishing the required mapping with the structure. Create
a new file "oem_error", if the system supports Vendor-defined errors, to
export this mapping, through debugfs_create_blob(). Userspace tools can
then populate their respective OEM Defined structure instances and just
write to the file as part of injecting Vendor-defined Errors.

[1] ACPI specification 6.5, section 18.6.4
[2] ACPI specification 6.5, Table 18.31
[3] ACPI specification 6.5, Table 18.32

Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
---
 drivers/acpi/apei/einj.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
  

Comments

Alexey Kardashevskiy June 8, 2023, 4:44 a.m. UTC | #1
On 26/5/23 06:44, Avadhut Naik wrote:
> Vendor-Defined Error types are supported by the platform apart from
> standard error types if bit 31 is set in the output of GET_ERROR_TYPE
> Error Injection Action.[1] While the errors themselves and the length
> of their associated "OEM Defined data structure" might vary between
> vendors, the physical address of this structure can be computed through
> vendor_extension and length fields of "SET_ERROR_TYPE_WITH_ADDRESS" and
> "Vendor Error Type Extension" Structures respectively.[2][3]
> 
> Currently, however, the einj module only computes the physical address of
> Vendor Error Type Extension Structure. Neither does it compute the physical
> address of OEM Defined structure nor does it establish the memory mapping
> required for injecting Vendor-defined errors. Consequently, userspace
> tools have to establish the very mapping through /dev/mem, nopat kernel
> parameter and system calls like mmap/munmap initially before injecting
> Vendor-defined errors.
> 
> Circumvent the issue by computing the physical address of OEM Defined data
> structure and establishing the required mapping with the structure. Create
> a new file "oem_error", if the system supports Vendor-defined errors, to
> export this mapping, through debugfs_create_blob(). Userspace tools can
> then populate their respective OEM Defined structure instances and just
> write to the file as part of injecting Vendor-defined Errors.
> 
> [1] ACPI specification 6.5, section 18.6.4
> [2] ACPI specification 6.5, Table 18.31
> [3] ACPI specification 6.5, Table 18.32
> 
> Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
> Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
> ---
>   drivers/acpi/apei/einj.c | 19 +++++++++++++++++++
>   1 file changed, 19 insertions(+)
> 
> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
> index d5f8dc4df7a5..9f23b6955cf0 100644
> --- a/drivers/acpi/apei/einj.c
> +++ b/drivers/acpi/apei/einj.c
> @@ -73,6 +73,7 @@ static u32 notrigger;
>   
>   static u32 vendor_flags;
>   static struct debugfs_blob_wrapper vendor_blob;
> +static struct debugfs_blob_wrapper vendor_errors;
>   static char vendor_dev[64];
>   
>   /*
> @@ -182,6 +183,16 @@ static int einj_timedout(u64 *t)
>   	return 0;
>   }
>   
> +static void get_oem_vendor_struct(u64 paddr, int offset,
> +				  struct vendor_error_type_extension *v)
> +{
> +	u64 target_pa = paddr + offset + sizeof(struct vendor_error_type_extension);
> +
> +	vendor_errors.size = v->length - sizeof(struct vendor_error_type_extension);
> +	if (vendor_errors.size)
> +		vendor_errors.data = acpi_os_map_iomem(target_pa, vendor_errors.size);


acpi_os_map_iomem() can return NULL but you check for the size (see 
below comments).

> +}
> +
>   static void check_vendor_extension(u64 paddr,
>   				   struct set_error_type_with_address *v5param)
>   {
> @@ -194,6 +205,7 @@ static void check_vendor_extension(u64 paddr,
>   	v = acpi_os_map_iomem(paddr + offset, sizeof(*v));
>   	if (!v)
>   		return;
> +	get_oem_vendor_struct(paddr, offset, v);
>   	sbdf = v->pcie_sbdf;
>   	sprintf(vendor_dev, "%x:%x:%x.%x vendor_id=%x device_id=%x rev_id=%x\n",
>   		sbdf >> 24, (sbdf >> 16) & 0xff,
> @@ -596,6 +608,7 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
>   	{0x00008000, "CXL.mem Protocol Correctable"},
>   	{0x00010000, "CXL.mem Protocol Uncorrectable non-fatal"},
>   	{0x00020000, "CXL.mem Protocol Uncorrectable fatal"},
> +	{0x80000000, "Vendor Defined Error Types"},
>   };
>   
>   static int available_error_type_show(struct seq_file *m, void *v)
> @@ -768,6 +781,10 @@ static int __init einj_init(void)
>   				   einj_debug_dir, &vendor_flags);
>   	}
>   
> +	if (vendor_errors.size)
> +		debugfs_create_blob("oem_error", 0200, einj_debug_dir,
> +				    &vendor_errors);

Here writing to "oem_error" will crash.


> +
>   	pr_info("Error INJection is initialized.\n");
>   
>   	return 0;
> @@ -793,6 +810,8 @@ static void __exit einj_exit(void)
>   			sizeof(struct einj_parameter);
>   
>   		acpi_os_unmap_iomem(einj_param, size);
> +		if (vendor_errors.size)
> +			acpi_os_unmap_iomem(vendor_errors.data, vendor_errors.size);

And here is will produce an error message I suppose.

Just change get_oem_vendor_struct() to store the size in a local 
variable and only copy it to vendor_errors.size if vendor_errors.data!=NULL.

With that bit fixed,

Reviewed-by: Alexey Kardashevskiy <aik@amd.com>



>   	}
>   	einj_exec_ctx_init(&ctx);
>   	apei_exec_post_unmap_gars(&ctx);
  
Naik, Avadhut June 8, 2023, 9:18 p.m. UTC | #2
Hi,
	Thanks for reviewing!

On 6/7/2023 23:44, Alexey Kardashevskiy wrote:
> 
> 
> On 26/5/23 06:44, Avadhut Naik wrote:
>> Vendor-Defined Error types are supported by the platform apart from
>> standard error types if bit 31 is set in the output of GET_ERROR_TYPE
>> Error Injection Action.[1] While the errors themselves and the length
>> of their associated "OEM Defined data structure" might vary between
>> vendors, the physical address of this structure can be computed through
>> vendor_extension and length fields of "SET_ERROR_TYPE_WITH_ADDRESS" and
>> "Vendor Error Type Extension" Structures respectively.[2][3]
>>
>> Currently, however, the einj module only computes the physical address of
>> Vendor Error Type Extension Structure. Neither does it compute the physical
>> address of OEM Defined structure nor does it establish the memory mapping
>> required for injecting Vendor-defined errors. Consequently, userspace
>> tools have to establish the very mapping through /dev/mem, nopat kernel
>> parameter and system calls like mmap/munmap initially before injecting
>> Vendor-defined errors.
>>
>> Circumvent the issue by computing the physical address of OEM Defined data
>> structure and establishing the required mapping with the structure. Create
>> a new file "oem_error", if the system supports Vendor-defined errors, to
>> export this mapping, through debugfs_create_blob(). Userspace tools can
>> then populate their respective OEM Defined structure instances and just
>> write to the file as part of injecting Vendor-defined Errors.
>>
>> [1] ACPI specification 6.5, section 18.6.4
>> [2] ACPI specification 6.5, Table 18.31
>> [3] ACPI specification 6.5, Table 18.32
>>
>> Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
>> Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
>> ---
>>   drivers/acpi/apei/einj.c | 19 +++++++++++++++++++
>>   1 file changed, 19 insertions(+)
>>
>> diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
>> index d5f8dc4df7a5..9f23b6955cf0 100644
>> --- a/drivers/acpi/apei/einj.c
>> +++ b/drivers/acpi/apei/einj.c
>> @@ -73,6 +73,7 @@ static u32 notrigger;
>>     static u32 vendor_flags;
>>   static struct debugfs_blob_wrapper vendor_blob;
>> +static struct debugfs_blob_wrapper vendor_errors;
>>   static char vendor_dev[64];
>>     /*
>> @@ -182,6 +183,16 @@ static int einj_timedout(u64 *t)
>>       return 0;
>>   }
>>   +static void get_oem_vendor_struct(u64 paddr, int offset,
>> +                  struct vendor_error_type_extension *v)
>> +{
>> +    u64 target_pa = paddr + offset + sizeof(struct vendor_error_type_extension);
>> +
>> +    vendor_errors.size = v->length - sizeof(struct vendor_error_type_extension);
>> +    if (vendor_errors.size)
>> +        vendor_errors.data = acpi_os_map_iomem(target_pa, vendor_errors.size);
> 
> 
> acpi_os_map_iomem() can return NULL but you check for the size (see below comments).
> 
>> +}
>> +
>>   static void check_vendor_extension(u64 paddr,
>>                      struct set_error_type_with_address *v5param)
>>   {
>> @@ -194,6 +205,7 @@ static void check_vendor_extension(u64 paddr,
>>       v = acpi_os_map_iomem(paddr + offset, sizeof(*v));
>>       if (!v)
>>           return;
>> +    get_oem_vendor_struct(paddr, offset, v);
>>       sbdf = v->pcie_sbdf;
>>       sprintf(vendor_dev, "%x:%x:%x.%x vendor_id=%x device_id=%x rev_id=%x\n",
>>           sbdf >> 24, (sbdf >> 16) & 0xff,
>> @@ -596,6 +608,7 @@ static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
>>       {0x00008000, "CXL.mem Protocol Correctable"},
>>       {0x00010000, "CXL.mem Protocol Uncorrectable non-fatal"},
>>       {0x00020000, "CXL.mem Protocol Uncorrectable fatal"},
>> +    {0x80000000, "Vendor Defined Error Types"},
>>   };
>>     static int available_error_type_show(struct seq_file *m, void *v)
>> @@ -768,6 +781,10 @@ static int __init einj_init(void)
>>                      einj_debug_dir, &vendor_flags);
>>       }
>>   +    if (vendor_errors.size)
>> +        debugfs_create_blob("oem_error", 0200, einj_debug_dir,
>> +                    &vendor_errors);
> 
> Here writing to "oem_error" will crash.
> 
> 
>> +
>>       pr_info("Error INJection is initialized.\n");
>>         return 0;
>> @@ -793,6 +810,8 @@ static void __exit einj_exit(void)
>>               sizeof(struct einj_parameter);
>>             acpi_os_unmap_iomem(einj_param, size);
>> +        if (vendor_errors.size)
>> +            acpi_os_unmap_iomem(vendor_errors.data, vendor_errors.size);
> 
> And here is will produce an error message I suppose.
> 
> Just change get_oem_vendor_struct() to store the size in a local variable and only copy it to vendor_errors.size if vendor_errors.data!=NULL.
> 
> With that bit fixed,
> 
> Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
> 
	Agreed. Will change the function to something like below:

static void get_oem_vendor_struct(u64 paddr, int offset,
                  struct vendor_error_type_extension *v)
{
    unsigned long vendor_size;  
    u64 target_pa = paddr + offset + sizeof(struct vendor_error_type_extension);

    vendor_size = v->length - sizeof(struct vendor_error_type_extension);

    if (vendor_size)
        vendor_errors.data = acpi_os_map_iomem(target_pa, vendor_size);

    if (vendor_errors.data)
        vendor_errors.size = vendor_size;
}

Thanks,
Avadhut Naik
> 
> 
>>       }
>>       einj_exec_ctx_init(&ctx);
>>       apei_exec_post_unmap_gars(&ctx);
> 

--
  

Patch

diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c
index d5f8dc4df7a5..9f23b6955cf0 100644
--- a/drivers/acpi/apei/einj.c
+++ b/drivers/acpi/apei/einj.c
@@ -73,6 +73,7 @@  static u32 notrigger;
 
 static u32 vendor_flags;
 static struct debugfs_blob_wrapper vendor_blob;
+static struct debugfs_blob_wrapper vendor_errors;
 static char vendor_dev[64];
 
 /*
@@ -182,6 +183,16 @@  static int einj_timedout(u64 *t)
 	return 0;
 }
 
+static void get_oem_vendor_struct(u64 paddr, int offset,
+				  struct vendor_error_type_extension *v)
+{
+	u64 target_pa = paddr + offset + sizeof(struct vendor_error_type_extension);
+
+	vendor_errors.size = v->length - sizeof(struct vendor_error_type_extension);
+	if (vendor_errors.size)
+		vendor_errors.data = acpi_os_map_iomem(target_pa, vendor_errors.size);
+}
+
 static void check_vendor_extension(u64 paddr,
 				   struct set_error_type_with_address *v5param)
 {
@@ -194,6 +205,7 @@  static void check_vendor_extension(u64 paddr,
 	v = acpi_os_map_iomem(paddr + offset, sizeof(*v));
 	if (!v)
 		return;
+	get_oem_vendor_struct(paddr, offset, v);
 	sbdf = v->pcie_sbdf;
 	sprintf(vendor_dev, "%x:%x:%x.%x vendor_id=%x device_id=%x rev_id=%x\n",
 		sbdf >> 24, (sbdf >> 16) & 0xff,
@@ -596,6 +608,7 @@  static struct { u32 mask; const char *str; } const einj_error_type_string[] = {
 	{0x00008000, "CXL.mem Protocol Correctable"},
 	{0x00010000, "CXL.mem Protocol Uncorrectable non-fatal"},
 	{0x00020000, "CXL.mem Protocol Uncorrectable fatal"},
+	{0x80000000, "Vendor Defined Error Types"},
 };
 
 static int available_error_type_show(struct seq_file *m, void *v)
@@ -768,6 +781,10 @@  static int __init einj_init(void)
 				   einj_debug_dir, &vendor_flags);
 	}
 
+	if (vendor_errors.size)
+		debugfs_create_blob("oem_error", 0200, einj_debug_dir,
+				    &vendor_errors);
+
 	pr_info("Error INJection is initialized.\n");
 
 	return 0;
@@ -793,6 +810,8 @@  static void __exit einj_exit(void)
 			sizeof(struct einj_parameter);
 
 		acpi_os_unmap_iomem(einj_param, size);
+		if (vendor_errors.size)
+			acpi_os_unmap_iomem(vendor_errors.data, vendor_errors.size);
 	}
 	einj_exec_ctx_init(&ctx);
 	apei_exec_post_unmap_gars(&ctx);