[1/2] RAS/AMD/ATL, EDAC/amd64: Move MI300 Row Retirement to ATL

Message ID 20240214033516.1344948-2-yazen.ghannam@amd.com
State New
Headers
Series FRU Memory Poison Manager |

Commit Message

Yazen Ghannam Feb. 14, 2024, 3:35 a.m. UTC
  DRAM row retirement depends on model-specific information, so it is best
done within the AMD Address Translation Library.

Export a generic wrapper function for other modules to use. Add any
model-specific helpers here.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 drivers/edac/Kconfig        |  1 -
 drivers/edac/amd64_edac.c   | 48 ----------------------------------
 drivers/ras/amd/atl/Kconfig |  1 +
 drivers/ras/amd/atl/umc.c   | 51 +++++++++++++++++++++++++++++++++++++
 include/linux/ras.h         |  2 ++
 5 files changed, 54 insertions(+), 49 deletions(-)
  

Comments

Borislav Petkov Feb. 14, 2024, 8:36 a.m. UTC | #1
On Tue, Feb 13, 2024 at 09:35:15PM -0600, Yazen Ghannam wrote:
> DRAM row retirement depends on model-specific information that is best
> done within the AMD Address Translation Library.
> 
> Export a generic wrapper function for other modules to use. Add any
> model-specific helpers here.
> 
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> ---
>  drivers/edac/Kconfig        |  1 -
>  drivers/edac/amd64_edac.c   | 48 ----------------------------------
>  drivers/ras/amd/atl/Kconfig |  1 +
>  drivers/ras/amd/atl/umc.c   | 51 +++++++++++++++++++++++++++++++++++++
>  include/linux/ras.h         |  2 ++
>  5 files changed, 54 insertions(+), 49 deletions(-)

So basically I can zap:

8be4984891e0 ("EDAC/amd64: Add MI300 row retirement support")

from

https://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git/log/?h=edac-amd-atl

and you can send me a patch which adds the row retirement straight to
the ATL?

Thx.
  
Yazen Ghannam Feb. 14, 2024, 2:19 p.m. UTC | #2
On 2/14/2024 3:36 AM, Borislav Petkov wrote:
> On Tue, Feb 13, 2024 at 09:35:15PM -0600, Yazen Ghannam wrote:
>> DRAM row retirement depends on model-specific information that is best
>> done within the AMD Address Translation Library.
>>
>> Export a generic wrapper function for other modules to use. Add any
>> model-specific helpers here.
>>
>> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
>> ---
>>   drivers/edac/Kconfig        |  1 -
>>   drivers/edac/amd64_edac.c   | 48 ----------------------------------
>>   drivers/ras/amd/atl/Kconfig |  1 +
>>   drivers/ras/amd/atl/umc.c   | 51 +++++++++++++++++++++++++++++++++++++
>>   include/linux/ras.h         |  2 ++
>>   5 files changed, 54 insertions(+), 49 deletions(-)
> 
> So basically I can zap:
> 
> 8be4984891e0 ("EDAC/amd64: Add MI300 row retirement support")
> 
> from
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git/log/?h=edac-amd-atl
> 
> and you can send me a patch which adds the row retirement straight to
> the ATL?
> 

Yes, that's fine.

Thanks,
Yazen
  
Borislav Petkov Feb. 14, 2024, 4:05 p.m. UTC | #3
On Wed, Feb 14, 2024 at 09:19:13AM -0500, Yazen Ghannam wrote:
> Yes, that's fine.

Easy peasy:

From 98ecd3942837df907fbf9ceff7e23f55e55e40b2 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Tue, 13 Feb 2024 21:35:15 -0600
Subject: [PATCH] RAS/AMD/ATL: Add MI300 row retirement support

DRAM row retirement depends on model-specific information, so it is best
done within the AMD Address Translation Library.

Export a generic wrapper function for other modules to use. Add any
model-specific helpers here.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
---
 drivers/ras/amd/atl/Kconfig |  1 +
 drivers/ras/amd/atl/umc.c   | 51 +++++++++++++++++++++++++++++++++++++
 include/linux/ras.h         |  2 ++
 3 files changed, 54 insertions(+)

diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig
index a43513a700f1..df49c23e7f62 100644
--- a/drivers/ras/amd/atl/Kconfig
+++ b/drivers/ras/amd/atl/Kconfig
@@ -10,6 +10,7 @@
 config AMD_ATL
 	tristate "AMD Address Translation Library"
 	depends on AMD_NB && X86_64 && RAS
+	depends on MEMORY_FAILURE
 	default N
 	help
 	  This library includes support for implementation-specific
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 7e310d1dfcfc..08c6dbd44c62 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 	return addr;
 }
 
+/*
+ * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
+ * all memory within that DRAM row. This applies to the memory with a DRAM
+ * bank.
+ *
+ * To find the memory addresses, loop through permutations of the DRAM column
+ * bits and find the System Physical address of each. The column bits are used
+ * to calculate the intermediate Normalized address, so all permutations should
+ * be checked.
+ *
+ * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
+ */
+#define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
+static void retire_row_mi300(struct atl_err *a_err)
+{
+	unsigned long addr;
+	struct page *p;
+	u8 col;
+
+	for (col = 0; col < MI300_NUM_COL; col++) {
+		a_err->addr &= ~MI300_UMC_MCA_COL;
+		a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
+
+		addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
+		if (IS_ERR_VALUE(addr))
+			continue;
+
+		addr = PHYS_PFN(addr);
+
+		/*
+		 * Skip invalid or already poisoned pages to avoid unnecessary
+		 * error messages from memory_failure().
+		 */
+		p = pfn_to_online_page(addr);
+		if (!p)
+			continue;
+
+		if (PageHWPoison(p))
+			continue;
+
+		memory_failure(addr, 0);
+	}
+}
+
+void amd_retire_dram_row(struct atl_err *a_err)
+{
+	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+		return retire_row_mi300(a_err);
+}
+EXPORT_SYMBOL_GPL(amd_retire_dram_row);
+
 static unsigned long get_addr(unsigned long addr)
 {
 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 09c632832bf1..a64182bc72ad 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -45,8 +45,10 @@ struct atl_err {
 #if IS_ENABLED(CONFIG_AMD_ATL)
 void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
 void amd_atl_unregister_decoder(void);
+void amd_retire_dram_row(struct atl_err *err);
 unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 #else
+static inline void amd_retire_dram_row(struct atl_err *err) { }
 static inline unsigned long
 amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 #endif /* CONFIG_AMD_ATL */
  

Patch

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 8b147403c955..16c8de5050e5 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -78,7 +78,6 @@  config EDAC_GHES
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64)"
 	depends on AMD_NB && EDAC_DECODE_MCE
-	depends on MEMORY_FAILURE
 	imply AMD_ATL
 	help
 	  Support for error detection and correction of DRAM ECC errors on
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index ee2f3ff15ab7..ca9a8641652d 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2795,51 +2795,6 @@  static void umc_get_err_info(struct mce *m, struct err_info *err)
 	err->csrow = m->synd & 0x7;
 }
 
-/*
- * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
- * all memory within that DRAM row. This applies to the memory with a DRAM
- * bank.
- *
- * To find the memory addresses, loop through permutations of the DRAM column
- * bits and find the System Physical address of each. The column bits are used
- * to calculate the intermediate Normalized address, so all permutations should
- * be checked.
- *
- * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
- */
-#define MI300_UMC_MCA_COL	GENMASK(5, 1)
-#define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
-static void retire_row_mi300(struct atl_err *a_err)
-{
-	unsigned long addr;
-	struct page *p;
-	u8 col;
-
-	for (col = 0; col < MI300_NUM_COL; col++) {
-		a_err->addr &= ~MI300_UMC_MCA_COL;
-		a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
-
-		addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
-		if (IS_ERR_VALUE(addr))
-			continue;
-
-		addr = PHYS_PFN(addr);
-
-		/*
-		 * Skip invalid or already poisoned pages to avoid unnecessary
-		 * error messages from memory_failure().
-		 */
-		p = pfn_to_online_page(addr);
-		if (!p)
-			continue;
-
-		if (PageHWPoison(p))
-			continue;
-
-		memory_failure(addr, 0);
-	}
-}
-
 static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
@@ -2890,9 +2845,6 @@  static void decode_umc_error(int node_id, struct mce *m)
 
 	error_address_to_page_and_offset(sys_addr, &err);
 
-	if (pvt->fam == 0x19 && pvt->dram_type == MEM_HBM3)
-		retire_row_mi300(&a_err);
-
 log_error:
 	__log_ecc_error(mci, &err, ecc_type);
 }
diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig
index a43513a700f1..df49c23e7f62 100644
--- a/drivers/ras/amd/atl/Kconfig
+++ b/drivers/ras/amd/atl/Kconfig
@@ -10,6 +10,7 @@ 
 config AMD_ATL
 	tristate "AMD Address Translation Library"
 	depends on AMD_NB && X86_64 && RAS
+	depends on MEMORY_FAILURE
 	default N
 	help
 	  This library includes support for implementation-specific
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 7e310d1dfcfc..08c6dbd44c62 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -239,6 +239,57 @@  static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 	return addr;
 }
 
+/*
+ * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
+ * all memory within that DRAM row. This applies to the memory with a DRAM
+ * bank.
+ *
+ * To find the memory addresses, loop through permutations of the DRAM column
+ * bits and find the System Physical address of each. The column bits are used
+ * to calculate the intermediate Normalized address, so all permutations should
+ * be checked.
+ *
+ * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
+ */
+#define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
+static void retire_row_mi300(struct atl_err *a_err)
+{
+	unsigned long addr;
+	struct page *p;
+	u8 col;
+
+	for (col = 0; col < MI300_NUM_COL; col++) {
+		a_err->addr &= ~MI300_UMC_MCA_COL;
+		a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
+
+		addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
+		if (IS_ERR_VALUE(addr))
+			continue;
+
+		addr = PHYS_PFN(addr);
+
+		/*
+		 * Skip invalid or already poisoned pages to avoid unnecessary
+		 * error messages from memory_failure().
+		 */
+		p = pfn_to_online_page(addr);
+		if (!p)
+			continue;
+
+		if (PageHWPoison(p))
+			continue;
+
+		memory_failure(addr, 0);
+	}
+}
+
+void amd_retire_dram_row(struct atl_err *a_err)
+{
+	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+		return retire_row_mi300(a_err);
+}
+EXPORT_SYMBOL_GPL(amd_retire_dram_row);
+
 static unsigned long get_addr(unsigned long addr)
 {
 	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 09c632832bf1..a64182bc72ad 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -45,8 +45,10 @@  struct atl_err {
 #if IS_ENABLED(CONFIG_AMD_ATL)
 void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
 void amd_atl_unregister_decoder(void);
+void amd_retire_dram_row(struct atl_err *err);
 unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 #else
+static inline void amd_retire_dram_row(struct atl_err *err) { }
 static inline unsigned long
 amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 #endif /* CONFIG_AMD_ATL */