RAS/AMD/ATL: Add MI300 DRAM to Normalized address translation support

Message ID 20240131165732.88297-1-yazen.ghannam@amd.com
State New
Headers
Series RAS/AMD/ATL: Add MI300 DRAM to Normalized address translation support |

Commit Message

Yazen Ghannam Jan. 31, 2024, 4:57 p.m. UTC
  Modern AMD systems report DRAM ECC errors through Unified Memory
Controller (UMC) MCA banks. The value provided in MCA_ADDR is a
"Normalized" address which represents the UMC's view of its managed
memory. The Normalized address must be translated to a System Physical
address for software to take action.

MI300 systems, uniquely, do not provide a Normalized address in MCA_ADDR
for DRAM ECC errors. Rather, the "DRAM" address is reported. This value
includes identifiers for the Bank, Row, Column, Pseudochannel, and Stack
of the memory location.

The DRAM address must be converted to a Normalized address in order to
be further translated to a System Physical address.

Add helper functions to do the DRAM to Normalized translation for MI300
systems. The method is based on the fixed hardware layout of the on-chip
memory.

Also, cache the required hardware register values during module init.
These should not change during run time. And all UMCs should be
programmed identically.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Notes:

This is an almost complete rewrite of the following patch:
https://lore.kernel.org/r/20231129073521.2127403-4-muralimk@amd.com

I'd like to include Murali as co-developer, if that's okay with him,
since this is based on his earlier work. 

With this patch, MI300 address translation support should be complete.
The remaining work includes handling expanded page retirement cases.

 drivers/ras/amd/atl/internal.h |   1 +
 drivers/ras/amd/atl/system.c   |   6 +-
 drivers/ras/amd/atl/umc.c      | 199 ++++++++++++++++++++++++++++++++-
 3 files changed, 204 insertions(+), 2 deletions(-)


base-commit: a7b57372e1c5c848cbe9169574f07a9ee2177a1b
  

Comments

M K, Muralidhara Feb. 1, 2024, 7:03 a.m. UTC | #1
Hi Yazen,

On 1/31/2024 10:27 PM, Yazen Ghannam wrote:
> Modern AMD systems report DRAM ECC errors through Unified Memory
> Controller (UMC) MCA banks. The value provided in MCA_ADDR is a
> "Normalized" address which represents the UMC's view of its managed
> memory. The Normalized address must be translated to a System Physical
> address for software to take action.
> 
> MI300 systems, uniquely, do not provide a Normalized address in MCA_ADDR
> for DRAM ECC errors. Rather, the "DRAM" address is reported. This value
> includes identifiers for the Bank, Row, Column, Pseudochannel, and Stack
> of the memory location.
> 
> The DRAM address must be converted to a Normalized address in order to
> be further translated to a System Physical address.
> 
> Add helper functions to do the DRAM to Normalized translation for MI300
> systems. The method is based on the fixed hardware layout of the on-chip
> memory.
> 
> Also, cache the required hardware register values during module init.
> These should not change during run time. And all UMCs should be
> programmed identically.
> 
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> ---
> Notes:
> 
> This is an almost complete rewrite of the following patch:
> https://lore.kernel.org/r/20231129073521.2127403-4-muralimk@amd.com
> 
> I'd like to include Murali as co-developer, if that's okay with him,
> since this is based on his earlier work.
> 

Address translation support on MI300A is working as expected.
Please add me as codeveloped and tested-by
Muralidhara M K <muralidhara.mk@amd.com>


> With this patch, MI300 address translation support should be complete.
> The remaining work includes handling expanded page retirement cases.
> 
>   drivers/ras/amd/atl/internal.h |   1 +
>   drivers/ras/amd/atl/system.c   |   6 +-
>   drivers/ras/amd/atl/umc.c      | 199 ++++++++++++++++++++++++++++++++-
>   3 files changed, 204 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
> index 21d45755e5f2..5de69e0bb0f9 100644
> --- a/drivers/ras/amd/atl/internal.h
> +++ b/drivers/ras/amd/atl/internal.h
> @@ -224,6 +224,7 @@ int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo);
>   
>   int get_df_system_info(void);
>   int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num);
> +int get_addr_hash_mi300(void);
>   
>   int get_address_map(struct addr_ctx *ctx);
>   
> diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c
> index 46493ed405d6..701349e84942 100644
> --- a/drivers/ras/amd/atl/system.c
> +++ b/drivers/ras/amd/atl/system.c
> @@ -124,9 +124,13 @@ static int df4_determine_df_rev(u32 reg)
>   	if (reg == DF_FUNC0_ID_ZEN4_SERVER)
>   		df_cfg.flags.socket_id_shift_quirk = 1;
>   
> -	if (reg == DF_FUNC0_ID_MI300)
> +	if (reg == DF_FUNC0_ID_MI300) {
>   		df_cfg.flags.heterogeneous = 1;
>   
> +		if (get_addr_hash_mi300())
> +			return -EINVAL;
> +	}
> +
>   	return df4_get_fabric_id_mask_registers();
>   }
>   
> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
> index 7bfa21a582f0..816c2c5c683f 100644
> --- a/drivers/ras/amd/atl/umc.c
> +++ b/drivers/ras/amd/atl/umc.c
> @@ -49,6 +49,203 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
>   	return i;
>   }
>   
> +static u16 internal_bitwise_xor(u16 val)
> +{
> +	u16 tmp = 0;
> +	u8 i;
> +
> +	for (i = 0; i < 16; i++)
> +		tmp ^= (val >> i) & 0x1;
> +
> +	return tmp;
> +}
> +
> +struct xor_bits {
> +	bool	xor_enable;
> +	u16	col_xor;
> +	u32	row_xor;
> +};
> +
> +#define NUM_BANK_BITS	4
> +
> +static struct {
> +	/* Values from UMC::CH::AddrHashBank registers */
> +	struct xor_bits	bank[NUM_BANK_BITS];
> +
> +	/* Values from UMC::CH::AddrHashPC register */
> +	struct xor_bits	pc;
> +
> +	/* Value from UMC::CH::AddrHashPC2 register */
> +	u8		bank_xor;
> +} addr_hash;
> +
> +#define MI300_UMC_CH_BASE	0x90000
> +#define MI300_ADDR_HASH_BANK0	(MI300_UMC_CH_BASE + 0xC8)
> +#define MI300_ADDR_HASH_PC	(MI300_UMC_CH_BASE + 0xE0)
> +#define MI300_ADDR_HASH_PC2	(MI300_UMC_CH_BASE + 0xE4)
> +
> +#define ADDR_HASH_XOR_EN	BIT(0)
> +#define ADDR_HASH_COL_XOR	GENMASK(13, 1)
> +#define ADDR_HASH_ROW_XOR	GENMASK(31, 14)
> +#define ADDR_HASH_BANK_XOR	GENMASK(5, 0)
> +
> +/*
> + * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
> + * for hashing. Do this during module init, since the values will not
> + * change during run time.
> + *
> + * These registers are instantiated for each UMC across each AMD Node.
> + * However, they should be identically programmed due to the fixed hardware
> + * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
> + * single global structure for simplicity.
> + */
> +int get_addr_hash_mi300(void)
> +{
> +	u32 temp;
> +	int ret;
> +	u8 i;
> +
> +	for (i = 0; i < NUM_BANK_BITS; i++) {
> +		ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
> +		if (ret)
> +			return ret;
> +
> +		addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
> +		addr_hash.bank[i].col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
> +		addr_hash.bank[i].row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
> +	}
> +
> +	ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
> +	if (ret)
> +		return ret;
> +
> +	addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
> +	addr_hash.pc.col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
> +	addr_hash.pc.row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
> +
> +	ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
> +	if (ret)
> +		return ret;
> +
> +	addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
> +
> +	return 0;
> +}
> +
> +/*
> + * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
> + * be converted to the intermediate Normalized address (NA) before translating to a
> + * System Physical address.
> + *
> + * The DRAM address includes bank, row, and column. Also included are bits for
> + * Pseudochannel (PC) and Stack ID (SID).
> + *
> + * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
> + *
> + * The MCA address format is as follows:
> + *	MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
> + *
> + * The Normalized address format is fixed in hardware and is as follows:
> + *	NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
> + *
> + * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
> + * reconstructing the Normalized address.
> + */
> +#define MI300_UMC_MCA_COL	GENMASK(5, 1)
> +#define MI300_UMC_MCA_BANK	GENMASK(9, 6)
> +#define MI300_UMC_MCA_ROW	GENMASK(24, 10)
> +#define MI300_UMC_MCA_PC	BIT(25)
> +#define MI300_UMC_MCA_SID	GENMASK(27, 26)
> +
> +#define MI300_NA_COL_1_0	GENMASK(6, 5)
> +#define MI300_NA_PC		BIT(7)
> +#define MI300_NA_COL_3_2	GENMASK(9, 8)
> +#define MI300_NA_BANK_3_2	GENMASK(11, 10)
> +#define MI300_NA_BANK_1_0	GENMASK(13, 12)
> +#define MI300_NA_COL_4		BIT(14)
> +#define MI300_NA_ROW		GENMASK(28, 15)
> +#define MI300_NA_SID		GENMASK(30, 29)
> +
> +static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
> +{
> +	u16 i, col, row, bank, pc, sid, temp;
> +
> +	col  = FIELD_GET(MI300_UMC_MCA_COL,  addr);
> +	bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
> +	row  = FIELD_GET(MI300_UMC_MCA_ROW,  addr);
> +	pc   = FIELD_GET(MI300_UMC_MCA_PC,   addr);
> +	sid  = FIELD_GET(MI300_UMC_MCA_SID,  addr);
> +
> +	/* Calculate hash for each Bank bit. */
> +	for (i = 0; i < NUM_BANK_BITS; i++) {
> +		if (!addr_hash.bank[i].xor_enable)
> +			continue;
> +
> +		temp  = internal_bitwise_xor(col & addr_hash.bank[i].col_xor);
> +		temp ^= internal_bitwise_xor(row & addr_hash.bank[i].row_xor);
> +		bank ^= temp << i;
> +	}
> +
> +	/* Calculate hash for PC bit. */
> +	if (addr_hash.pc.xor_enable) {
> +		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
> +		bank |= sid << 5;
> +
> +		temp  = internal_bitwise_xor(col  & addr_hash.pc.col_xor);
> +		temp ^= internal_bitwise_xor(row  & addr_hash.pc.row_xor);
> +		temp ^= internal_bitwise_xor(bank & addr_hash.bank_xor);
> +		pc   ^= temp;
> +
> +		/* Drop SID bits for the sake of debug printing later. */
> +		bank &= 0x1F;
> +	}
> +
> +	/* Reconstruct the Normalized address starting with NA[4:0] = 0 */
> +	addr  = 0;
> +
> +	/* NA[6:5] = Column[1:0] */
> +	temp  = col & 0x3;
> +	addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
> +
> +	/* NA[7] = PC */
> +	addr |= FIELD_PREP(MI300_NA_PC, pc);
> +
> +	/* NA[9:8] = Column[3:2] */
> +	temp  = (col >> 2) & 0x3;
> +	addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
> +
> +	/* NA[11:10] = Bank[3:2] */
> +	temp  = (bank >> 2) & 0x3;
> +	addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
> +
> +	/* NA[13:12] = Bank[1:0] */
> +	temp  = bank & 0x3;
> +	addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
> +
> +	/* NA[14] = Column[4] */
> +	temp  = (col >> 4) & 0x1;
> +	addr |= FIELD_PREP(MI300_NA_COL_4, temp);
> +
> +	/* NA[28:15] = Row[13:0] */
> +	addr |= FIELD_PREP(MI300_NA_ROW, row);
> +
> +	/* NA[30:29] = SID[1:0] */
> +	addr |= FIELD_PREP(MI300_NA_SID, sid);
> +
> +	pr_debug("Addr=0x%016lx", addr);
> +	pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
> +
> +	return addr;
> +}
> +
> +static unsigned long get_addr(unsigned long addr)
> +{
> +	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
> +		return convert_dram_to_norm_addr_mi300(addr);
> +
> +	return addr;
> +}
> +
>   #define MCA_IPID_INST_ID_HI	GENMASK_ULL(47, 44)
>   static u8 get_die_id(struct atl_err *err)
>   {
> @@ -82,7 +279,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
>   {
>   	u8 socket_id = topology_physical_package_id(err->cpu);
>   	u8 coh_st_inst_id = get_coh_st_inst_id(err);
> -	unsigned long addr = err->addr;
> +	unsigned long addr = get_addr(err->addr);
>   	u8 die_id = get_die_id(err);
>   
>   	pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",
> 
> base-commit: a7b57372e1c5c848cbe9169574f07a9ee2177a1b
>
  
Borislav Petkov Feb. 1, 2024, 9:21 a.m. UTC | #2
On Wed, Jan 31, 2024 at 10:57:32AM -0600, Yazen Ghannam wrote:
> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
> index 7bfa21a582f0..816c2c5c683f 100644
> --- a/drivers/ras/amd/atl/umc.c
> +++ b/drivers/ras/amd/atl/umc.c
> @@ -49,6 +49,203 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
>  	return i;
>  }
>  
> +static u16 internal_bitwise_xor(u16 val)

Why are we calling it "internal"?
  
Yazen Ghannam Feb. 1, 2024, 2:35 p.m. UTC | #3
On 2/1/2024 4:21 AM, Borislav Petkov wrote:
> On Wed, Jan 31, 2024 at 10:57:32AM -0600, Yazen Ghannam wrote:
>> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
>> index 7bfa21a582f0..816c2c5c683f 100644
>> --- a/drivers/ras/amd/atl/umc.c
>> +++ b/drivers/ras/amd/atl/umc.c
>> @@ -49,6 +49,203 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
>>   	return i;
>>   }
>>   
>> +static u16 internal_bitwise_xor(u16 val)
> 
> Why are we calling it "internal"?
> 


It's an operation on the bits within a value rather than between two values.

BTW, I looked up "internal" in a thesaurus, and nothing seemed much better to me.

Maybe something like "xor_bits_in_value()"? This has the verb-first style too.

Thanks,
Yazen
  
Borislav Petkov Feb. 1, 2024, 3:05 p.m. UTC | #4
On Thu, Feb 01, 2024 at 09:35:13AM -0500, Yazen Ghannam wrote:
> It's an operation on the bits within a value rather than between two values.
> 
> BTW, I looked up "internal" in a thesaurus, and nothing seemed much better to me.
> 
> Maybe something like "xor_bits_in_value()"? This has the verb-first style too.

Ah, ok, easy:

---
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 67dc186a1226..7e310d1dfcfc 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -49,7 +49,8 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
 	return i;
 }
 
-static u16 internal_bitwise_xor(u16 val)
+/* XOR the bits in @val. */
+static u16 bitwise_xor_bits(u16 val)
 {
 	u16 tmp = 0;
 	u8 i;
@@ -181,8 +182,8 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 		if (!addr_hash.bank[i].xor_enable)
 			continue;
 
-		temp  = internal_bitwise_xor(col & addr_hash.bank[i].col_xor);
-		temp ^= internal_bitwise_xor(row & addr_hash.bank[i].row_xor);
+		temp  = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
+		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
 		bank ^= temp << i;
 	}
 
@@ -191,9 +192,9 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
 		bank |= sid << 5;
 
-		temp  = internal_bitwise_xor(col  & addr_hash.pc.col_xor);
-		temp ^= internal_bitwise_xor(row  & addr_hash.pc.row_xor);
-		temp ^= internal_bitwise_xor(bank & addr_hash.bank_xor);
+		temp  = bitwise_xor_bits(col  & addr_hash.pc.col_xor);
+		temp ^= bitwise_xor_bits(row  & addr_hash.pc.row_xor);
+		temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
 		pc   ^= temp;
 
 		/* Drop SID bits for the sake of debug printing later. */
  
Yazen Ghannam Feb. 1, 2024, 9:59 p.m. UTC | #5
On 2/1/2024 10:05 AM, Borislav Petkov wrote:
> On Thu, Feb 01, 2024 at 09:35:13AM -0500, Yazen Ghannam wrote:
>> It's an operation on the bits within a value rather than between two values.
>>
>> BTW, I looked up "internal" in a thesaurus, and nothing seemed much better to me.
>>
>> Maybe something like "xor_bits_in_value()"? This has the verb-first style too.
> 
> Ah, ok, easy:
> 
> ---
> diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
> index 67dc186a1226..7e310d1dfcfc 100644
> --- a/drivers/ras/amd/atl/umc.c
> +++ b/drivers/ras/amd/atl/umc.c
> @@ -49,7 +49,8 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
>   	return i;
>   }
>   
> -static u16 internal_bitwise_xor(u16 val)
> +/* XOR the bits in @val. */
> +static u16 bitwise_xor_bits(u16 val)
>   {
>   	u16 tmp = 0;
>   	u8 i;
> @@ -181,8 +182,8 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
>   		if (!addr_hash.bank[i].xor_enable)
>   			continue;
>   
> -		temp  = internal_bitwise_xor(col & addr_hash.bank[i].col_xor);
> -		temp ^= internal_bitwise_xor(row & addr_hash.bank[i].row_xor);
> +		temp  = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
> +		temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
>   		bank ^= temp << i;
>   	}
>   
> @@ -191,9 +192,9 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
>   		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
>   		bank |= sid << 5;
>   
> -		temp  = internal_bitwise_xor(col  & addr_hash.pc.col_xor);
> -		temp ^= internal_bitwise_xor(row  & addr_hash.pc.row_xor);
> -		temp ^= internal_bitwise_xor(bank & addr_hash.bank_xor);
> +		temp  = bitwise_xor_bits(col  & addr_hash.pc.col_xor);
> +		temp ^= bitwise_xor_bits(row  & addr_hash.pc.row_xor);
> +		temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
>   		pc   ^= temp;
>   
>   		/* Drop SID bits for the sake of debug printing later. */
> 
> 

Yep, easy :) Looks good to me.

Thanks,
Yazen
  

Patch

diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
index 21d45755e5f2..5de69e0bb0f9 100644
--- a/drivers/ras/amd/atl/internal.h
+++ b/drivers/ras/amd/atl/internal.h
@@ -224,6 +224,7 @@  int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo);
 
 int get_df_system_info(void);
 int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num);
+int get_addr_hash_mi300(void);
 
 int get_address_map(struct addr_ctx *ctx);
 
diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c
index 46493ed405d6..701349e84942 100644
--- a/drivers/ras/amd/atl/system.c
+++ b/drivers/ras/amd/atl/system.c
@@ -124,9 +124,13 @@  static int df4_determine_df_rev(u32 reg)
 	if (reg == DF_FUNC0_ID_ZEN4_SERVER)
 		df_cfg.flags.socket_id_shift_quirk = 1;
 
-	if (reg == DF_FUNC0_ID_MI300)
+	if (reg == DF_FUNC0_ID_MI300) {
 		df_cfg.flags.heterogeneous = 1;
 
+		if (get_addr_hash_mi300())
+			return -EINVAL;
+	}
+
 	return df4_get_fabric_id_mask_registers();
 }
 
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index 7bfa21a582f0..816c2c5c683f 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -49,6 +49,203 @@  static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
 	return i;
 }
 
+static u16 internal_bitwise_xor(u16 val)
+{
+	u16 tmp = 0;
+	u8 i;
+
+	for (i = 0; i < 16; i++)
+		tmp ^= (val >> i) & 0x1;
+
+	return tmp;
+}
+
+struct xor_bits {
+	bool	xor_enable;
+	u16	col_xor;
+	u32	row_xor;
+};
+
+#define NUM_BANK_BITS	4
+
+static struct {
+	/* Values from UMC::CH::AddrHashBank registers */
+	struct xor_bits	bank[NUM_BANK_BITS];
+
+	/* Values from UMC::CH::AddrHashPC register */
+	struct xor_bits	pc;
+
+	/* Value from UMC::CH::AddrHashPC2 register */
+	u8		bank_xor;
+} addr_hash;
+
+#define MI300_UMC_CH_BASE	0x90000
+#define MI300_ADDR_HASH_BANK0	(MI300_UMC_CH_BASE + 0xC8)
+#define MI300_ADDR_HASH_PC	(MI300_UMC_CH_BASE + 0xE0)
+#define MI300_ADDR_HASH_PC2	(MI300_UMC_CH_BASE + 0xE4)
+
+#define ADDR_HASH_XOR_EN	BIT(0)
+#define ADDR_HASH_COL_XOR	GENMASK(13, 1)
+#define ADDR_HASH_ROW_XOR	GENMASK(31, 14)
+#define ADDR_HASH_BANK_XOR	GENMASK(5, 0)
+
+/*
+ * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
+ * for hashing. Do this during module init, since the values will not
+ * change during run time.
+ *
+ * These registers are instantiated for each UMC across each AMD Node.
+ * However, they should be identically programmed due to the fixed hardware
+ * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
+ * single global structure for simplicity.
+ */
+int get_addr_hash_mi300(void)
+{
+	u32 temp;
+	int ret;
+	u8 i;
+
+	for (i = 0; i < NUM_BANK_BITS; i++) {
+		ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
+		if (ret)
+			return ret;
+
+		addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
+		addr_hash.bank[i].col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
+		addr_hash.bank[i].row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
+	}
+
+	ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
+	if (ret)
+		return ret;
+
+	addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN,  temp);
+	addr_hash.pc.col_xor    = FIELD_GET(ADDR_HASH_COL_XOR, temp);
+	addr_hash.pc.row_xor    = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
+
+	ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
+	if (ret)
+		return ret;
+
+	addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
+
+	return 0;
+}
+
+/*
+ * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
+ * be converted to the intermediate Normalized address (NA) before translating to a
+ * System Physical address.
+ *
+ * The DRAM address includes bank, row, and column. Also included are bits for
+ * Pseudochannel (PC) and Stack ID (SID).
+ *
+ * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
+ *
+ * The MCA address format is as follows:
+ *	MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
+ *
+ * The Normalized address format is fixed in hardware and is as follows:
+ *	NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
+ *
+ * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
+ * reconstructing the Normalized address.
+ */
+#define MI300_UMC_MCA_COL	GENMASK(5, 1)
+#define MI300_UMC_MCA_BANK	GENMASK(9, 6)
+#define MI300_UMC_MCA_ROW	GENMASK(24, 10)
+#define MI300_UMC_MCA_PC	BIT(25)
+#define MI300_UMC_MCA_SID	GENMASK(27, 26)
+
+#define MI300_NA_COL_1_0	GENMASK(6, 5)
+#define MI300_NA_PC		BIT(7)
+#define MI300_NA_COL_3_2	GENMASK(9, 8)
+#define MI300_NA_BANK_3_2	GENMASK(11, 10)
+#define MI300_NA_BANK_1_0	GENMASK(13, 12)
+#define MI300_NA_COL_4		BIT(14)
+#define MI300_NA_ROW		GENMASK(28, 15)
+#define MI300_NA_SID		GENMASK(30, 29)
+
+static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
+{
+	u16 i, col, row, bank, pc, sid, temp;
+
+	col  = FIELD_GET(MI300_UMC_MCA_COL,  addr);
+	bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
+	row  = FIELD_GET(MI300_UMC_MCA_ROW,  addr);
+	pc   = FIELD_GET(MI300_UMC_MCA_PC,   addr);
+	sid  = FIELD_GET(MI300_UMC_MCA_SID,  addr);
+
+	/* Calculate hash for each Bank bit. */
+	for (i = 0; i < NUM_BANK_BITS; i++) {
+		if (!addr_hash.bank[i].xor_enable)
+			continue;
+
+		temp  = internal_bitwise_xor(col & addr_hash.bank[i].col_xor);
+		temp ^= internal_bitwise_xor(row & addr_hash.bank[i].row_xor);
+		bank ^= temp << i;
+	}
+
+	/* Calculate hash for PC bit. */
+	if (addr_hash.pc.xor_enable) {
+		/* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
+		bank |= sid << 5;
+
+		temp  = internal_bitwise_xor(col  & addr_hash.pc.col_xor);
+		temp ^= internal_bitwise_xor(row  & addr_hash.pc.row_xor);
+		temp ^= internal_bitwise_xor(bank & addr_hash.bank_xor);
+		pc   ^= temp;
+
+		/* Drop SID bits for the sake of debug printing later. */
+		bank &= 0x1F;
+	}
+
+	/* Reconstruct the Normalized address starting with NA[4:0] = 0 */
+	addr  = 0;
+
+	/* NA[6:5] = Column[1:0] */
+	temp  = col & 0x3;
+	addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
+
+	/* NA[7] = PC */
+	addr |= FIELD_PREP(MI300_NA_PC, pc);
+
+	/* NA[9:8] = Column[3:2] */
+	temp  = (col >> 2) & 0x3;
+	addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
+
+	/* NA[11:10] = Bank[3:2] */
+	temp  = (bank >> 2) & 0x3;
+	addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
+
+	/* NA[13:12] = Bank[1:0] */
+	temp  = bank & 0x3;
+	addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
+
+	/* NA[14] = Column[4] */
+	temp  = (col >> 4) & 0x1;
+	addr |= FIELD_PREP(MI300_NA_COL_4, temp);
+
+	/* NA[28:15] = Row[13:0] */
+	addr |= FIELD_PREP(MI300_NA_ROW, row);
+
+	/* NA[30:29] = SID[1:0] */
+	addr |= FIELD_PREP(MI300_NA_SID, sid);
+
+	pr_debug("Addr=0x%016lx", addr);
+	pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
+
+	return addr;
+}
+
+static unsigned long get_addr(unsigned long addr)
+{
+	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+		return convert_dram_to_norm_addr_mi300(addr);
+
+	return addr;
+}
+
 #define MCA_IPID_INST_ID_HI	GENMASK_ULL(47, 44)
 static u8 get_die_id(struct atl_err *err)
 {
@@ -82,7 +279,7 @@  unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
 {
 	u8 socket_id = topology_physical_package_id(err->cpu);
 	u8 coh_st_inst_id = get_coh_st_inst_id(err);
-	unsigned long addr = err->addr;
+	unsigned long addr = get_addr(err->addr);
 	u8 die_id = get_die_id(err);
 
 	pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",