@@ -77,6 +77,9 @@ (define_constraint "Y"
(define_register_constraint "v" "VGPR_REGS"
"VGPR registers")
+(define_register_constraint "a" "TARGET_CDNA1_PLUS ? AVGPR_REGS : NO_REGS"
+ "Accumulator VGPR registers")
+
(define_register_constraint "Sg" "SGPR_REGS"
"SGPR registers")
@@ -449,12 +449,16 @@ (define_insn "mov<mode>_unspec"
(set_attr "length" "0")])
(define_insn "*mov<mode>"
- [(set (match_operand:V_1REG 0 "nonimmediate_operand" "=v,v")
- (match_operand:V_1REG 1 "general_operand" "vA,B"))]
- ""
- "v_mov_b32\t%0, %1"
- [(set_attr "type" "vop1,vop1")
- (set_attr "length" "4,8")])
+ [(set (match_operand:V_1REG 0 "nonimmediate_operand")
+ (match_operand:V_1REG 1 "general_operand"))]
+ ""
+ {@ [cons: =0, 1; attrs: type, length, gcn_version]
+ [v ,vA;vop1 ,4,* ] v_mov_b32\t%0, %1
+ [v ,B ;vop1 ,8,* ] ^
+ [v ,a ;vop3p_mai,8,* ] v_accvgpr_read_b32\t%0, %1
+ [$a ,v ;vop3p_mai,8,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,4,cdna2] v_accvgpr_mov_b32\t%0, %1
+ })
(define_insn "mov<mode>_exec"
[(set (match_operand:V_1REG 0 "nonimmediate_operand")
@@ -493,17 +497,29 @@ (define_insn "mov<mode>_exec"
; (set_attr "length" "4,8,16,16")])
(define_insn "*mov<mode>"
- [(set (match_operand:V_2REG 0 "nonimmediate_operand" "=v")
- (match_operand:V_2REG 1 "general_operand" "vDB"))]
+ [(set (match_operand:V_2REG 0 "nonimmediate_operand" "=v, v,$a,a")
+ (match_operand:V_2REG 1 "general_operand" "vDB,a, v,a"))]
""
- {
- if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1]))
- return "v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1";
- else
- return "v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1";
- }
- [(set_attr "type" "vmult")
- (set_attr "length" "16")])
+ "@
+ * if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_mov_b32\t%L0, %L1\;v_mov_b32\t%H0, %H1\"; \
+ else \
+ return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
+ * if (REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_accvgpr_read_b32\t%L0, %L1\;v_accvgpr_read_b32\t%H0, %H1\"; \
+ else \
+ return \"v_accvgpr_read_b32\t%H0, %H1\;v_accvgpr_read_b32\t%L0, %L1\";
+ * if (REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_accvgpr_write_b32\t%L0, %L1\;v_accvgpr_write_b32\t%H0, %H1\"; \
+ else \
+ return \"v_accvgpr_write_b32\t%H0, %H1\;v_accvgpr_write_b32\t%L0, %L1\";
+ * if (REGNO (operands[0]) <= REGNO (operands[1])) \
+ return \"v_accvgpr_mov_b32\t%L0, %L1\;v_accvgpr_mov_b32\t%H0, %H1\"; \
+ else \
+ return \"v_accvgpr_mov_b32\t%H0, %H1\;v_accvgpr_mov_b32\t%L0, %L1\";"
+ [(set_attr "type" "vmult,vmult,vmult,vmult")
+ (set_attr "length" "16,16,16,8")
+ (set_attr "gcn_version" "*,*,*,cdna2")])
(define_insn "mov<mode>_exec"
[(set (match_operand:V_2REG 0 "nonimmediate_operand" "= v, v, v, v, m")
@@ -546,17 +562,15 @@ (define_insn "mov<mode>_exec"
(set_attr "length" "16,16,16,16,16")])
(define_insn "*mov<mode>_4reg"
- [(set (match_operand:V_4REG 0 "nonimmediate_operand" "=v")
- (match_operand:V_4REG 1 "general_operand" "vDB"))]
+ [(set (match_operand:V_4REG 0 "nonimmediate_operand")
+ (match_operand:V_4REG 1 "general_operand"))]
""
- {
- return "v_mov_b32\t%L0, %L1\;"
- "v_mov_b32\t%H0, %H1\;"
- "v_mov_b32\t%J0, %J1\;"
- "v_mov_b32\t%K0, %K1\;";
- }
- [(set_attr "type" "vmult")
- (set_attr "length" "16")])
+ {@ [cons: =0, 1; attrs: type, length, gcn_version]
+ [v,vDB;vmult,16,* ] v_mov_b32\t%L0, %L1\; v_mov_b32\t%H0, %H1\; v_mov_b32\t%J0, %J1\; v_mov_b32\t%K0, %K1
+ [v,a ;vmult,32,* ] v_accvgpr_read_b32\t%L0, %L1\; v_accvgpr_read_b32\t%H0, %H1\; v_accvgpr_read_b32\t%J0, %J1\; v_accvgpr_read_b32\t%K0, %K1
+ [a,v ;vmult,32,* ] v_accvgpr_write_b32\t%L0, %L1\;v_accvgpr_write_b32\t%H0, %H1\;v_accvgpr_write_b32\t%J0, %J1\;v_accvgpr_write_b32\t%K0, %K1
+ [a,a ;vmult,32,cdna2] v_accvgpr_mov_b32\t%L0, %L1\; v_accvgpr_mov_b32\t%H0, %H1\; v_accvgpr_mov_b32\t%J0, %J1\; v_accvgpr_mov_b32\t%K0, %K1
+ })
(define_insn "mov<mode>_exec"
[(set (match_operand:V_4REG 0 "nonimmediate_operand" "= v, v, v, v, m")
@@ -648,19 +662,21 @@ (define_insn "@mov<mode>_sgprbase"
UNSPEC_SGPRBASE))
(clobber (match_operand:<VnDI> 2 "register_operand"))]
"lra_in_progress || reload_completed"
- {@ [cons: =0, 1, =2; attrs: type, length]
- [v,vA,&v;vop1,4 ] v_mov_b32\t%0, %1
- [v,vB,&v;vop1,8 ] ^
- [v,m ,&v;* ,12] #
- [m,v ,&v;* ,12] #
+ {@ [cons: =0, 1, =2; attrs: type, length, gcn_version]
+ [v,vA,&v;vop1,4 ,* ] v_mov_b32\t%0, %1
+ [v,vB,&v;vop1,8 ,* ] ^
+ [v,m ,&v;* ,12,* ] #
+ [m,v ,&v;* ,12,* ] #
+ [a,m ,&v;* ,12,cdna2] #
+ [m,a ,&v;* ,12,cdna2] #
})
(define_insn "@mov<mode>_sgprbase"
- [(set (match_operand:V_2REG 0 "nonimmediate_operand" "= v, v, m")
+ [(set (match_operand:V_2REG 0 "nonimmediate_operand" "= v, v, m, a, m")
(unspec:V_2REG
- [(match_operand:V_2REG 1 "general_operand" "vDB, m, v")]
+ [(match_operand:V_2REG 1 "general_operand" "vDB, m, v, m, a")]
UNSPEC_SGPRBASE))
- (clobber (match_operand:<VnDI> 2 "register_operand" "=&v,&v,&v"))]
+ (clobber (match_operand:<VnDI> 2 "register_operand" "=&v,&v,&v,&v,&v"))]
"lra_in_progress || reload_completed"
"@
* if (!REG_P (operands[1]) || REGNO (operands[0]) <= REGNO (operands[1])) \
@@ -668,9 +684,12 @@ (define_insn "@mov<mode>_sgprbase"
else \
return \"v_mov_b32\t%H0, %H1\;v_mov_b32\t%L0, %L1\";
#
+ #
+ #
#"
- [(set_attr "type" "vmult,*,*")
- (set_attr "length" "8,12,12")])
+ [(set_attr "type" "vmult,*,*,*,*")
+ (set_attr "length" "8,12,12,12,12")
+ (set_attr "gcn_version" "*,*,*,cdna2,cdna2")])
(define_insn "@mov<mode>_sgprbase"
[(set (match_operand:V_4REG 0 "nonimmediate_operand")
@@ -1126,13 +1145,13 @@ (define_expand "gather<mode>_expr<exec>"
{})
(define_insn "gather<mode>_insn_1offset<exec>"
- [(set (match_operand:V_MOV 0 "register_operand" "=v")
+ [(set (match_operand:V_MOV 0 "register_operand" "=v,a")
(unspec:V_MOV
- [(plus:<VnDI> (match_operand:<VnDI> 1 "register_operand" " v")
+ [(plus:<VnDI> (match_operand:<VnDI> 1 "register_operand" " v,v")
(vec_duplicate:<VnDI>
- (match_operand 2 "immediate_operand" " n")))
- (match_operand 3 "immediate_operand" " n")
- (match_operand 4 "immediate_operand" " n")
+ (match_operand 2 "immediate_operand" " n,n")))
+ (match_operand 3 "immediate_operand" " n,n")
+ (match_operand 4 "immediate_operand" " n,n")
(mem:BLK (scratch))]
UNSPEC_GATHER))]
"(AS_FLAT_P (INTVAL (operands[3]))
@@ -1162,16 +1181,17 @@ (define_insn "gather<mode>_insn_1offset<exec>"
return buf;
}
[(set_attr "type" "flat")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
(define_insn "gather<mode>_insn_1offset_ds<exec>"
- [(set (match_operand:V_MOV 0 "register_operand" "=v")
+ [(set (match_operand:V_MOV 0 "register_operand" "=v,a")
(unspec:V_MOV
- [(plus:<VnSI> (match_operand:<VnSI> 1 "register_operand" " v")
+ [(plus:<VnSI> (match_operand:<VnSI> 1 "register_operand" " v,v")
(vec_duplicate:<VnSI>
- (match_operand 2 "immediate_operand" " n")))
- (match_operand 3 "immediate_operand" " n")
- (match_operand 4 "immediate_operand" " n")
+ (match_operand 2 "immediate_operand" " n,n")))
+ (match_operand 3 "immediate_operand" " n,n")
+ (match_operand 4 "immediate_operand" " n,n")
(mem:BLK (scratch))]
UNSPEC_GATHER))]
"(AS_ANY_DS_P (INTVAL (operands[3]))
@@ -1184,20 +1204,22 @@ (define_insn "gather<mode>_insn_1offset_ds<exec>"
return buf;
}
[(set_attr "type" "ds")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
(define_insn "gather<mode>_insn_2offsets<exec>"
- [(set (match_operand:V_MOV 0 "register_operand" "=v")
+ [(set (match_operand:V_MOV 0 "register_operand" "=v,a")
(unspec:V_MOV
[(plus:<VnDI>
(plus:<VnDI>
(vec_duplicate:<VnDI>
- (match_operand:DI 1 "register_operand" "Sv"))
+ (match_operand:DI 1 "register_operand" "Sv,Sv"))
(sign_extend:<VnDI>
- (match_operand:<VnSI> 2 "register_operand" " v")))
- (vec_duplicate:<VnDI> (match_operand 3 "immediate_operand" " n")))
- (match_operand 4 "immediate_operand" " n")
- (match_operand 5 "immediate_operand" " n")
+ (match_operand:<VnSI> 2 "register_operand" " v,v")))
+ (vec_duplicate:<VnDI> (match_operand 3 "immediate_operand"
+ " n,n")))
+ (match_operand 4 "immediate_operand" " n,n")
+ (match_operand 5 "immediate_operand" " n,n")
(mem:BLK (scratch))]
UNSPEC_GATHER))]
"(AS_GLOBAL_P (INTVAL (operands[4]))
@@ -1216,7 +1238,8 @@ (define_insn "gather<mode>_insn_2offsets<exec>"
return buf;
}
[(set_attr "type" "flat")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
(define_expand "scatter_store<mode><vnsi>"
[(match_operand:DI 0 "register_operand")
@@ -1255,12 +1278,12 @@ (define_expand "scatter<mode>_expr<exec_scatter>"
(define_insn "scatter<mode>_insn_1offset<exec_scatter>"
[(set (mem:BLK (scratch))
(unspec:BLK
- [(plus:<VnDI> (match_operand:<VnDI> 0 "register_operand" "v")
+ [(plus:<VnDI> (match_operand:<VnDI> 0 "register_operand" "v,v")
(vec_duplicate:<VnDI>
- (match_operand 1 "immediate_operand" "n")))
- (match_operand:V_MOV 2 "register_operand" "v")
- (match_operand 3 "immediate_operand" "n")
- (match_operand 4 "immediate_operand" "n")]
+ (match_operand 1 "immediate_operand" "n,n")))
+ (match_operand:V_MOV 2 "register_operand" "v,a")
+ (match_operand 3 "immediate_operand" "n,n")
+ (match_operand 4 "immediate_operand" "n,n")]
UNSPEC_SCATTER))]
"(AS_FLAT_P (INTVAL (operands[3]))
&& (INTVAL(operands[1]) == 0
@@ -1288,17 +1311,18 @@ (define_insn "scatter<mode>_insn_1offset<exec_scatter>"
return buf;
}
[(set_attr "type" "flat")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
(define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>"
[(set (mem:BLK (scratch))
(unspec:BLK
- [(plus:<VnSI> (match_operand:<VnSI> 0 "register_operand" "v")
+ [(plus:<VnSI> (match_operand:<VnSI> 0 "register_operand" "v,v")
(vec_duplicate:<VnSI>
- (match_operand 1 "immediate_operand" "n")))
- (match_operand:V_MOV 2 "register_operand" "v")
- (match_operand 3 "immediate_operand" "n")
- (match_operand 4 "immediate_operand" "n")]
+ (match_operand 1 "immediate_operand" "n,n")))
+ (match_operand:V_MOV 2 "register_operand" "v,a")
+ (match_operand 3 "immediate_operand" "n,n")
+ (match_operand 4 "immediate_operand" "n,n")]
UNSPEC_SCATTER))]
"(AS_ANY_DS_P (INTVAL (operands[3]))
&& ((unsigned HOST_WIDE_INT)INTVAL(operands[1]) < 0x10000))"
@@ -1310,7 +1334,8 @@ (define_insn "scatter<mode>_insn_1offset_ds<exec_scatter>"
return buf;
}
[(set_attr "type" "ds")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
(define_insn "scatter<mode>_insn_2offsets<exec_scatter>"
[(set (mem:BLK (scratch))
@@ -1318,13 +1343,13 @@ (define_insn "scatter<mode>_insn_2offsets<exec_scatter>"
[(plus:<VnDI>
(plus:<VnDI>
(vec_duplicate:<VnDI>
- (match_operand:DI 0 "register_operand" "Sv"))
+ (match_operand:DI 0 "register_operand" "Sv,Sv"))
(sign_extend:<VnDI>
- (match_operand:<VnSI> 1 "register_operand" " v")))
- (vec_duplicate:<VnDI> (match_operand 2 "immediate_operand" " n")))
- (match_operand:V_MOV 3 "register_operand" " v")
- (match_operand 4 "immediate_operand" " n")
- (match_operand 5 "immediate_operand" " n")]
+ (match_operand:<VnSI> 1 "register_operand" "v,v")))
+ (vec_duplicate:<VnDI> (match_operand 2 "immediate_operand" "n,n")))
+ (match_operand:V_MOV 3 "register_operand" "v,a")
+ (match_operand 4 "immediate_operand" "n,n")
+ (match_operand 5 "immediate_operand" "n,n")]
UNSPEC_SCATTER))]
"(AS_GLOBAL_P (INTVAL (operands[4]))
&& (((unsigned HOST_WIDE_INT)INTVAL(operands[2]) + 0x1000) < 0x2000))"
@@ -1341,7 +1366,8 @@ (define_insn "scatter<mode>_insn_2offsets<exec_scatter>"
return buf;
}
[(set_attr "type" "flat")
- (set_attr "length" "12")])
+ (set_attr "length" "12")
+ (set_attr "gcn_version" "*,cdna2")])
;; }}}
;; {{{ Permutations
@@ -96,6 +96,7 @@ static hash_map<tree, int> lds_allocs;
#define MAX_NORMAL_SGPR_COUNT 62 // i.e. 64 with VCC
#define MAX_NORMAL_VGPR_COUNT 24
+#define MAX_NORMAL_AVGPR_COUNT 24
/* }}} */
/* {{{ Initialization and options. */
@@ -483,7 +484,8 @@ gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
/* Scalar registers are 32bit, vector registers are in fact tuples of
64 lanes. */
- if (rclass == VGPR_REGS)
+ if (rclass == VGPR_REGS || rclass == AVGPR_REGS
+ || rclass == ALL_VGPR_REGS)
{
if (vgpr_1reg_mode_p (mode))
return 1;
@@ -583,7 +585,7 @@ gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
return (sgpr_1reg_mode_p (mode)
|| (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
|| (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
- if (VGPR_REGNO_P (regno))
+ if (VGPR_REGNO_P (regno) || (AVGPR_REGNO_P (regno) && TARGET_CDNA1_PLUS))
/* Vector instructions do not care about the alignment of register
pairs, but where there is no 64-bit instruction, many of the
define_split do not work if the input and output registers partially
@@ -623,6 +625,8 @@ gcn_regno_reg_class (int regno)
}
if (VGPR_REGNO_P (regno))
return VGPR_REGS;
+ if (AVGPR_REGNO_P (regno))
+ return AVGPR_REGS;
if (SGPR_REGNO_P (regno))
return SGPR_REGS;
if (regno < FIRST_VGPR_REG)
@@ -813,7 +817,7 @@ gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
|| c == VCC_CONDITIONAL_REG || c == EXEC_MASK_REG)
return SGPR_REGS;
else
- return NO_REGS;
+ return c == VGPR_REGS && TARGET_CDNA1_PLUS ? AVGPR_REGS : NO_REGS;
}
/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
@@ -2348,12 +2352,15 @@ gcn_sgpr_move_p (rtx op0, rtx op1)
return true;
if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
return true;
- if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
- || VGPR_REGNO_P (REGNO (op0)))
+ if (!REG_P (op0)
+ || REGNO (op0) >= FIRST_PSEUDO_REGISTER
+ || VGPR_REGNO_P (REGNO (op0))
+ || AVGPR_REGNO_P (REGNO (op0)))
return false;
if (REG_P (op1)
&& REGNO (op1) < FIRST_PSEUDO_REGISTER
- && !VGPR_REGNO_P (REGNO (op1)))
+ && !VGPR_REGNO_P (REGNO (op1))
+ && !AVGPR_REGNO_P (REGNO (op1)))
return true;
return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
}
@@ -2424,6 +2431,11 @@ gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
break;
}
+
+ /* CDNA1 doesn't have an instruction for going between the accumulator
+ registers and memory. Go via a VGPR in this case. */
+ if (TARGET_CDNA1 && rclass == AVGPR_REGS && result != VGPR_REGS)
+ result = VGPR_REGS;
}
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -2445,7 +2457,8 @@ gcn_conditional_register_usage (void)
if (cfun->machine->normal_function)
{
- /* Restrict the set of SGPRs and VGPRs used by non-kernel functions. */
+ /* Restrict the set of SGPRs, VGPRs and AVGPRs used by non-kernel
+ functions. */
for (int i = SGPR_REGNO (MAX_NORMAL_SGPR_COUNT);
i <= LAST_SGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
@@ -2454,6 +2467,9 @@ gcn_conditional_register_usage (void)
i <= LAST_VGPR_REG; i++)
fixed_regs[i] = 1, call_used_regs[i] = 1;
+ for (int i = AVGPR_REGNO (MAX_NORMAL_AVGPR_COUNT);
+ i <= LAST_AVGPR_REG; i++)
+ fixed_regs[i] = 1, call_used_regs[i] = 1;
return;
}
@@ -2507,6 +2523,16 @@ gcn_conditional_register_usage (void)
fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1;
}
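+/* Return true if X is a VGPR register, or an AVGPR register on devices
+   where AVGPRs can stand in for VGPRs (CDNA2 onwards).  */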
+static bool
+gcn_vgpr_equivalent_register_operand (rtx x, machine_mode mode)
+{
+ if (gcn_vgpr_register_operand (x, mode))
+ return true;
+ if (TARGET_CDNA2_PLUS && gcn_avgpr_register_operand (x, mode))
+ return true;
+ return false;
+}
+
/* Determine if a load or store is valid, according to the register classes
and address space. Used primarily by the machine description to decide
when to split a move into two steps. */
@@ -2515,21 +2541,36 @@ bool
gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
{
if (!MEM_P (dest) && !MEM_P (src))
- return true;
+ {
+ if (gcn_vgpr_register_operand (src, mode)
+ && gcn_avgpr_register_operand (dest, mode))
+ return true;
+ if (gcn_avgpr_register_operand (src, mode)
+ && gcn_vgpr_register_operand (dest, mode))
+ return true;
+ if (TARGET_CDNA2_PLUS
+ && gcn_avgpr_register_operand (src, mode)
+ && gcn_avgpr_register_operand (dest, mode))
+ return true;
+ if (gcn_avgpr_hard_register_operand (src, mode)
+ || gcn_avgpr_hard_register_operand (dest, mode))
+ return false;
+ return true;
+ }
if (MEM_P (dest)
&& AS_FLAT_P (MEM_ADDR_SPACE (dest))
&& (gcn_flat_address_p (XEXP (dest, 0), mode)
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
- && gcn_vgpr_register_operand (src, mode))
+ && gcn_vgpr_equivalent_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_FLAT_P (MEM_ADDR_SPACE (src))
&& (gcn_flat_address_p (XEXP (src, 0), mode)
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
- && gcn_vgpr_register_operand (dest, mode))
+ && gcn_vgpr_equivalent_register_operand (dest, mode))
return true;
if (MEM_P (dest)
@@ -2537,14 +2578,14 @@ gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
&& (gcn_global_address_p (XEXP (dest, 0))
|| GET_CODE (XEXP (dest, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (dest, 0)) == LABEL_REF)
- && gcn_vgpr_register_operand (src, mode))
+ && gcn_vgpr_equivalent_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_GLOBAL_P (MEM_ADDR_SPACE (src))
&& (gcn_global_address_p (XEXP (src, 0))
|| GET_CODE (XEXP (src, 0)) == SYMBOL_REF
|| GET_CODE (XEXP (src, 0)) == LABEL_REF)
- && gcn_vgpr_register_operand (dest, mode))
+ && gcn_vgpr_equivalent_register_operand (dest, mode))
return true;
if (MEM_P (dest)
@@ -2565,12 +2606,12 @@ gcn_valid_move_p (machine_mode mode, rtx dest, rtx src)
if (MEM_P (dest)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (dest))
&& gcn_ds_address_p (XEXP (dest, 0))
- && gcn_vgpr_register_operand (src, mode))
+ && gcn_vgpr_equivalent_register_operand (src, mode))
return true;
else if (MEM_P (src)
&& AS_ANY_DS_P (MEM_ADDR_SPACE (src))
&& gcn_ds_address_p (XEXP (src, 0))
- && gcn_vgpr_register_operand (dest, mode))
+ && gcn_vgpr_equivalent_register_operand (dest, mode))
return true;
return false;
@@ -3006,7 +3047,8 @@ gcn_compute_frame_offsets (void)
if ((df_regs_ever_live_p (regno) && !call_used_or_fixed_reg_p (regno))
|| ((regno & ~1) == HARD_FRAME_POINTER_REGNUM
&& frame_pointer_needed))
- offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4);
+ offsets->callee_saves += (VGPR_REGNO_P (regno)
+ || AVGPR_REGNO_P (regno) ? 256 : 4);
/* Round up to 64-bit boundary to maintain stack alignment. */
offsets->callee_saves = (offsets->callee_saves + 7) & ~7;
@@ -3949,6 +3991,11 @@ gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
if (in)
return (LOAD_COST + 2) * nregs;
return STORE_COST * nregs;
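+      /* AVGPRs are more expensive to access on CDNA1, where the data must
+	 be staged through a VGPR; from CDNA2 the cost matches VGPR_REGS.  */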
+ case AVGPR_REGS:
+ case ALL_VGPR_REGS:
+ if (in)
+ return (LOAD_COST + (TARGET_CDNA2_PLUS ? 2 : 4)) * nregs;
+ return (STORE_COST + (TARGET_CDNA2_PLUS ? 0 : 2)) * nregs;
case ALL_REGS:
case ALL_GPR_REGS:
case SRCDST_REGS:
@@ -3968,6 +4015,15 @@ gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in)
static int
gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src)
{
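+  /* AVGPRs can only be moved directly to and from VGPRs; any other move
+     needs an intermediate VGPR, and on CDNA1 even AVGPR-to-AVGPR copies
+     must be staged through a VGPR.  */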
+ if (src == AVGPR_REGS)
+ {
+ if (dst == AVGPR_REGS)
+ return TARGET_CDNA1 ? 6 : 2;
+ if (dst != VGPR_REGS)
+ return 6;
+ }
+ if (dst == AVGPR_REGS && src != VGPR_REGS)
+ return 6;
/* Increase cost of moving from and to vector registers. While this is
fast in hardware (I think), it has hidden cost of setting up the exec
flags. */
@@ -5674,6 +5730,7 @@ gcn_vmem_insn_p (attr_type type)
case TYPE_MUBUF:
case TYPE_MTBUF:
case TYPE_FLAT:
+ case TYPE_VOP3P_MAI:
return true;
case TYPE_UNKNOWN:
case TYPE_SOP1:
@@ -5913,7 +5970,8 @@ gcn_md_reorg (void)
FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
{
const_rtx x = *iter;
- if (REG_P (x) && VGPR_REGNO_P (REGNO (x)))
+ if (REG_P (x) && (VGPR_REGNO_P (REGNO (x))
+ || AVGPR_REGNO_P (REGNO (x))))
{
if (VECTOR_MODE_P (GET_MODE (x)))
{
@@ -6069,17 +6127,16 @@ gcn_md_reorg (void)
if (!prev_insn->insn)
continue;
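+	  /* Registers written by the previous insn and read by this one.  */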
+ HARD_REG_SET depregs = prev_insn->writes & ireads;
+
/* VALU writes SGPR followed by VMEM reading the same SGPR
requires 5 wait states. */
if ((prev_insn->age + nops_rqd) < 5
&& prev_insn->unit == UNIT_VECTOR
- && gcn_vmem_insn_p (itype))
- {
- HARD_REG_SET regs = prev_insn->writes & ireads;
- if (hard_reg_set_intersect_p
- (regs, reg_class_contents[(int) SGPR_REGS]))
- nops_rqd = 5 - prev_insn->age;
- }
+ && gcn_vmem_insn_p (itype)
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS]))
+ nops_rqd = 5 - prev_insn->age;
/* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ
requires 5 wait states. */
@@ -6101,15 +6158,12 @@ gcn_md_reorg (void)
SGPR/VCC as lane select requires 4 wait states. */
if ((prev_insn->age + nops_rqd) < 4
&& prev_insn->unit == UNIT_VECTOR
- && get_attr_laneselect (insn) == LANESELECT_YES)
- {
- HARD_REG_SET regs = prev_insn->writes & ireads;
- if (hard_reg_set_intersect_p
- (regs, reg_class_contents[(int) SGPR_REGS])
+ && get_attr_laneselect (insn) == LANESELECT_YES
+ && (hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) SGPR_REGS])
|| hard_reg_set_intersect_p
- (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG]))
- nops_rqd = 4 - prev_insn->age;
- }
+ (depregs, reg_class_contents[(int) VCC_CONDITIONAL_REG])))
+ nops_rqd = 4 - prev_insn->age;
/* VALU writes VGPR followed by VALU_DPP reading that VGPR
requires 2 wait states. */
@@ -6117,9 +6171,8 @@ gcn_md_reorg (void)
&& prev_insn->unit == UNIT_VECTOR
&& itype == TYPE_VOP_DPP)
{
- HARD_REG_SET regs = prev_insn->writes & ireads;
if (hard_reg_set_intersect_p
- (regs, reg_class_contents[(int) VGPR_REGS]))
+ (depregs, reg_class_contents[(int) VGPR_REGS]))
nops_rqd = 2 - prev_insn->age;
}
@@ -6138,6 +6191,35 @@ gcn_md_reorg (void)
(prev_insn->writes,
reg_class_contents[(int)VCC_CONDITIONAL_REG])))
nops_rqd = ivccwait - prev_insn->age;
+
+	      /* CDNA1: a VGPR write followed by a v_accvgpr_write reading
+		 that VGPR requires 2 wait states.  */
+ if (TARGET_CDNA1
+ && (prev_insn->age + nops_rqd) < 2
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) VGPR_REGS])
+ && hard_reg_set_intersect_p
+ (iwrites, reg_class_contents[(int) AVGPR_REGS]))
+ nops_rqd = 2 - prev_insn->age;
+
+	      /* CDNA1: a v_accvgpr_write writing an AVGPR followed by a
+		 v_accvgpr_read of that AVGPR requires 3 wait states.  */
+ if (TARGET_CDNA1
+ && (prev_insn->age + nops_rqd) < 3
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) AVGPR_REGS])
+ && hard_reg_set_intersect_p
+ (iwrites, reg_class_contents[(int) VGPR_REGS]))
+ nops_rqd = 3 - prev_insn->age;
+
+	      /* CDNA1: Undocumented(?!) read-after-write hazard when restoring
+		 values from AVGPRs to VGPRs.  The observed problem was with
+		 the address register of a flat_load instruction, but others
+		 may be affected too.  */
+ if (TARGET_CDNA1
+ && (prev_insn->age + nops_rqd) < 2
+ && hard_reg_set_intersect_p
+ (prev_insn->reads, reg_class_contents[(int) AVGPR_REGS])
+ && hard_reg_set_intersect_p
+ (depregs, reg_class_contents[(int) VGPR_REGS]))
+ nops_rqd = 2 - prev_insn->age;
}
/* Insert the required number of NOPs. */
@@ -6429,7 +6511,7 @@ output_file_start (void)
void
gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
{
- int sgpr, vgpr;
+ int sgpr, vgpr, avgpr;
bool xnack_enabled = TARGET_XNACK;
fputs ("\n\n", file);
@@ -6454,6 +6536,12 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr))
break;
vgpr++;
+ for (avgpr = 255; avgpr >= 0; avgpr--)
+ if (df_regs_ever_live_p (FIRST_AVGPR_REG + avgpr))
+ break;
+ avgpr++;
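+  /* The .amdhsa_accum_offset value emitted below must be a multiple of 4,
+     so round both register counts up.  */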
+ vgpr = (vgpr + 3) & ~3;
+ avgpr = (avgpr + 3) & ~3;
if (!leaf_function_p ())
{
@@ -6462,6 +6550,8 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
vgpr = MAX_NORMAL_VGPR_COUNT;
if (sgpr < MAX_NORMAL_SGPR_COUNT)
sgpr = MAX_NORMAL_SGPR_COUNT;
+ if (avgpr < MAX_NORMAL_AVGPR_COUNT)
+ avgpr = MAX_NORMAL_AVGPR_COUNT;
}
/* The gfx90a accum_offset field can't represent 0 registers. */
@@ -6519,6 +6609,11 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
? 2
: cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG)
? 1 : 0);
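+  /* CDNA1 has a separate AVGPR register file, so report whichever count is
+     larger; from CDNA2 the AVGPRs are allocated from the same pool as the
+     VGPRs, after them, so the two counts add up.  */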
+ int next_free_vgpr = vgpr;
+ if (TARGET_CDNA1 && avgpr > vgpr)
+ next_free_vgpr = avgpr;
+ if (TARGET_CDNA2_PLUS)
+ next_free_vgpr += avgpr;
fprintf (file,
"\t .amdhsa_next_free_vgpr\t%i\n"
"\t .amdhsa_next_free_sgpr\t%i\n"
@@ -6529,7 +6624,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
"\t .amdhsa_group_segment_fixed_size\t%u\n"
"\t .amdhsa_float_denorm_mode_32\t3\n"
"\t .amdhsa_float_denorm_mode_16_64\t3\n",
- vgpr,
+ next_free_vgpr,
sgpr,
xnack_enabled,
LDS_SIZE);
@@ -6537,7 +6632,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
fprintf (file,
"\t .amdhsa_accum_offset\t%i\n"
"\t .amdhsa_tg_split\t0\n",
- (vgpr+3)&~3); // I think this means the AGPRs come after the VGPRs
+ vgpr); /* The AGPRs come after the VGPRs. */
fputs ("\t.end_amdhsa_kernel\n", file);
#if 1
@@ -6564,9 +6659,9 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree)
cfun->machine->kernarg_segment_byte_size,
cfun->machine->kernarg_segment_alignment,
LDS_SIZE,
- sgpr, vgpr);
- if (gcn_arch == PROCESSOR_GFX90a)
- fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet
+ sgpr, next_free_vgpr);
+ if (gcn_arch == PROCESSOR_GFX90a || gcn_arch == PROCESSOR_GFX908)
+ fprintf (file, " .agpr_count: %i\n", avgpr);
fputs (" .end_amdgpu_metadata\n", file);
#endif
@@ -6662,6 +6757,9 @@ print_reg (FILE *file, rtx x)
else if (VGPR_REGNO_P (REGNO (x)))
fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
REGNO (x) - FIRST_VGPR_REG + 1);
+ else if (AVGPR_REGNO_P (REGNO (x)))
+ fprintf (file, "a[%i:%i]", REGNO (x) - FIRST_AVGPR_REG,
+ REGNO (x) - FIRST_AVGPR_REG + 1);
else if (REGNO (x) == FLAT_SCRATCH_REG)
fprintf (file, "flat_scratch");
else if (REGNO (x) == EXEC_REG)
@@ -6680,6 +6778,9 @@ print_reg (FILE *file, rtx x)
else if (VGPR_REGNO_P (REGNO (x)))
fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG,
REGNO (x) - FIRST_VGPR_REG + 3);
+ else if (AVGPR_REGNO_P (REGNO (x)))
+ fprintf (file, "a[%i:%i]", REGNO (x) - FIRST_AVGPR_REG,
+ REGNO (x) - FIRST_AVGPR_REG + 3);
else
gcc_unreachable ();
}
@@ -7603,6 +7704,8 @@ gcn_dwarf_register_number (unsigned int regno)
}
else if (VGPR_REGNO_P (regno))
return (regno - FIRST_VGPR_REG + 2560);
+ else if (AVGPR_REGNO_P (regno))
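+    /* The AMDGPU DWARF register numbering places the AccVGPRs at 3072,
+       following the VGPRs at 2560.  */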
+ return (regno - FIRST_AVGPR_REG + 3072);
/* Otherwise, there's nothing sensible to do. */
return regno + 100000;
@@ -146,6 +146,9 @@
#define FIRST_VGPR_REG 160
#define VGPR_REGNO(N) ((N)+FIRST_VGPR_REG)
#define LAST_VGPR_REG 415
+#define FIRST_AVGPR_REG 416
+#define AVGPR_REGNO(N) ((N)+FIRST_AVGPR_REG)
+#define LAST_AVGPR_REG 671
/* Frame Registers, and other registers */
@@ -157,10 +160,10 @@
#define RETURN_VALUE_REG 168 /* Must be divisible by 4. */
#define STATIC_CHAIN_REGNUM 30
#define WORK_ITEM_ID_Z_REG 162
-#define SOFT_ARG_REG 416
-#define FRAME_POINTER_REGNUM 418
-#define DWARF_LINK_REGISTER 420
-#define FIRST_PSEUDO_REGISTER 421
+#define SOFT_ARG_REG 672
+#define FRAME_POINTER_REGNUM 674
+#define DWARF_LINK_REGISTER 676
+#define FIRST_PSEUDO_REGISTER 677
#define FIRST_PARM_REG (FIRST_SGPR_REG + 24)
#define FIRST_VPARM_REG (FIRST_VGPR_REG + 8)
@@ -176,6 +179,7 @@
#define SGPR_OR_VGPR_REGNO_P(N) ((N)>=FIRST_VGPR_REG && (N) <= LAST_SGPR_REG)
#define SGPR_REGNO_P(N) ((N) <= LAST_SGPR_REG)
#define VGPR_REGNO_P(N) ((N)>=FIRST_VGPR_REG && (N) <= LAST_VGPR_REG)
+#define AVGPR_REGNO_P(N) ((N)>=FIRST_AVGPR_REG && (N) <= LAST_AVGPR_REG)
#define SSRC_REGNO_P(N) ((N) <= SCC_REG && (N) != VCCZ_REG)
#define SDST_REGNO_P(N) ((N) <= EXEC_HI_REG && (N) != VCCZ_REG)
#define CC_REG_P(X) (REG_P (X) && CC_REGNO_P (REGNO (X)))
@@ -206,7 +210,7 @@
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
- /* VGRPs */ \
+ /* VGPRs */ \
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
@@ -223,6 +227,23 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ /* Accumulation VGPRs */ \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
/* Other registers. */ \
1, 1, 1, 1, 1 \
}
@@ -244,7 +265,7 @@
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
- /* VGRPs */ \
+ /* VGPRs */ \
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
@@ -261,6 +282,23 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+ /* Accumulation VGPRs */ \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
/* Other registers. */ \
1, 1, 1, 1, 1 \
}
@@ -320,6 +358,8 @@ enum reg_class
SGPR_SRC_REGS,
GENERAL_REGS,
VGPR_REGS,
+ AVGPR_REGS,
+ ALL_VGPR_REGS,
ALL_GPR_REGS,
SRCDST_REGS,
AFP_REGS,
@@ -345,6 +385,8 @@ enum reg_class
"SGPR_SRC_REGS", \
"GENERAL_REGS", \
"VGPR_REGS", \
+ "AVGPR_REGS", \
+ "ALL_VGPR_REGS", \
"ALL_GPR_REGS", \
"SRCDST_REGS", \
"AFP_REGS", \
@@ -357,40 +399,58 @@ enum reg_class
#define REG_CLASS_CONTENTS { \
/* NO_REGS. */ \
{0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SCC_CONDITIONAL_REG. */ \
{0, 0, 0, 0, \
NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
- 0, 0, 0, 0, 0}, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
/* VCCZ_CONDITIONAL_REG. */ \
{0, 0, 0, NAMED_REG_MASK (VCCZ_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* VCC_CONDITIONAL_REG. */ \
{0, 0, 0, NAMED_REG_MASK (VCC_LO_REG)|NAMED_REG_MASK (VCC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* EXECZ_CONDITIONAL_REG. */ \
{0, 0, 0, 0, \
NAMED_REG_MASK2 (EXECZ_REG), 0, 0, 0, \
- 0, 0, 0, 0, 0}, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
/* ALL_CONDITIONAL_REGS. */ \
{0, 0, 0, NAMED_REG_MASK (VCCZ_REG), \
NAMED_REG_MASK2 (EXECZ_REG) | NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* EXEC_MASK_REG. */ \
{0, 0, 0, NAMED_REG_MASK (EXEC_LO_REG) | NAMED_REG_MASK (EXEC_HI_REG), \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SGPR_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SGPR_EXEC_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, \
0xf1 | NAMED_REG_MASK (EXEC_LO_REG) | NAMED_REG_MASK (EXEC_HI_REG), \
0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SGPR_VOP_SRC_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, \
@@ -398,12 +458,16 @@ enum reg_class
-NAMED_REG_MASK (EXEC_LO_REG) \
-NAMED_REG_MASK (EXEC_HI_REG), \
NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SGPR_MEM_SRC_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, \
0xffffffff-NAMED_REG_MASK (VCCZ_REG)-NAMED_REG_MASK (M0_REG) \
-NAMED_REG_MASK (EXEC_LO_REG)-NAMED_REG_MASK (EXEC_HI_REG), \
0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* SGPR_DST_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, \
@@ -413,30 +477,56 @@ enum reg_class
/* SGPR_SRC_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
NAMED_REG_MASK2 (EXECZ_REG) | NAMED_REG_MASK2 (SCC_REG), 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* GENERAL_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0}, \
/* VGPR_REGS. */ \
{0, 0, 0, 0, \
0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
+ /* AVGPR_REGS. */ \
+ {0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ /* ALL_VGPR_REGS. */ \
+ {0, 0, 0, 0, \
+ 0, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
/* ALL_GPR_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, 0xf1, \
0, 0xffffffff, 0xffffffff, 0xffffffff, \
- 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
/* SRCDST_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, \
0xffffffff-NAMED_REG_MASK (VCCZ_REG), \
0, 0xffffffff, 0xffffffff, 0xffffffff, \
- 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0}, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0}, \
/* AFP_REGS. */ \
{0, 0, 0, 0, \
+ 0, 0, 0, 0, \
+ 0, 0, 0, 0, \
0, 0, 0, 0, \
0, 0, 0, 0, 0, 0xf}, \
/* ALL_REGS. */ \
{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
+ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, \
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0 }}
@@ -541,6 +631,34 @@ enum gcn_address_spaces
"v236", "v237", "v238", "v239", "v240", "v241", "v242", "v243", "v244", \
"v245", "v246", "v247", "v248", "v249", "v250", "v251", "v252", "v253", \
"v254", "v255", \
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "a10", \
+ "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", "a20", \
+ "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", "a30", \
+ "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", "a40", \
+ "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", "a50", \
+ "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", "a60", \
+ "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", "a70", \
+ "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", "a80", \
+ "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", "a90", \
+ "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", "a100", \
+ "a101", "a102", "a103", "a104", "a105", "a106", "a107", "a108", "a109", \
+ "a110", "a111", "a112", "a113", "a114", "a115", "a116", "a117", "a118", \
+ "a119", "a120", "a121", "a122", "a123", "a124", "a125", "a126", "a127", \
+ "a128", "a129", "a130", "a131", "a132", "a133", "a134", "a135", "a136", \
+ "a137", "a138", "a139", "a140", "a141", "a142", "a143", "a144", "a145", \
+ "a146", "a147", "a148", "a149", "a150", "a151", "a152", "a153", "a154", \
+ "a155", "a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", \
+ "a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", "a172", \
+ "a173", "a174", "a175", "a176", "a177", "a178", "a179", "a180", "a181", \
+ "a182", "a183", "a184", "a185", "a186", "a187", "a188", "a189", "a190", \
+ "a191", "a192", "a193", "a194", "a195", "a196", "a197", "a198", "a199", \
+ "a200", "a201", "a202", "a203", "a204", "a205", "a206", "a207", "a208", \
+ "a209", "a210", "a211", "a212", "a213", "a214", "a215", "a216", "a217", \
+ "a218", "a219", "a220", "a221", "a222", "a223", "a224", "a225", "a226", \
+ "a227", "a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", \
+ "a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", "a244", \
+ "a245", "a246", "a247", "a248", "a249", "a250", "a251", "a252", "a253", \
+ "a254", "a255", \
"?ap0", "?ap1", "?fp0", "?fp1", "?dwlr" }
#define PRINT_OPERAND(FILE, X, CODE) print_operand(FILE, X, CODE)
@@ -51,13 +51,15 @@ (define_constants
(EXECZ_REG 128)
(SCC_REG 129)
(FIRST_VGPR_REG 160)
- (LAST_VGPR_REG 415)])
+ (LAST_VGPR_REG 415)
+ (FIRST_AVGPR_REG 416)
+ (LAST_AVGPR_REG 671)])
(define_constants
[(SP_REGNUM 16)
(LR_REGNUM 18)
- (AP_REGNUM 416)
- (FP_REGNUM 418)])
+ (AP_REGNUM 672)
+ (FP_REGNUM 674)])
(define_c_enum "unspecv" [
UNSPECV_PROLOGUE_USE
@@ -171,6 +173,11 @@ (define_c_enum "unspec" [
; vdst: vgpr0-255
; sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
;
+; vop3p_mai - vector, three inputs, one vector output
+;	 vsrc0,vsrc1,vsrc2: inline constant -16 to 64, fp inline immediate,
+; (acc or arch) vgpr0-255
+; vdst: (acc or arch) vgpr0-255
+;
; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
; src0: vgpr0-255
; dst_sel: BYTE_0-3, WORD_0-1, DWORD
@@ -229,7 +236,8 @@ (define_c_enum "unspec" [
(define_attr "type"
"unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
- vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
+ vop3a,vop3b,vop3p_mai,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,
+ vmult"
(const_string "unknown"))
; Set if instruction is executed in scalar or vector unit
@@ -237,7 +245,7 @@ (define_attr "type"
(define_attr "unit" "unknown,scalar,vector"
(cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
(const_string "scalar")
- (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
+ (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,vop3p_mai,
vop_sdwa,vop_dpp,flat,vmult")
(const_string "vector")]
(const_string "unknown")))
@@ -284,7 +292,7 @@ (define_attr "length" ""
; Disable alternatives that only apply to specific ISA variants.
-(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
+(define_attr "gcn_version" "gcn3,gcn5,cdna2" (const_string "gcn3"))
(define_attr "rdna" "any,no,yes" (const_string "any"))
(define_attr "enabled" ""
@@ -297,6 +305,9 @@ (define_attr "enabled" ""
(eq_attr "gcn_version" "gcn3") (const_int 1)
(and (eq_attr "gcn_version" "gcn5")
(ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
+ (const_int 1)
+ (and (eq_attr "gcn_version" "cdna2")
+ (ne (symbol_ref "TARGET_CDNA2_PLUS") (const_int 0)))
(const_int 1)]
(const_int 0)))
@@ -552,25 +563,32 @@ (define_insn "*mov<mode>_insn"
[(set (match_operand:SISF 0 "nonimmediate_operand")
(match_operand:SISF 1 "gcn_load_operand"))]
""
- {@ [cons: =0, 1; attrs: type, exec, length]
- [SD ,SSA ;sop1 ,* ,4 ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ] s_mov_b32\t%0, %1
- [SD ,RB ;smem ,* ,12] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
- [RB ,Sm ;smem ,* ,12] s_buffer_store%s1\t%1, s[0:3], %0
- [Sm ,RS ;smem ,* ,12] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [RS ,Sm ;smem ,* ,12] s_store_dword\t%1, %A0
- [v ,v ;vop1 ,* ,4 ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,8 ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,8 ] v_writelane_b32\t%0, %1, 0
- [v ,RF ;flat ,* ,12] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
- [RF ,v ;flat ,* ,12] flat_store_dword\t%A0, %1%O0%g0
- [v ,B ;vop1 ,* ,8 ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [SD ,Y ;sop1 ,* ,8 ] s_mov_b32\t%0, %1
- [v ,RM ;flat ,* ,12] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [RM ,v ;flat ,* ,12] global_store_dword\t%A0, %1%O0%g0
+ {@ [cons: =0, 1; attrs: type, exec, length, gcn_version]
+ [SD ,SSA ;sop1 ,* ,4 ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ] s_mov_b32\t%0, %1
+ [SD ,RB ;smem ,* ,12,* ] s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
+ [RB ,Sm ;smem ,* ,12,* ] s_buffer_store%s1\t%1, s[0:3], %0
+ [Sm ,RS ;smem ,* ,12,* ] s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [RS ,Sm ;smem ,* ,12,* ] s_store_dword\t%1, %A0
+ [v ,v ;vop1 ,* ,4 ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,8 ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,8 ,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,4,cdna2] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ] flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [^a ,RF ;flat ,* ,12,cdna2] ^
+ [RF ,v ;flat ,* ,12,* ] flat_store_dword\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2] ^
+ [v ,B ;vop1 ,* ,8 ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ] ds_write_b32\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ] ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [SD ,Y ;sop1 ,* ,8 ,* ] s_mov_b32\t%0, %1
+ [v ,RM ;flat ,* ,12,* ] global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [^a ,RM ;flat ,* ,12,cdna2] ^
+ [RM ,v ;flat ,* ,12,* ] global_store_dword\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2] ^
})
; 8/16bit move pattern
@@ -580,20 +598,27 @@ (define_insn "*mov<mode>_insn"
[(set (match_operand:QIHI 0 "nonimmediate_operand")
(match_operand:QIHI 1 "gcn_load_operand"))]
"gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
- {@ [cons: =0, 1; attrs: type, exec, length]
- [SD ,SSA ;sop1 ,* ,4 ] s_mov_b32\t%0, %1
- [SD ,J ;sopk ,* ,4 ] s_movk_i32\t%0, %1
- [SD ,B ;sop1 ,* ,8 ] s_mov_b32\t%0, %1
- [v ,v ;vop1 ,* ,4 ] v_mov_b32\t%0, %1
- [Sg ,v ;vop3a,none,4 ] v_readlane_b32\t%0, %1, 0
- [v ,Sv ;vop3a,none,4 ] v_writelane_b32\t%0, %1, 0
- [v ,RF ;flat ,* ,12] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
- [RF ,v ;flat ,* ,12] flat_store%s0\t%A0, %1%O0%g0
- [v ,B ;vop1 ,* ,8 ] v_mov_b32\t%0, %1
- [RLRG,v ;ds ,* ,12] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,* ,12] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,* ,12] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [RM ,v ;flat ,* ,12] global_store%s0\t%A0, %1%O0%g0
+ {@ [cons: =0, 1; attrs: type, exec, length, gcn_version]
+ [SD ,SSA ;sop1 ,* ,4 ,* ] s_mov_b32\t%0, %1
+ [SD ,J ;sopk ,* ,4 ,* ] s_movk_i32\t%0, %1
+ [SD ,B ;sop1 ,* ,8 ,* ] s_mov_b32\t%0, %1
+ [v ,v ;vop1 ,* ,4 ,* ] v_mov_b32\t%0, %1
+ [Sg ,v ;vop3a,none,4 ,* ] v_readlane_b32\t%0, %1, 0
+ [v ,Sv ;vop3a,none,4 ,* ] v_writelane_b32\t%0, %1, 0
+ [v ,^a ;vop3p_mai,*,8,* ] v_accvgpr_read_b32\t%0, %1
+ [a ,v ;vop3p_mai,*,8,* ] v_accvgpr_write_b32\t%0, %1
+ [a ,a ;vop1 ,* ,8,cdna2] v_accvgpr_mov_b32\t%0, %1
+ [v ,RF ;flat ,* ,12,* ] flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [^a ,RF ;flat ,* ,12,cdna2] ^
+ [RF ,v ;flat ,* ,12,* ] flat_store%s0\t%A0, %1%O0%g0
+ [RF ,a ;flat ,* ,12,cdna2] ^
+ [v ,B ;vop1 ,* ,8 ,* ] v_mov_b32\t%0, %1
+ [RLRG,v ;ds ,* ,12,* ] ds_write%b0\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,* ,12,* ] ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,* ,12,* ] global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [^a ,RM ;flat ,* ,12,cdna2] ^
+ [RM ,v ;flat ,* ,12,* ] global_store%s0\t%A0, %1%O0%g0
+ [RM ,a ;flat ,* ,12,cdna2] ^
})
; 64bit move pattern
@@ -602,22 +627,29 @@ (define_insn_and_split "*mov<mode>_insn"
[(set (match_operand:DIDF 0 "nonimmediate_operand")
(match_operand:DIDF 1 "general_operand"))]
"GET_CODE(operands[1]) != SYMBOL_REF"
- {@ [cons: =0, 1; attrs: type, length]
- [SD ,SSA ;sop1 ,4 ] s_mov_b64\t%0, %1
- [SD ,C ;sop1 ,8 ] ^
- [SD ,DB ;mult ,* ] #
- [RS ,Sm ;smem ,12] s_store_dwordx2\t%1, %A0
- [Sm ,RS ;smem ,12] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [v ,v ;vmult,* ] #
- [v ,DB ;vmult,* ] #
- [Sg ,v ;vmult,* ] #
- [v ,Sv ;vmult,* ] #
- [v ,RF ;flat ,12] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
- [RF ,v ;flat ,12] flat_store_dwordx2\t%A0, %1%O0%g0
- [RLRG,v ;ds ,12] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RLRG;ds ,12] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
- [v ,RM ;flat ,12] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [RM ,v ;flat ,12] global_store_dwordx2\t%A0, %1%O0%g0
+ {@ [cons: =0, 1; attrs: type, length, gcn_version]
+ [SD ,SSA ;sop1 ,4 ,* ] s_mov_b64\t%0, %1
+ [SD ,C ;sop1 ,8 ,* ] ^
+ [SD ,DB ;mult ,* ,* ] #
+ [RS ,Sm ;smem ,12,* ] s_store_dwordx2\t%1, %A0
+ [Sm ,RS ;smem ,12,* ] s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [v ,v ;vmult,* ,* ] #
+ [v ,DB ;vmult,* ,* ] #
+ [Sg ,v ;vmult,* ,* ] #
+ [v ,Sv ;vmult,* ,* ] #
+ [v ,^a ;vmult,* ,* ] #
+ [a ,v ;vmult,* ,* ] #
+ [a ,a ;vmult,* ,cdna2] #
+ [v ,RF ;flat ,12,* ] flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [^a ,RF ;flat ,12,cdna2] ^
+ [RF ,v ;flat ,12,* ] flat_store_dwordx2\t%A0, %1%O0%g0
+ [RF ,a ;flat ,12,cdna2] ^
+ [RLRG,v ;ds ,12,* ] ds_write_b64\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RLRG;ds ,12,* ] ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,RM ;flat ,12,* ] global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [^a ,RM ;flat ,12,cdna2] ^
+ [RM ,v ;flat ,12,* ] global_store_dwordx2\t%A0, %1%O0%g0
+ [RM ,a ;flat ,12,cdna2] ^
}
"reload_completed
&& ((!MEM_P (operands[0]) && !MEM_P (operands[1])
@@ -655,19 +687,26 @@ (define_insn_and_split "*movti_insn"
[(set (match_operand:TI 0 "nonimmediate_operand")
(match_operand:TI 1 "general_operand" ))]
""
- {@ [cons: =0, 1; attrs: type, delayeduse, length]
- [SD,SSB;mult ,* ,* ] #
- [RS,Sm ;smem ,* ,12] s_store_dwordx4\t%1, %A0
- [Sm,RS ;smem ,yes,12] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
- [RF,v ;flat ,* ,12] flat_store_dwordx4\t%A0, %1%O0%g0
- [v ,RF ;flat ,* ,12] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
- [v ,v ;vmult,* ,* ] #
- [v ,Sv ;vmult,* ,* ] #
- [SD,v ;vmult,* ,* ] #
- [RM,v ;flat ,yes,12] global_store_dwordx4\t%A0, %1%O0%g0
- [v ,RM ;flat ,* ,12] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
- [RL,v ;ds ,* ,12] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
- [v ,RL ;ds ,* ,12] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ {@ [cons: =0, 1; attrs: type, delayeduse, length, gcn_version]
+ [SD,SSB;mult ,* ,* ,* ] #
+ [RS,Sm ;smem ,* ,12,* ] s_store_dwordx4\t%1, %A0
+ [Sm,RS ;smem ,yes,12,* ] s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
+ [RF,v ;flat ,* ,12,* ] flat_store_dwordx4\t%A0, %1%O0%g0
+ [RF,a ;flat ,* ,12,cdna2] ^
+ [v ,RF ;flat ,* ,12,* ] flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
+ [^a,RF ;flat ,* ,12,cdna2] ^
+ [v ,v ;vmult,* ,* ,* ] #
+ [v ,Sv ;vmult,* ,* ,* ] #
+ [SD,v ;vmult,* ,* ,* ] #
+ [RM,v ;flat ,yes,12,* ] global_store_dwordx4\t%A0, %1%O0%g0
+ [RM,a ;flat ,yes,12,cdna2] ^
+ [v ,RM ;flat ,* ,12,* ] global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
+ [^a,RM ;flat ,* ,12,cdna2] ^
+ [RL,v ;ds ,* ,12,* ] ds_write_b128\t%A0, %1%O0\;s_waitcnt\tlgkmcnt(0)
+ [v ,RL ;ds ,* ,12,* ] ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
+ [v ,^a ;vmult,* ,* ,* ] #
+ [a ,v ;vmult,* ,* ,* ] #
+ [a ,a ;vmult,* ,* ,cdna2] #
}
"reload_completed
&& REG_P (operands[0])
@@ -471,6 +471,26 @@ copy_early_debug_info (const char *infile, const char *outfile)
return true;
}
+/* CDNA2 devices have twice as many VGPRs as older devices, but the AVGPRs
+   are allocated from the same pool as the VGPRs.  */
+
+static int
+isa_has_combined_avgprs (int isa)
+{
+ switch (isa)
+ {
+ case EF_AMDGPU_MACH_AMDGCN_GFX803:
+ case EF_AMDGPU_MACH_AMDGCN_GFX900:
+ case EF_AMDGPU_MACH_AMDGCN_GFX906:
+ case EF_AMDGPU_MACH_AMDGCN_GFX908:
+ case EF_AMDGPU_MACH_AMDGCN_GFX1030:
+ return false;
+ case EF_AMDGPU_MACH_AMDGCN_GFX90a:
+ return true;
+ }
+ fatal_error (input_location, "unhandled ISA in isa_has_combined_avgprs");
+}
+
/* Parse an input assembler file, extract the offload tables etc.,
and output (1) the assembler code, minus the tables (which can contain
problematic relocations), and (2) a C file with the offload tables
@@ -496,6 +516,7 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
{
int sgpr_count;
int vgpr_count;
+ int avgpr_count;
char *kernel_name;
-    } regcount = { -1, -1, NULL };
+    } regcount = { -1, -1, -1, NULL };
@@ -543,6 +564,12 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
gcc_assert (regcount.kernel_name);
break;
}
+ else if (sscanf (buf, " .agpr_count: %d\n",
+			       &regcount.avgpr_count) == 1)
+ {
+ gcc_assert (regcount.kernel_name);
+ break;
+ }
break;
}
@@ -685,6 +712,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
{
sgpr_count = regcounts[j].sgpr_count;
vgpr_count = regcounts[j].vgpr_count;
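+		/* Where the AVGPRs are allocated from the VGPR file, they
+		   count towards the VGPR total.  */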
+ if (isa_has_combined_avgprs (elf_arch))
+ vgpr_count += regcounts[j].avgpr_count;
break;
}
@@ -70,6 +70,30 @@ (define_predicate "gcn_vgpr_register_operand"
return VGPR_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
})
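+;; Return true if OP is an AVGPR hard register, or a pseudo register (which
+;; may yet be allocated to an AVGPR).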
+(define_predicate "gcn_avgpr_register_operand"
+ (match_operand 0 "register_operand")
+ {
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return AVGPR_REGNO_P (REGNO (op)) || REGNO (op) >= FIRST_PSEUDO_REGISTER;
+})
+
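+;; As above, but for hard registers only; pseudos are rejected.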
+(define_predicate "gcn_avgpr_hard_register_operand"
+ (match_operand 0 "register_operand")
+ {
+ if (GET_CODE (op) == SUBREG)
+ op = SUBREG_REG (op);
+
+ if (!REG_P (op))
+ return false;
+
+ return AVGPR_REGNO_P (REGNO (op));
+})
+
(define_predicate "gcn_inline_immediate_operand"
(match_code "const_int,const_double,const_vector")
{
@@ -2010,6 +2010,9 @@ Any @code{symbol_ref} or @code{label_ref}
@item v
VGPR register
+@item a
+Accumulator VGPR register (CDNA1 onwards)
+
@item Sg
SGPR register
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx90a -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[068]" } } */
+/* { dg-final { scan-assembler {load[^\n]*a[0-9[]} } } */
+/* { dg-final { scan-assembler {store[^\n]*a[0-9[]} } } */
+
+#define TYPE double
+
+#include "avgpr-mem-int.c"
new file mode 100644
@@ -0,0 +1,116 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx90a -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[068]" } } */
+/* { dg-final { scan-assembler {load[^\n]*a[0-9[]} } } */
+/* { dg-final { scan-assembler {store[^\n]*a[0-9[]} } } */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+TYPE a[50];
+
+int f()
+{
+ __asm__ volatile ("; fake -> %0" :: "va"(a[0]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[1]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[2]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[3]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[4]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[5]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[6]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[7]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[8]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[9]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[10]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[11]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[12]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[13]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[14]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[15]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[16]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[17]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[18]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[19]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[20]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[21]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[22]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[23]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[24]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[25]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[26]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[27]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[28]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[29]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[30]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[31]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[32]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[33]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[34]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[35]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[36]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[37]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[38]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[39]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[40]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[41]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[42]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[43]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[44]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[45]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[46]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[47]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[48]));
+ __asm__ volatile ("; fake -> %0" :: "va"(a[49]));
+
+ __asm__ volatile ("; fake <- %0" : "+va"(a[0]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[1]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[2]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[3]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[4]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[5]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[6]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[7]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[8]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[9]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[10]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[11]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[12]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[13]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[14]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[15]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[16]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[17]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[18]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[19]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[20]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[21]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[22]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[23]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[24]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[25]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[26]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[27]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[28]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[29]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[30]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[31]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[32]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[33]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[34]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[35]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[36]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[37]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[38]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[39]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[40]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[41]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[42]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[43]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[44]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[45]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[46]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[47]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[48]));
+ __asm__ volatile ("; fake <- %0" : "+va"(a[49]));
+}
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx90a -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[068]" } } */
+/* { dg-final { scan-assembler {load[^\n]*a[0-9[]} } } */
+/* { dg-final { scan-assembler {store[^\n]*a[0-9[]} } } */
+
+#define TYPE long
+
+#include "avgpr-mem-int.c"
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx90a -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[068]" } } */
+/* { dg-final { scan-assembler {load[^\n]*a[0-9[]} } } */
+/* { dg-final { scan-assembler {store[^\n]*a[0-9[]} } } */
+
+#define TYPE short
+
+#include "avgpr-mem-int.c"
new file mode 100644
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx908 -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[06]" } } */
+/* { dg-final { scan-assembler "accvgpr" } } */
+
+#define TYPE double
+
+#include "avgpr-spill-int.c"
new file mode 100644
@@ -0,0 +1,115 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx908 -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[06]" } } */
+/* { dg-final { scan-assembler "accvgpr" } } */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+TYPE a[50];
+
+int f()
+{
+ __asm__ volatile ("; fake <- %0" : "=v"(a[0]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[1]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[2]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[3]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[4]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[5]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[6]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[7]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[8]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[9]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[10]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[11]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[12]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[13]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[14]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[15]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[16]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[17]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[18]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[19]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[20]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[21]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[22]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[23]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[24]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[25]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[26]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[27]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[28]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[29]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[30]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[31]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[32]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[33]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[34]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[35]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[36]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[37]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[38]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[39]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[40]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[41]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[42]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[43]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[44]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[45]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[46]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[47]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[48]));
+ __asm__ volatile ("; fake <- %0" : "=v"(a[49]));
+
+ __asm__ volatile ("; fake -> %0" :: "v"(a[0]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[1]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[2]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[3]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[4]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[5]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[6]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[7]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[8]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[9]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[10]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[11]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[12]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[13]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[14]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[15]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[16]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[17]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[18]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[19]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[20]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[21]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[22]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[23]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[24]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[25]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[26]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[27]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[28]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[29]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[30]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[31]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[32]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[33]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[34]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[35]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[36]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[37]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[38]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[39]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[40]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[41]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[42]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[43]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[44]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[45]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[46]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[47]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[48]));
+ __asm__ volatile ("; fake -> %0" :: "v"(a[49]));
+}
new file mode 100644
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx908 -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[06]" } } */
+/* { dg-final { scan-assembler "accvgpr" } } */
+
+#define TYPE long
+
+#include "avgpr-spill-int.c"
new file mode 100644
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=gfx908 -O1" } */
+/* { dg-skip-if "incompatible ISA" { *-*-* } { "-march=gfx90[06]" } } */
+/* { dg-final { scan-assembler "accvgpr" } } */
+
+#define TYPE short
+
+#include "avgpr-spill-int.c"
@@ -1702,6 +1702,25 @@ isa_code(const char *isa) {
return -1;
}
+/* CDNA2 devices have twice as many VGPRs compared to older devices. */
+
+static int
+max_isa_vgprs (int isa)
+{
+ switch (isa)
+ {
+ case EF_AMDGPU_MACH_AMDGCN_GFX803:
+ case EF_AMDGPU_MACH_AMDGCN_GFX900:
+ case EF_AMDGPU_MACH_AMDGCN_GFX906:
+ case EF_AMDGPU_MACH_AMDGCN_GFX908:
+ case EF_AMDGPU_MACH_AMDGCN_GFX1030:
+ return 256;
+ case EF_AMDGPU_MACH_AMDGCN_GFX90a:
+ return 512;
+ }
+ GOMP_PLUGIN_fatal ("unhandled ISA in max_isa_vgprs");
+}
+
/* }}} */
/* {{{ Run */
@@ -2143,6 +2162,7 @@ run_kernel (struct kernel_info *kernel, void *vars,
struct GOMP_kernel_launch_attributes *kla,
struct goacc_asyncqueue *aq, bool module_locked)
{
+ struct agent_info *agent = kernel->agent;
GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
kernel->description->vpgr_count);
@@ -2150,8 +2170,9 @@ run_kernel (struct kernel_info *kernel, void *vars,
VGPRs available to run the kernels together. */
if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
{
+ int max_vgprs = max_isa_vgprs (agent->device_isa);
int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
- int max_threads = (256 / granulated_vgprs) * 4;
+ int max_threads = (max_vgprs / granulated_vgprs) * 4;
if (kla->gdims[2] > max_threads)
{
GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
@@ -2188,7 +2209,6 @@ run_kernel (struct kernel_info *kernel, void *vars,
DEBUG_PRINT ("]\n");
DEBUG_FLUSH ();
- struct agent_info *agent = kernel->agent;
if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");