amdgcn: Add instruction patterns for vector operations on complex numbers
Commit Message
This patch introduces instruction patterns for complex number operations
in the GCN machine description. These patterns are cmul, cmul_conj,
vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls
(cmla_conj and cmls_conj were not found to be profitable to implement).
As a side effect of adding cmls, I also added fms patterns corresponding
to the existing fma patterns. Tested on CDNA2 GFX90a.
OK to commit?
gcc/ChangeLog:
* config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): Declare.
* config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
(@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
(cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
(cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
(fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
(fms<mode>4_negop2): New patterns.
* config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): New functions.
* config/gcn/gcn.md: Add entries to unspec enum.
gcc/testsuite/ChangeLog:
* gcc.target/gcn/complex.c: New test.
Comments
On 09/02/2023 20:13, Andrew Jenner wrote:
> This patch introduces instruction patterns for complex number operations
> in the GCN machine description. These patterns are cmul, cmul_conj,
> vec_addsub, vec_fmaddsub, vec_fmsubadd, cadd90, cadd270, cmla and cmls
> (cmla_conj and cmls_conj were not found to be profitable to implement).
> As a side effect of adding cmls, I also added fms patterns corresponding
> to the existing fma patterns. Tested on CDNA2 GFX90a.
>
> OK to commit?
>
>
> gcc/ChangeLog:
>
> * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
> (gcn_expand_dpp_distribute_even_insn)
> (gcn_expand_dpp_distribute_odd_insn): Declare.
> * config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
> (@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
> (cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
> (cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
> (fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
> (fms<mode>4_negop2): New patterns.
> * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
> (gcn_expand_dpp_distribute_even_insn)
> (gcn_expand_dpp_distribute_odd_insn): New functions.
> * config/gcn/gcn.md: Add entries to unspec enum.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/gcn/complex.c: New test.
+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;; (fma op1 op2 op3)
+;; (fma op1 op2 (neg op3))
+;; (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "vec_fmaddsub<mode>4"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ UNSPEC_FMADDSUB))]
This is a define_expand pattern that has a custom-code expansion with an
unconditional "DONE", so the actual RTL representation is irrelevant
here: it only needs to have the match_operand entries. The
UNSPEC_FMADDSUB is therefore dead (as in, it will never appear in the
IR). We can safely remove those, although I don't hate them for
readability purposes.
The UNSPEC_CMUL and UNSPEC_CMUL_CONJ are similarly "dead", but since you
use them for an iterator they're still useful in the machine description.
+(define_insn "fms<mode>4<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
Please ensure that the alternatives are vertically aligned in the same
style as the rest of the file.
+/* Generate DPP pairwise swap instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
....
+/* Generate DPP distribute even instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
....
+/* Generate DPP distribute odd instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
Please add a comment that isn't just the function name in words. Explain
what operation happens here and maybe show an example of what it produces.
+++ b/gcc/testsuite/gcc.target/gcn/complex.c
@@ -0,0 +1,640 @@
+// { dg-do run }
+// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" }
Does the -fopenmp-simd option do anything here? There are no "omp
declare simd" directives.
+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
Tests in gcc.target/gcn won't do anything with "omp target" directives.
I would expect the loop to vectorize without them at -O2 or above (or "-O1
-ftree-vectorize"), but you might find the output easier to read with
"__restrict" on the parameters, as that avoids emitting the runtime
alias check and the scalar fallback copy of the loop.
I'd also expect you to have to do something to avoid inlining.
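For example, something along these lines (an untested sketch, not part of
the patch):

void __attribute__((noinline))
cmulF (float *__restrict td, float *__restrict te, float *__restrict tf,
       int tas)
{
  typedef _Complex float complexT;
  complexT *d = (complexT *) td;
  complexT *e = (complexT *) te;
  complexT *f = (complexT *) tf;
  /* With __restrict there is no runtime alias check to version for, so
     the loop should vectorize directly; noinline keeps the vector code
     visible in the dump.  */
  for (int i = 0; i < tas / 2; i++)
    d[i] = e[i] * f[i];
}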
+  td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+  te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+  tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+  tg = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
There's no need to use libgomp to allocate memory on the device --
malloc works just fine -- and it doesn't need to be specifically aligned
unless you're wanting performance.
In general I'm confused by this testcase because it looks like it was
written for an offloading toolchain, but it's placed into the
bare-machine GCN testsuite.
Andrew
I have updated this patch to incorporate the feedback from Andrew
Stubbs. Tested on CDNA2 GFX90a.
gcc/ChangeLog:
* config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): Declare.
* config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
(@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
(cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
(cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
(fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
(fms<mode>4_negop2): New patterns.
* config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
(gcn_expand_dpp_distribute_even_insn)
(gcn_expand_dpp_distribute_odd_insn): New functions.
* config/gcn/gcn.md: Add entries to unspec enum.
gcc/testsuite/ChangeLog:
* gcc.target/gcn/complex.c: New test.
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 861044e77f0..d7862b21a2a 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno);
extern rtx get_exec (int64_t);
extern rtx get_exec (machine_mode mode);
extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+ int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+ int unspec);
extern void gcn_expand_epilogue ();
extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
rtx scale, bool unsigned_p, rtx exec);
diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 75e9a59600b..787d7709d0d 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -1224,6 +1224,45 @@
[(set_attr "type" "vop_dpp")
(set_attr "length" "16")])
+(define_insn "@dpp_swap_pairs<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_SWAP_PAIRS))]
+ ""
+ {
+ return gcn_expand_dpp_swap_pairs_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_SWAP_PAIRS);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_even_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+	  UNSPEC_MOV_DPP_DISTRIBUTE_ODD))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_odd_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
;; }}}
;; {{{ ALU special case: add/sub
@@ -2185,6 +2224,180 @@
DONE;
})
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ UNSPEC_CMUL_OP))]
+ ""
+ {
+ // operands[1] a b
+ // operands[2] c d
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2])); // a*c b*d
+
+ rtx s2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (s2_perm, operands[2])); // d c
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, operands[1], s2_perm)); // a*d b*c
+
+ rtx t1_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t1_perm, t1)); // b*d a*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_<cmul_subadd><mode>3_exec (dest, t1, t1_perm, dest, even));
+ // a*c-b*d 0
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*c a*d
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cmul_addsub><mode>3_exec (dest, t2, t2_perm, dest, odd));
+ // 0 a*d+b*c
+ DONE;
+ })
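For reference, a scalar C model of the lane arithmetic above (illustration
only, not part of the patch; the _conj variant swaps the masked sub/add):

static void
cmul_model (float a, float b, float c, float d, float *re, float *im)
{
  float t1_e = a * c, t1_o = b * d;   /* mul<mode>3                   */
  float t2_e = a * d, t2_o = b * c;   /* mul<mode>3, op2 pair-swapped */
  *re = t1_e - t1_o;                  /* sub under even EXEC mask     */
  *im = t2_e + t2_o;                  /* add under odd EXEC mask      */
}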
+
+(define_code_iterator addsub [plus minus])
+(define_code_attr addsub_as [(plus "a") (minus "s")])
+
+(define_expand "cml<addsub_as><mode>4"
+ [(set (match_operand:V_FP 0 "register_operand" "=&v")
+ (addsub:V_FP
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "register_operand" "v")
+ (match_operand:V_FP 2 "register_operand" "v")]
+ UNSPEC_CMUL)
+ (match_operand:V_FP 3 "register_operand" "v")))]
+ ""
+ {
+ rtx a = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_even<mode> (a, operands[1])); // a a
+
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_fm<addsub_as><mode>4 (t1, a, operands[2], operands[3]));
+ // a*c a*d
+
+ rtx b = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_odd<mode> (b, operands[1])); // b b
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, b, operands[2])); // b*c b*d
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*d b*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, t2_perm, dest, even));
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, t2_perm, dest, odd));
+
+ DONE;
+ })
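In scalar terms, for cmla (d = e*f + g, with e = a+bi, f = c+di and
g = x+yi) the steps above amount to (sketch only; cmls uses fms in the
first step):

static void
cmla_model (float a, float b, float c, float d, float x, float y,
            float *re, float *im)
{
  float t1_e = a * c + x;             /* fma, op1 distributed even */
  float t1_o = a * d + y;
  float t2_e = b * c, t2_o = b * d;   /* mul, op1 distributed odd  */
  *re = t1_e - t2_o;                  /* sub under even EXEC mask  */
  *im = t1_o + t2_e;                  /* add under odd EXEC mask   */
}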
+
+(define_expand "vec_addsub<mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (vec_merge:V_noHI
+ (minus:V_noHI
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v"))
+ (plus:V_noHI (match_dup 1) (match_dup 2))
+ (const_int 6148914691236517205)))]
+ ""
+ {
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = operands[2];
+ emit_insn (gen_sub<mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
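In scalar terms this expansion computes (illustrative sketch):

void
vec_addsub_model (const float *x, const float *y, float *dst, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = (i & 1) ? x[i] + y[i]    /* add under odd EXEC mask  */
                     : x[i] - y[i];   /* sub under even EXEC mask */
}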
+
+(define_int_iterator CADD [UNSPEC_CADD90 UNSPEC_CADD270])
+(define_int_attr rot [(UNSPEC_CADD90 "90") (UNSPEC_CADD270 "270")])
+(define_int_attr cadd_subadd [(UNSPEC_CADD90 "sub") (UNSPEC_CADD270 "add")])
+(define_int_attr cadd_addsub [(UNSPEC_CADD90 "add") (UNSPEC_CADD270 "sub")])
+
+(define_expand "cadd<rot><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ CADD))]
+ ""
+ {
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (y, operands[2]));
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ emit_insn (gen_<cadd_subadd><mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cadd_addsub><mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
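For cadd90 this adds op2 rotated by 90 degrees, i.e.
(a+bi) + i*(c+di) = (a-d) + (b+c)i; cadd270 swaps the masked add/sub.
As a scalar sketch:

static void
cadd90_model (float a, float b, float c, float d, float *re, float *im)
{
  /* dpp_swap_pairs turns (c, d) into (d, c); then even lanes subtract
     and odd lanes add.  */
  *re = a - d;
  *im = b + c;
}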
+
+(define_expand "vec_fmaddsub<mode>4"
+ [(match_operand:V_noHI 0 "register_operand" "=&v")
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_fmsubadd<mode>4"
+ [(match_operand:V_noHI 0 "register_operand" "=&v")
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
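Scalar model of the two expansions above (illustration only):
vec_fmaddsub subtracts op3 in the even lanes and adds it in the odd
lanes, and vec_fmsubadd does the opposite:

void
fmaddsub_model (const float *e, const float *f, const float *g,
                float *d, int n)
{
  for (int i = 0; i < n; i++)
    d[i] = (i & 1) ? e[i] * f[i] + g[i]    /* odd lanes: add       */
                   : e[i] * f[i] - g[i];   /* even lanes: subtract */
}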
+
;; }}}
;; {{{ ALU generic case
@@ -2861,6 +3074,56 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
+(define_insn "fms<mode>4<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4"
+ [(set (match_operand:FP 0 "register_operand" "= v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2"
+ [(set (match_operand:FP 0 "register_operand" "= v, v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:FP
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
;; }}}
;; {{{ FP division
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 3d71c2f6526..aca17a19d84 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5013,6 +5013,79 @@ gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
return !is_packed;
}
+/* Generate DPP pairwise swap instruction.
+ This instruction swaps the values in each even lane with the value in the
+ next one:
+ a, b, c, d -> b, a, d, c.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,0,3,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute even instruction.
+ This instruction copies the value in each even lane to the next one:
+ a, b, c, d -> a, a, c, c.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[0,0,2,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute odd instruction.
+ This isntruction copies the value in each odd lane to the previous one:
+ a, b, c, d -> b, b, d, d.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,1,3,3]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
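For reference, a host-side C model of the three quad_perm selections used
above (illustration only, not GCN code): within each aligned group of four
lanes, lane i reads lane sel[i & 3] of its own group.

static void
quad_perm_model (const int sel[4], const float *src, float *dst, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = src[(i & ~3) + sel[i & 3]];
}

/* swap_pairs      {1,0,3,2}: a,b,c,d -> b,a,d,c
   distribute_even {0,0,2,2}: a,b,c,d -> a,a,c,c
   distribute_odd  {1,1,3,3}: a,b,c,d -> b,b,d,d  */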
+
/* Generate DPP instructions used for vector reductions.
The opcode is given by INSN.
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 10d2b874cce..c90303c54b5 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -78,6 +78,13 @@
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
UNSPEC_MOV_DPP_SHR
+ UNSPEC_MOV_DPP_SWAP_PAIRS
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD
+ UNSPEC_CMUL UNSPEC_CMUL_CONJ
+ UNSPEC_CMUL_ADD UNSPEC_CMUL_SUB
+ UNSPEC_CADD90
+ UNSPEC_CADD270
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP
diff --git a/gcc/testsuite/gcc.target/gcn/complex.c b/gcc/testsuite/gcc.target/gcn/complex.c
new file mode 100755
index 00000000000..2304b986cf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/gcn/complex.c
@@ -0,0 +1,627 @@
+// { dg-do run }
+// { dg-options "-std=c99 -O3" }
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+#define COUNT 1000
+#define MAX 1000
+#define ALIGNMENT (2*1024*1024) // 2MB
+
+_Complex double conj(_Complex double);
+_Complex float conjf(_Complex float);
+
+unsigned int device = 0;
+
+// cmul
+
+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmulFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+void cmulD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmulDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmul_conj
+
+void cmul_conjF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmul_conjFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+void cmul_conjD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmul_conjDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+
+// addsub
+
+void addsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conjf(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool addsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+void addsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conj(f[i]);
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool addsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+
+// fmaddsub
+
+void fmaddsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmaddsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+void fmaddsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmaddsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+
+// fmsubadd
+
+void fmsubaddF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmsubaddFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+void fmsubaddD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool fmsubaddDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+
+// cadd90
+
+void cadd90F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd90Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+void cadd90D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd90Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+// cadd270
+
+void cadd270F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd270Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+void cadd270D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cadd270Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmla
+
+void cmlaF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlaFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+void cmlaD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlaDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+
+// cmls
+
+void cmlsF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlsFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+void cmlsD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+__attribute__((optimize("no-tree-vectorize")))
+bool cmlsDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+
+typedef void(*runF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef void(*runD)(double *td, double *te, double *tf, double *tg, int tas);
+typedef bool(*checkF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef bool(*checkD)(double *td, double *te, double *tf, double *tg, int tas);
+
+typedef struct
+{
+ runF rF;
+ runD rD;
+ checkF cF;
+ checkD cD;
+} operation;
+
+operation ops[] = {
+ {cmulF, cmulD, cmulFcheck, cmulDcheck},
+ {cmul_conjF, cmul_conjD, cmul_conjFcheck, cmul_conjDcheck},
+ {addsubF, addsubD, addsubFcheck, addsubDcheck},
+ {fmaddsubF, fmaddsubD, fmaddsubFcheck, fmaddsubDcheck},
+ {fmsubaddF, fmsubaddD, fmsubaddFcheck, fmsubaddDcheck},
+ {cadd90F, cadd90D, cadd90Fcheck, cadd90Dcheck},
+ {cadd270F, cadd270D, cadd270Fcheck, cadd270Dcheck},
+ {cmlaF, cmlaD, cmlaFcheck, cmlaDcheck},
+ {cmlsF, cmlsD, cmlsFcheck, cmlsDcheck}
+};
+
+void testF(operation* op)
+{
+ float* td;
+ float* te;
+ float* tf;
+ float* tg;
+ int array_size = COUNT;
+ td = (float*)malloc(sizeof(float)*array_size);
+ te = (float*)malloc(sizeof(float)*array_size);
+ tf = (float*)malloc(sizeof(float)*array_size);
+ tg = (float*)malloc(sizeof(float)*array_size);
+ float* dd = td;
+ float* ee = te;
+ float* ff = tf;
+ float* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (float)(rand() % MAX);
+ tf[i] = (float)(rand() % MAX);
+ tg[i] = (float)(rand() % MAX);
+ }
+ op->rF(td, te, tf, tg, COUNT);
+ if (!op->cF(td, te, tf, tg, COUNT))
+ abort();
+}
+
+void testD(operation* op)
+{
+ double* td;
+ double* te;
+ double* tf;
+ double* tg;
+ int array_size = COUNT;
+ td = (double*)malloc(sizeof(double)*array_size);
+ te = (double*)malloc(sizeof(double)*array_size);
+ tf = (double*)malloc(sizeof(double)*array_size);
+ tg = (double*)malloc(sizeof(double)*array_size);
+ double* dd = td;
+ double* ee = te;
+ double* ff = tf;
+ double* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (double)(rand() % MAX);
+ tf[i] = (double)(rand() % MAX);
+ tg[i] = (double)(rand() % MAX);
+ }
+ op->rD(td, te, tf, tg, COUNT);
+ if (!op->cD(td, te, tf, tg, COUNT))
+ abort();
+}
+
+int main()
+{
+ for (int i = 0; i < 9; ++i)
+ {
+ testF(&ops[i]);
+ testD(&ops[i]);
+ }
+}
+
On 21/03/2023 13:35, Andrew Jenner wrote:
> I have updated this patch to incorporate the feedback from Andrew
> Stubbs. Tested on CDNA2 GFX90a.
>
> gcc/ChangeLog:
>
> * config/gcn/gcn-protos.h (gcn_expand_dpp_swap_pairs_insn)
> (gcn_expand_dpp_distribute_even_insn)
> (gcn_expand_dpp_distribute_odd_insn): Declare.
> * config/gcn/gcn-valu.md (@dpp_swap_pairs<mode>)
> (@dpp_distribute_even<mode>, @dpp_distribute_odd<mode>)
> (cmul<conj_op><mode>3, cml<addsub_as><mode>4, vec_addsub<mode>3)
> (cadd<rot><mode>3, vec_fmaddsub<mode>4, vec_fmsubadd<mode>4)
> (fms<mode>4<exec>, fms<mode>4_negop2<exec>, fms<mode>4)
> (fms<mode>4_negop2): New patterns.
> * config/gcn/gcn.cc (gcn_expand_dpp_swap_pairs_insn)
> (gcn_expand_dpp_distribute_even_insn)
> (gcn_expand_dpp_distribute_odd_insn): New functions.
> * config/gcn/gcn.md: Add entries to unspec enum.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/gcn/complex.c: New test.
OK.
Andrew
@@ -27,6 +27,11 @@ extern unsigned int gcn_dwarf_register_number (unsigned int regno);
extern rtx get_exec (int64_t);
extern rtx get_exec (machine_mode mode);
extern char * gcn_expand_dpp_shr_insn (machine_mode, const char *, int, int);
+extern char * gcn_expand_dpp_swap_pairs_insn (machine_mode, const char *, int);
+extern char * gcn_expand_dpp_distribute_even_insn (machine_mode, const char *,
+ int unspec);
+extern char * gcn_expand_dpp_distribute_odd_insn (machine_mode, const char *,
+ int unspec);
extern void gcn_expand_epilogue ();
extern rtx gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets,
rtx scale, bool unsigned_p, rtx exec);
@@ -1224,6 +1224,45 @@
[(set_attr "type" "vop_dpp")
(set_attr "length" "16")])
+(define_insn "@dpp_swap_pairs<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_SWAP_PAIRS))]
+ ""
+ {
+ return gcn_expand_dpp_swap_pairs_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_SWAP_PAIRS);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_even<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_even_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
+(define_insn "@dpp_distribute_odd<mode>"
+ [(set (match_operand:V_noHI 0 "register_operand" "=v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" " v")]
+	  UNSPEC_MOV_DPP_DISTRIBUTE_ODD))]
+ ""
+ {
+ return gcn_expand_dpp_distribute_odd_insn (<MODE>mode, "v_mov_b32",
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD);
+ }
+ [(set_attr "type" "vop_dpp")
+ (set_attr "length" "16")])
+
;; }}}
;; {{{ ALU special case: add/sub
@@ -2185,6 +2224,194 @@
DONE;
})
+(define_int_iterator UNSPEC_CMUL_OP [UNSPEC_CMUL UNSPEC_CMUL_CONJ])
+(define_int_attr conj_op [(UNSPEC_CMUL "") (UNSPEC_CMUL_CONJ "_conj")])
+(define_int_attr cmul_subadd [(UNSPEC_CMUL "sub") (UNSPEC_CMUL_CONJ "add")])
+(define_int_attr cmul_addsub [(UNSPEC_CMUL "add") (UNSPEC_CMUL_CONJ "sub")])
+
+(define_expand "cmul<conj_op><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "= &v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ UNSPEC_CMUL_OP))]
+ ""
+ {
+ // operands[1] a b
+ // operands[2] c d
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2])); // a*c b*d
+
+ rtx s2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (s2_perm, operands[2])); // d c
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, operands[1], s2_perm)); // a*d b*c
+
+ rtx t1_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t1_perm, t1)); // b*d a*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_<cmul_subadd><mode>3_exec (dest, t1, t1_perm, dest, even));
+ // a*c-b*d 0
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*c a*d
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cmul_addsub><mode>3_exec (dest, t2, t2_perm, dest, odd));
+ // 0 a*d+b*c
+ DONE;
+ })
+
+(define_code_iterator addsub [plus minus])
+(define_code_attr addsub_as [(plus "a") (minus "s")])
+
+(define_expand "cml<addsub_as><mode>4"
+ [(set (match_operand:V_FP 0 "register_operand" "= &v")
+ (addsub:V_FP
+ (unspec:V_FP
+ [(match_operand:V_FP 1 "register_operand" "v")
+ (match_operand:V_FP 2 "register_operand" "v")]
+ UNSPEC_CMUL)
+ (match_operand:V_FP 3 "register_operand" "v")))]
+ ""
+ {
+ rtx a = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_even<mode> (a, operands[1])); // a a
+
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_fm<addsub_as><mode>4 (t1, a, operands[2], operands[3]));
+ // a*c a*d
+
+ rtx b = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_distribute_odd<mode> (b, operands[1])); // b b
+
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t2, b, operands[2])); // b*c b*d
+
+ rtx t2_perm = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (t2_perm, t2)); // b*d b*c
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, t2_perm, dest, even));
+
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, t2_perm, dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_addsub<mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "= &v")
+ (vec_merge:V_noHI
+ (minus:V_noHI
+ (match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v"))
+ (plus:V_noHI (match_dup 1) (match_dup 2))
+ (const_int 6148914691236517205)))]
+ ""
+ {
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = operands[2];
+ emit_insn (gen_sub<mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
+
+(define_int_iterator CADD [UNSPEC_CADD90 UNSPEC_CADD270])
+(define_int_attr rot [(UNSPEC_CADD90 "90") (UNSPEC_CADD270 "270")])
+(define_int_attr cadd_subadd [(UNSPEC_CADD90 "sub") (UNSPEC_CADD270 "add")])
+(define_int_attr cadd_addsub [(UNSPEC_CADD90 "add") (UNSPEC_CADD270 "sub")])
+
+(define_expand "cadd<rot><mode>3"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")]
+ CADD))]
+ ""
+ {
+ rtx dest = operands[0];
+ rtx x = operands[1];
+ rtx y = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_dpp_swap_pairs<mode> (y, operands[2]));
+
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ emit_insn (gen_<cadd_subadd><mode>3_exec (dest, x, y, dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_<cadd_addsub><mode>3_exec (dest, x, y, dest, odd));
+
+ DONE;
+ })
+
+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;; (fma op1 op2 op3)
+;; (fma op1 op2 (neg op3))
+;; (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "vec_fmaddsub<mode>4"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (match_operand:V_noHI 3 "register_operand" "v")]
+ UNSPEC_FMADDSUB))]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
+
+(define_expand "vec_fmsubadd<mode>4"
+ [(set (match_operand:V_noHI 0 "register_operand" "=&v")
+ (unspec:V_noHI
+ [(match_operand:V_noHI 1 "register_operand" "v")
+ (match_operand:V_noHI 2 "register_operand" "v")
+ (neg:V_noHI
+ (match_operand:V_noHI 3 "register_operand" "v"))]
+ UNSPEC_FMADDSUB))]
+ ""
+ {
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_mul<mode>3 (t1, operands[1], operands[2]));
+ rtx even = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (even, get_exec (0x5555555555555555UL));
+ rtx dest = operands[0];
+ emit_insn (gen_add<mode>3_exec (dest, t1, operands[3], dest, even));
+ rtx odd = gen_rtx_REG (DImode, EXEC_REG);
+ emit_move_insn (odd, get_exec (0xaaaaaaaaaaaaaaaaUL));
+    emit_insn (gen_sub<mode>3_exec (dest, t1, operands[3], dest, odd));
+
+ DONE;
+ })
+
;; }}}
;; {{{ ALU generic case
@@ -2768,6 +2995,56 @@
[(set_attr "type" "vop3a")
(set_attr "length" "8")])
+(define_insn "fms<mode>4<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2<exec>"
+ [(set (match_operand:V_FP 0 "register_operand" "= v, v, v")
+ (fma:V_FP
+ (match_operand:V_FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:V_FP
+ (match_operand:V_FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:V_FP
+ (match_operand:V_FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4"
+ [(set (match_operand:FP 0 "register_operand" "= v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" "% vA, vA")
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA")
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, %2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
+(define_insn "fms<mode>4_negop2"
+ [(set (match_operand:FP 0 "register_operand" "= v, v, v")
+ (fma:FP
+ (match_operand:FP 1 "gcn_alu_operand" " vA, vA,vSvA")
+ (neg:FP
+ (match_operand:FP 2 "gcn_alu_operand" " vA,vSvA, vA"))
+ (neg:FP
+ (match_operand:FP 3 "gcn_alu_operand" "vSvA, vA, vA"))))]
+ ""
+ "v_fma%i0\t%0, %1, -%2, -%3"
+ [(set_attr "type" "vop3a")
+ (set_attr "length" "8")])
+
;; }}}
;; {{{ FP division
@@ -5012,6 +5012,72 @@ gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed)
return !is_packed;
}
+/* Generate DPP pairwise swap instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_swap_pairs_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,0,3,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute even instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_even_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[0,0,2,2]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
+/* Generate DPP distribute odd instruction.
+ The opcode is given by INSN. */
+
+char *
+gcn_expand_dpp_distribute_odd_insn (machine_mode mode, const char *insn,
+ int ARG_UNUSED (unspec))
+{
+ static char buf[128];
+ const char *dpp;
+
+ /* Add the DPP modifiers. */
+ dpp = "quad_perm:[1,1,3,3]";
+
+ if (vgpr_2reg_mode_p (mode))
+ sprintf (buf, "%s\t%%L0, %%L1 %s\n\t%s\t%%H0, %%H1 %s",
+ insn, dpp, insn, dpp);
+ else
+ sprintf (buf, "%s\t%%0, %%1 %s", insn, dpp);
+
+ return buf;
+}
+
/* Generate DPP instructions used for vector reductions.
The opcode is given by INSN.
@@ -78,6 +78,14 @@
UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
UNSPEC_MOV_DPP_SHR
+ UNSPEC_MOV_DPP_SWAP_PAIRS
+ UNSPEC_MOV_DPP_DISTRIBUTE_EVEN
+ UNSPEC_MOV_DPP_DISTRIBUTE_ODD
+ UNSPEC_CMUL UNSPEC_CMUL_CONJ
+ UNSPEC_CMUL_ADD UNSPEC_CMUL_SUB
+ UNSPEC_FMADDSUB
+ UNSPEC_CADD90
+ UNSPEC_CADD270
UNSPEC_GATHER
UNSPEC_SCATTER
UNSPEC_RCP
new file mode 100755
@@ -0,0 +1,640 @@
+// { dg-do run }
+// { dg-options "-O -fopenmp-simd -ftree-loop-if-convert -fno-ssa-phiopt" }
+
+#include <stdlib.h>
+#include <omp.h>
+#include <stdbool.h>
+
+#define COUNT 1000
+#define MAX 1000
+#define ALIGNMENT (2*1024*1024) // 2MB
+
+_Complex double conj(_Complex double);
+_Complex float conjf(_Complex float);
+
+unsigned int device = 0;
+
+// cmul
+
+void cmulF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+bool cmulFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+void cmulD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i];
+ }
+}
+
+bool cmulDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c-b*d || td[i*2+1] != a*d+b*c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmul_conj
+
+void cmul_conjF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+bool cmul_conjFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+void cmul_conjD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * conj(f[i]);
+ }
+}
+
+bool cmul_conjDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a*c+b*d || td[i*2+1] != b*c-a*d)
+ return false;
+ }
+ return true;
+}
+
+
+// addsub
+
+void addsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conjf(f[i]);
+ }
+}
+
+bool addsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+void addsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] - conj(f[i]);
+ }
+}
+
+bool addsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-c || td[i*2+1] != b+d)
+ return false;
+ }
+ return true;
+}
+
+
+// fmaddsub
+
+void fmaddsubF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+bool fmaddsubFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+void fmaddsubD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]-tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]+tg[i*2+1];
+ }
+}
+
+bool fmaddsubDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-e || td[i*2+1] != b*d+f)
+ return false;
+ }
+ return true;
+}
+
+
+// fmsubadd
+
+void fmsubaddF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+bool fmsubaddFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+void fmsubaddD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2]*tf[i*2]+tg[i*2];
+ td[i*2+1] = te[i*2+1]*tf[i*2+1]-tg[i*2+1];
+ }
+}
+
+bool fmsubaddDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c+e || td[i*2+1] != b*d-f)
+ return false;
+ }
+ return true;
+}
+
+
+// cadd90
+
+void cadd90F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+bool cadd90Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+void cadd90D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] - tf[i*2+1];
+ td[i*2+1] = te[i*2+1] + tf[i*2];
+ }
+}
+
+bool cadd90Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a-d || td[i*2+1] != b+c)
+ return false;
+ }
+ return true;
+}
+
+// cadd270
+
+void cadd270F(float *td, float *te, float *tf, float *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+bool cadd270Fcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+void cadd270D(double *td, double *te, double *tf, double *tg, int tas)
+{
+ int array_size = tas/2;
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ td[i*2] = te[i*2] + tf[i*2+1];
+ td[i*2+1] = te[i*2+1] - tf[i*2];
+ }
+}
+
+bool cadd270Dcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ if (td[i*2] != a+d || td[i*2+1] != b-c)
+ return false;
+ }
+ return true;
+}
+
+
+// cmla
+
+void cmlaF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+bool cmlaFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+void cmlaD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] + g[i];
+ }
+}
+
+bool cmlaDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d+e || td[i*2+1] != a*d+b*c+f)
+ return false;
+ }
+ return true;
+}
+
+
+// cmls
+
+void cmlsF(float *td, float *te, float *tf, float *tg, int tas)
+{
+ typedef _Complex float complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+bool cmlsFcheck(float *td, float *te, float *tf, float *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ float a = te[i*2];
+ float b = te[i*2+1];
+ float c = tf[i*2];
+ float d = tf[i*2+1];
+ float e = tg[i*2];
+ float f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+void cmlsD(double *td, double *te, double *tf, double *tg, int tas)
+{
+ typedef _Complex double complexT;
+ int array_size = tas/2;
+ complexT *d = (complexT*)(td);
+ complexT *e = (complexT*)(te);
+ complexT *f = (complexT*)(tf);
+ complexT *g = (complexT*)(tg);
+#pragma omp target teams distribute parallel for simd
+ for (int i = 0; i < array_size; i++)
+ {
+ d[i] = e[i] * f[i] - g[i];
+ }
+}
+
+bool cmlsDcheck(double *td, double *te, double *tf, double *tg, int tas)
+{
+ for (int i = 0; i < tas/2; ++i)
+ {
+ double a = te[i*2];
+ double b = te[i*2+1];
+ double c = tf[i*2];
+ double d = tf[i*2+1];
+ double e = tg[i*2];
+ double f = tg[i*2+1];
+ if (td[i*2] != a*c-b*d-e || td[i*2+1] != a*d+b*c-f)
+ return false;
+ }
+ return true;
+}
+
+
+typedef void(*runF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef void(*runD)(double *td, double *te, double *tf, double *tg, int tas);
+typedef bool(*checkF)(float *td, float *te, float *tf, float *tg, int tas);
+typedef bool(*checkD)(double *td, double *te, double *tf, double *tg, int tas);
+
+typedef struct
+{
+ runF rF;
+ runD rD;
+ checkF cF;
+ checkD cD;
+} operation;
+
+operation ops[] = {
+ {cmulF, cmulD, cmulFcheck, cmulDcheck},
+ {cmul_conjF, cmul_conjD, cmul_conjFcheck, cmul_conjDcheck},
+ {addsubF, addsubD, addsubFcheck, addsubDcheck},
+ {fmaddsubF, fmaddsubD, fmaddsubFcheck, fmaddsubDcheck},
+ {fmsubaddF, fmsubaddD, fmsubaddFcheck, fmsubaddDcheck},
+ {cadd90F, cadd90D, cadd90Fcheck, cadd90Dcheck},
+ {cadd270F, cadd270D, cadd270Fcheck, cadd270Dcheck},
+ {cmlaF, cmlaD, cmlaFcheck, cmlaDcheck},
+ {cmlsF, cmlsD, cmlsFcheck, cmlsDcheck}
+};
+
+void testF(operation* op)
+{
+ float* td;
+ float* te;
+ float* tf;
+ float* tg;
+ int array_size = COUNT;
+ td = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+ te = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+ tf = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+ tg = (float*)omp_aligned_alloc(ALIGNMENT, sizeof(float)*array_size, omp_default_mem_alloc);
+ omp_set_default_device(device);
+ float* dd = td;
+ float* ee = te;
+ float* ff = tf;
+ float* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (float)(rand() % MAX);
+ tf[i] = (float)(rand() % MAX);
+ tg[i] = (float)(rand() % MAX);
+ }
+ // Set up data region on device
+#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size])
+ {}
+ op->rF(td, te, tf, tg, COUNT);
+#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size])
+ {}
+ if (!op->cF(td, te, tf, tg, COUNT))
+ abort();
+}
+
+void testD(operation* op)
+{
+ double* td;
+ double* te;
+ double* tf;
+ double* tg;
+ int array_size = COUNT;
+ td = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc);
+ te = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc);
+ tf = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc);
+ tg = (double*)omp_aligned_alloc(ALIGNMENT, sizeof(double)*array_size, omp_default_mem_alloc);
+ omp_set_default_device(device);
+ double* dd = td;
+ double* ee = te;
+ double* ff = tf;
+ double* gg = tg;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ te[i] = (double)(rand() % MAX);
+ tf[i] = (double)(rand() % MAX);
+ tg[i] = (double)(rand() % MAX);
+ }
+ // Set up data region on device
+#pragma omp target enter data map(to: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size])
+ {}
+ op->rD(td, te, tf, tg, COUNT);
+#pragma omp target exit data map(from: dd[0:array_size], ee[0:array_size], ff[0:array_size], gg[0:array_size])
+ {}
+ if (!op->cD(td, te, tf, tg, COUNT))
+ abort();
+}
+
+int main()
+{
+ for (int i = 0; i < 9; ++i)
+ {
+ testF(&ops[i]);
+ testD(&ops[i]);
+ }
+}
+