@@ -4145,6 +4145,9 @@
v256 __builtin_vpair_f32_add (v256, v256);
VPAIR_F32_ADD vpair_add_v8sf3 {mma,pair}
+ float __builtin_vpair_f32_add_elements (v256);
+ VPAIR_F32_ADD_ELEMENTS vpair_reduc_plus_scale_v8sf {mma,pair}
+
v256 __builtin_vpair_f32_assemble (vf, vf);
VPAIR_F32_ASSEMBLE vpair_assemble_v8sf {mma,pair}
@@ -4180,6 +4183,9 @@
v256 __builtin_vpair_f64_add (v256, v256);
VPAIR_F64_ADD vpair_add_v4df3 {mma,pair}
+ double __builtin_vpair_f64_add_elements (v256);
+ VPAIR_F64_ADD_ELEMENTS vpair_reduc_plus_scale_v4df {mma,pair}
+
v256 __builtin_vpair_f64_assemble (vd, vd);
VPAIR_F64_ASSEMBLE vpair_assemble_v4df {mma,pair}
@@ -4375,6 +4381,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd);
v256 __builtin_vpair_i64_add (v256, v256);
VPAIR_I64_ADD vpair_add_v4di3 {mma,pair}
+ long long __builtin_vpair_i64_add_elements (v256);
+ VPAIR_I64_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit}
+
v256 __builtin_vpair_i64_and (v256, v256);
VPAIR_I64_AND vpair_and_v4di3 {mma,pair}
@@ -4408,6 +4417,9 @@ v256 __builtin_vpair_f64_assemble (vd, vd);
v256 __builtin_vpair_i64_xor (v256, v256);
VPAIR_I64_XOR vpair_xor_v4di3 {mma,pair}
+ unsigned long long __builtin_vpair_i64u_add_elements (v256);
+ VPAIR_I64U_ADD_ELEMENTS vpair_reduc_plus_scale_v4di {mma,pair,no32bit}
+
v256 __builtin_vpair_i64u_assemble (vull, vull);
VPAIR_I64U_ASSEMBLE vpair_assemble_v4di {mma,pair}
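For reference only (not part of the patch): each new built-in sums every
element across both 128-bit halves of a __vector_pair and returns a scalar.
A C model of the f32 variant, assuming GCC's vector extensions and
<altivec.h> (the function name is hypothetical, and the FP additions here
may round differently than the hardware reduction sequence):

    #include <altivec.h>

    /* Scalar model of __builtin_vpair_f32_add_elements: sum all 8
       float elements held in the two vectors of the pair.  */
    float
    f32_add_elements_model (vector float hi, vector float lo)
    {
      float sum = 0.0f;
      for (int i = 0; i < 4; i++)
        sum += hi[i] + lo[i];
      return sum;
    }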
@@ -35,6 +35,9 @@ (define_c_enum "unspec"
UNSPEC_VPAIR_V4DI
UNSPEC_VPAIR_ZERO
UNSPEC_VPAIR_SPLAT
+ UNSPEC_VPAIR_REDUCE_PLUS_F32
+ UNSPEC_VPAIR_REDUCE_PLUS_F64
+ UNSPEC_VPAIR_REDUCE_PLUS_I64
])
;; Iterator doing unary/binary arithmetic on vector pairs
@@ -577,6 +580,66 @@ (define_insn_and_split "*vpair_nfms_fpcontract_<vp_pmode>4"
}
[(set_attr "length" "8")])
+
+;; Add all elements in a pair of V4SF vectors.
+(define_insn_and_split "vpair_reduc_plus_scale_v8sf"
+ [(set (match_operand:SF 0 "vsx_register_operand" "=wa")
+ (unspec:SF [(match_operand:OO 1 "vsx_register_operand" "v")]
+ UNSPEC_VPAIR_REDUCE_PLUS_F32))
+ (clobber (match_scratch:V4SF 2 "=&v"))
+ (clobber (match_scratch:V4SF 3 "=&v"))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(pc)]
+{
+ rtx op0 = operands[0];
+ rtx op1 = operands[1];
+ rtx tmp1 = operands[2];
+ rtx tmp2 = operands[3];
+ unsigned r = reg_or_subregno (op1);
+ rtx op1_hi = gen_rtx_REG (V4SFmode, r);
+ rtx op1_lo = gen_rtx_REG (V4SFmode, r + 1);
+
+ emit_insn (gen_addv4sf3 (tmp1, op1_hi, op1_lo));
+ emit_insn (gen_altivec_vsldoi_v4sf (tmp2, tmp1, tmp1, GEN_INT (8)));
+ emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+ emit_insn (gen_altivec_vsldoi_v4sf (tmp1, tmp2, tmp2, GEN_INT (4)));
+ emit_insn (gen_addv4sf3 (tmp2, tmp1, tmp2));
+ emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp2));
+ DONE;
+}
+ [(set_attr "length" "24")])
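The sequence above is a log2 shift-and-add reduction: after the element-wise
add of the two halves, each vsldoi/xvaddsp round folds the partial sums
until every lane holds the total, and xscvspdp converts the result lane to
scalar floating-point format.  A rough C model of the same sequence using
<altivec.h> intrinsics (a sketch; the helper name is hypothetical):

    #include <altivec.h>

    float
    v8sf_reduc_model (vector float hi, vector float lo)
    {
      vector float t = vec_add (hi, lo);     /* xvaddsp */
      t = vec_add (t, vec_sld (t, t, 8));    /* vsldoi 8 + xvaddsp */
      t = vec_add (t, vec_sld (t, t, 4));    /* vsldoi 4 + xvaddsp */
      return t[0];                 /* every lane now holds the sum */
    }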
+
+;; Add all elements in a pair of V2DF vectors.
+(define_insn_and_split "vpair_reduc_plus_scale_v4df"
+ [(set (match_operand:DF 0 "vsx_register_operand" "=&wa")
+ (unspec:DF [(match_operand:OO 1 "vsx_register_operand" "wa")]
+ UNSPEC_VPAIR_REDUCE_PLUS_F64))
+ (clobber (match_scratch:DF 2 "=&wa"))
+ (clobber (match_scratch:V2DF 3 "=&wa"))]
+ "TARGET_MMA"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 3)
+ (plus:V2DF (match_dup 4)
+ (match_dup 5)))
+ (set (match_dup 2)
+ (vec_select:DF (match_dup 3)
+ (parallel [(match_dup 6)])))
+ (set (match_dup 0)
+ (plus:DF (match_dup 7)
+ (match_dup 2)))]
+{
+ unsigned reg1 = reg_or_subregno (operands[1]);
+ unsigned reg3 = reg_or_subregno (operands[3]);
+
+ operands[4] = gen_rtx_REG (V2DFmode, reg1);
+ operands[5] = gen_rtx_REG (V2DFmode, reg1 + 1);
+ operands[6] = GEN_INT (BYTES_BIG_ENDIAN ? 1 : 0);
+ operands[7] = gen_rtx_REG (DFmode, reg3);
+}
+ [(set_attr "length" "12")])
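The V2DF reduction is shorter: one vector add folds the two halves, then
the remaining doubleword is extracted and combined with a scalar add.  An
equivalent C sketch (hypothetical helper name, <altivec.h> and a VSX
target assumed):

    #include <altivec.h>

    double
    v4df_reduc_model (vector double hi, vector double lo)
    {
      vector double t = vec_add (hi, lo);   /* xvadddp */
      return t[0] + t[1];                   /* xxpermdi + fadd/xsadddp */
    }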
+
;; Vector pair integer negate support.
(define_insn_and_split "vpair_neg_<vp_pmode>2"
@@ -786,3 +849,33 @@ (define_insn_and_split "*vpair_nor_<vp_pmode>_2"
DONE;
}
[(set_attr "length" "8")])
+
+;; Add all elements in a pair of V2DI vectors.
+(define_insn_and_split "vpair_reduc_plus_scale_v4di"
+ [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
+ (unspec:DI [(match_operand:OO 1 "altivec_register_operand" "v")]
+ UNSPEC_VPAIR_REDUCE_PLUS_I64))
+ (clobber (match_scratch:V2DI 2 "=&v"))
+ (clobber (match_scratch:DI 3 "=&r"))]
+ "TARGET_MMA && TARGET_POWERPC64"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 2)
+ (plus:V2DI (match_dup 4)
+ (match_dup 5)))
+ (set (match_dup 3)
+ (vec_select:DI (match_dup 2)
+ (parallel [(const_int 0)])))
+ (set (match_dup 0)
+ (vec_select:DI (match_dup 2)
+ (parallel [(const_int 1)])))
+ (set (match_dup 0)
+ (plus:DI (match_dup 0)
+ (match_dup 3)))]
+{
+ unsigned reg1 = reg_or_subregno (operands[1]);
+
+ operands[4] = gen_rtx_REG (V2DImode, reg1);
+ operands[5] = gen_rtx_REG (V2DImode, reg1 + 1);
+}
+ [(set_attr "length" "16")])
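The V2DI case follows the same shape, but moves the two doublewords into
GPRs (mfvsrd/mfvsrld) and finishes with a scalar add, which is why the
pattern also requires TARGET_POWERPC64.  A C sketch of the semantics
(hypothetical helper name; GCC's vector extension supplies the
element-wise + and the subscripting on a VSX target):

    long long
    v4di_reduc_model (vector long long hi, vector long long lo)
    {
      vector long long t = hi + lo;   /* vaddudm */
      return t[0] + t[1];             /* mfvsrld + mfvsrd + add */
    }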
@@ -21399,6 +21399,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_f32_abs (__vector_pair);
__vector_pair __builtin_vpair_f32_add (__vector_pair, __vector_pair);
+float __builtin_vpair_f32_add_elements (__vector_pair);
__vector_pair __builtin_vpair_f32_assemble (vector float, vector float);
vector float __builtin_vpair_f32_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f32_fma (__vector_pair, __vector_pair, __vector_pair);
@@ -21416,6 +21417,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_f64_abs (__vector_pair);
__vector_pair __builtin_vpair_f64_add (__vector_pair, __vector_pair);
+double __builtin_vpair_f64_add_elements (__vector_pair);
__vector_pair __builtin_vpair_f64_assemble (vector double, vector double);
vector double __builtin_vpair_f64_extract_vector (__vector_pair, int);
__vector_pair __builtin_vpair_f64_fma (__vector_pair, __vector_pair, __vector_pair);
@@ -21432,6 +21434,7 @@ The following built-in functions operate on pairs of
@smallexample
__vector_pair __builtin_vpair_i64_add (__vector_pair, __vector_pair);
+long long __builtin_vpair_i64_add_elements (__vector_pair);
__vector_pair __builtin_vpair_i64_and (__vector_pair, __vector_pair);
__vector_pair __builtin_vpair_i64_assemble (vector long long,
vector long long);
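A minimal usage sketch combining the new reductions with the existing
assemble built-ins, assuming a target with MMA support as in the new
test below (the function name is illustrative):

    float
    sum_of_8_floats (vector float a, vector float b)
    {
      __vector_pair p = __builtin_vpair_f32_assemble (a, b);
      return __builtin_vpair_f32_add_elements (p);
    }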
new file mode 100644
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test the vector pair built-in functions that do a horizontal add of
+   all of the elements.  */
+
+float
+f32_add_elements (__vector_pair *p)
+{
+  /* 1 lxvp, 1 xvaddsp, 2 vsldoi, 2 xvaddsp, 1 xscvspdp. */
+ return __builtin_vpair_f32_add_elements (*p);
+}
+
+double
+f64_add_elements (__vector_pair *p)
+{
+  /* 1 lxvp, 1 xvadddp, 1 xxpermdi, 1 fadd/xsadddp. */
+ return __builtin_vpair_f64_add_elements (*p);
+}
+
+long long
+i64_add_elements (__vector_pair *p)
+{
+  /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. */
+ return __builtin_vpair_i64_add_elements (*p);
+}
+
+unsigned long long
+i64u_add_elements (__vector_pair *p)
+{
+  /* 1 lxvp, 1 vaddudm, 1 mfvsrld, 1 mfvsrd, 1 add. */
+ return __builtin_vpair_i64u_add_elements (*p);
+}
+
+/* { dg-final { scan-assembler-times {\mfadd\M|\mxsadddp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mlxvp\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmfvsrld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvaddudm\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsldoi\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxscvspdp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 1 } } */