@@ -1958,7 +1958,7 @@ (define_insn "aarch64_simd_vec_unpack<su>_lo_<mode>"
[(set_attr "type" "neon_shift_imm_long")]
)
-(define_insn "aarch64_simd_vec_unpack<su>_hi_<mode>"
+(define_insn_and_split "aarch64_simd_vec_unpack<su>_hi_<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
(match_operand:VQW 1 "register_operand" "w")
@@ -1966,63 +1966,54 @@ (define_insn "aarch64_simd_vec_unpack<su>_hi_<mode>"
)))]
"TARGET_SIMD"
"<su>xtl2\t%0.<Vwtype>, %1.<Vtype>"
- [(set_attr "type" "neon_shift_imm_long")]
-)
-
-(define_expand "vec_unpacku_hi_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
- "TARGET_SIMD"
+ "&& <CODE> == ZERO_EXTEND
+ && can_create_pseudo_p ()
+ && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))"
+ [(const_int 0)]
{
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip2<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip2<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
+ /* On many cores, it is cheaper to implement UXTL2 using a ZIP2 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
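+  /* For example, on little-endian targets, "uxtl2 v0.8h, v1.16b" is
+     equivalent to "zip2 v0.16b, v1.16b, vZ.16b", where vZ holds the
+     shared zero: interleaving the top eight bytes of v1 with zero
+     bytes yields the zero-extended halfwords directly.  */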
+ rtx res = simplify_gen_subreg (<MODE>mode, operands[0], <VWIDE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (V4SImode);
+ emit_insn (gen_aarch64_zip2<mode> (res, operands[1], zero));
DONE;
}
+ [(set_attr "type" "neon_shift_imm_long")]
)
-(define_expand "vec_unpacks_hi_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
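+;; Matches the ZIP2-of-zero form produced by the split above so that,
+;; if sharing the zero turns out not to be worthwhile, the zero and
+;; ZIP2 can later be recombined into a single UXTL2.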
+(define_insn "*aarch64_zip2_uxtl2"
+ [(set (match_operand:VQW 0 "register_operand" "=w")
+ (unspec:VQW
+ [(match_operand 1 "aarch64_any_register_operand" "w")
+ (match_operand 2 "aarch64_simd_imm_zero")]
+ UNSPEC_ZIP2))]
"TARGET_SIMD"
- {
- rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
- emit_insn (gen_aarch64_simd_vec_unpacks_hi_<mode> (operands[0],
- operands[1], p));
- DONE;
- }
+ "uxtl2\t%0.<Vwtype>, %1.<Vtype>"
+ [(set_attr "type" "neon_shift_imm_long")]
)
-(define_expand "vec_unpacku_lo_<mode>"
+(define_expand "vec_unpack<su>_hi_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip1<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip1<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
+ rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
-(define_expand "vec_unpacks_lo_<mode>"
+(define_expand "vec_unpack<su>_lo_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
- emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
- operands[1], p));
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
@@ -4792,62 +4783,6 @@ (define_insn "aarch64_<ANY_EXTEND:su>subw2<mode>_internal"
[(set_attr "type" "neon_sub_widen")]
)
-(define_insn "aarch64_usubw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)))]
- "TARGET_SIMD"
- "usubw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
-(define_insn "aarch64_usubw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)))]
- "TARGET_SIMD"
- "usubw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(plus:<VWIDE>
@@ -8615,8 +8550,8 @@ (define_insn_and_split "aarch64_combinev16qi"
;; need corresponding changes there.
(define_insn "aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>"
[(set (match_operand:VALL_F16 0 "register_operand" "=w")
- (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
- (match_operand:VALL_F16 2 "register_operand" "w")]
+ (unspec:VALL_F16 [(match_operand:VALL_F16 1 "aarch64_any_register_operand" "w")
+ (match_operand:VALL_F16 2 "aarch64_any_register_operand" "w")]
PERMUTE))]
"TARGET_SIMD"
"<PERMUTE:perm_insn>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
@@ -9788,11 +9723,37 @@ (define_insn "aarch64_crypto_pmullv2di"
)
;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector.
-(define_insn "<optab><Vnarrowq><mode>2"
+(define_insn_and_split "<optab><Vnarrowq><mode>2"
[(set (match_operand:VQN 0 "register_operand" "=w")
(ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
"TARGET_SIMD"
"<su>xtl\t%0.<Vtype>, %1.<Vntype>"
+ "&& <CODE> == ZERO_EXTEND
+ && can_create_pseudo_p ()
+ && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))"
+ [(const_int 0)]
+ {
+ /* On many cores, it is cheaper to implement UXTL using a ZIP1 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
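+  /* For example, on little-endian targets, "uxtl v0.8h, v1.8b" is
+     equivalent to "zip1 v0.16b, v1.16b, vZ.16b", where vZ holds the
+     shared zero: interleaving the low eight bytes of v1 with zero
+     bytes yields the zero-extended halfwords directly.  */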
+ rtx res = simplify_gen_subreg (<VNARROWQ2>mode, operands[0],
+ <MODE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (V4SImode);
+ emit_insn (gen_aarch64_zip1<Vnarrowq2> (res, operands[1], zero));
+ DONE;
+ }
+ [(set_attr "type" "neon_shift_imm_long")]
+)
+
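+;; Matches the ZIP1-of-zero form produced by the split above so that,
+;; if sharing the zero turns out not to be worthwhile, the zero and
+;; ZIP1 can later be recombined into a single UXTL.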
+(define_insn "*aarch64_zip1_uxtl"
+ [(set (match_operand:VQW 0 "register_operand" "=w")
+ (unspec:VQW
+ [(match_operand 1 "aarch64_any_register_operand" "w")
+ (match_operand 2 "aarch64_simd_imm_zero")]
+ UNSPEC_ZIP1))]
+ "TARGET_SIMD"
+ "uxtl\t%0.<Vwtype>, %1.<Vhalftype>"
[(set_attr "type" "neon_shift_imm_long")]
)
@@ -15202,6 +15202,20 @@ cost_plus:
return false;
}
+ /* Recognize ZIPs of zero that can be implemented using UXTL{,2}.
+ On many cores, ZIPs have a higher throughput than UXTL,
+ and the zero feeding the ZIPs can be eliminated during rename.
+ We therefore prefer 1 MOVI + 2 ZIPs over 2 UXTLs, assuming all
+ five instructions have equal execution frequency.
+
+ This could be put behind a tuning property if other cores prefer
+ a different approach. */
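+  /* Concretely, the adjustment below makes a ZIP whose second input is
+     constant zero (the UXTL-equivalent form) cost 2 instructions.
+     Assuming a MOVI and a ZIP of a zero register cost 1 instruction
+     each, two UXTLs then cost 4 while a shared MOVI plus two ZIPs
+     cost 3; for a single use the two forms tie at 2 apiece.  */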
+ if (speed
+ && (XINT (x, 1) == UNSPEC_ZIP1 || XINT (x, 1) == UNSPEC_ZIP2)
+ && (mode == V16QImode || mode == V8HImode || mode == V4SImode)
+ && aarch64_const_zero_rtx_p (XVECEXP (x, 0, 1)))
+ *cost += COSTS_N_INSNS (1);
+
if (XINT (x, 1) == UNSPEC_RBIT)
{
if (speed)
@@ -22873,16 +22887,41 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
== SYMBOL_TINY_ABSOLUTE;
}
+/* Return a function-invariant register that contains VALUE. *CACHED_INSN
+ caches instructions that set up such registers, so that they can be
+ reused by future calls. */
+
+static rtx
+aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
+{
+ rtx_insn *insn = *cached_insn;
+ if (insn && INSN_P (insn) && !insn->deleted ())
+ {
+ rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == SET)
+ {
+ rtx dest = SET_DEST (pat);
+ if (REG_P (dest)
+ && !HARD_REGISTER_P (dest)
+ && rtx_equal_p (SET_SRC (pat), value))
+ return dest;
+ }
+ }
+ rtx reg = gen_reg_rtx (GET_MODE (value));
+ *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
+ function_beg_insn);
+ return reg;
+}
+
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
the constant creation. */
rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
- machine_mode zmode = V4SImode;
- rtx tmp = gen_reg_rtx (zmode);
- emit_move_insn (tmp, CONST0_RTX (zmode));
- return lowpart_subreg (mode, tmp, zmode);
+ rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
+ CONST0_RTX (V4SImode));
+ return lowpart_subreg (mode, reg, GET_MODE (reg));
}
/* Return a const_int vector of VAL. */
@@ -1056,6 +1056,12 @@ typedef struct GTY (()) machine_function
/* A set of all decls that have been passed to a vld1 intrinsic in the
current function. This is used to help guide the vector cost model. */
hash_set<tree> *vector_load_decls;
+
+ /* An instruction that was emitted at the start of the function to
+ set an Advanced SIMD pseudo register to zero. If the instruction
+ still exists and still fulfils its original purpose, the same register
+ can be reused by other code. */
+ rtx_insn *advsimd_zero_insn;
} machine_function;
#endif
#endif
@@ -1656,6 +1656,8 @@ (define_mode_attr Vnarrowq [(V8HI "v8qi") (V4SI "v4hi")
;; Narrowed quad-modes for VQN (Used for XTN2).
(define_mode_attr VNARROWQ2 [(V8HI "V16QI") (V4SI "V8HI")
(V2DI "V4SI")])
+(define_mode_attr Vnarrowq2 [(V8HI "v16qi") (V4SI "v8hi")
+ (V2DI "v4si")])
;; Narrowed modes of vector modes.
(define_mode_attr VNARROW [(VNx8HI "VNx16QI")
@@ -1042,7 +1042,9 @@ (define_predicate "aarch64_gather_scale_operand_d"
;; A special predicate that doesn't match a particular mode.
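+;; Subregs of registers are also accepted, so that (for example) a
+;; lowpart subreg of the shared V4SImode zero can be passed to permute
+;; patterns of a different mode.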
(define_special_predicate "aarch64_any_register_operand"
- (match_code "reg"))
+ (ior (match_code "reg")
+ (and (match_code "subreg")
+ (match_code "reg" "0"))))
(define_predicate "aarch64_sve_any_binary_operator"
(match_code "plus,minus,mult,div,udiv,smax,umax,smin,umin,and,ior,xor"))
@@ -1,5 +1,5 @@
/* { dg-do run { target le } } */
-/* { dg-additional-options "-Os -fno-tree-ter -save-temps -fdump-rtl-ree-all -free -std=c99 -w" } */
+/* { dg-additional-options "-Os -fno-tree-ter -save-temps -fdump-rtl-ree-all -free -std=c99 -w -fdisable-rtl-split1" } */
typedef unsigned char u8;
typedef unsigned char __attribute__((__vector_size__ (8))) v64u8;
@@ -1,5 +1,5 @@
/* { dg-do run { target le } } */
-/* { dg-additional-options "-O2 -save-temps -fdump-rtl-ree-all -free -std=c99 -w" } */
+/* { dg-additional-options "-O2 -save-temps -fdump-rtl-ree-all -free -std=c99 -w -fdisable-rtl-split1" } */
typedef unsigned char __attribute__((__vector_size__ (8))) v64u8;
typedef unsigned char __attribute__((__vector_size__ (16))) v128u8;
@@ -3,8 +3,6 @@
#include <arm_neon.h>
-#include <arm_neon.h>
-
#define FUNC(IT, OT, S) \
OT \
foo_##S (IT a) \
@@ -22,11 +20,11 @@ FUNC (int32x4_t, int64x2_t, s32)
/* { dg-final { scan-assembler-times {sxtl2\tv0\.2d, v0\.4s} 1} } */
FUNC (uint8x16_t, uint16x8_t, u8)
-/* { dg-final { scan-assembler-times {zip2\tv0\.16b, v0\.16b} 1} } */
+/* { dg-final { scan-assembler-times {uxtl2\tv0\.8h, v0\.16b} 1} } */
FUNC (uint16x8_t, uint32x4_t, u16)
-/* { dg-final { scan-assembler-times {zip2\tv0\.8h, v0\.8h} 1} } */
+/* { dg-final { scan-assembler-times {uxtl2\tv0\.4s, v0\.8h} 1} } */
FUNC (uint32x4_t, uint64x2_t, u32)
-/* { dg-final { scan-assembler-times {zip2\tv0\.4s, v0\.4s} 1} } */
+/* { dg-final { scan-assembler-times {uxtl2\tv0\.2d, v0\.4s} 1} } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
+/* { dg-do run } */
+
+#pragma GCC target "+nosve"
+
+void __attribute__((noipa))
+f (unsigned int *__restrict x, unsigned short *__restrict y, int n)
+{
+ for (int i = 0; i < n; ++i)
+ x[i] = y[i];
+}
+
+unsigned short y[] = { 1, 2, 3, 4, 5, 6, 7, 8, -1, -2, -3, -4, -5, -6, -7, -8 };
+volatile unsigned int x[16];
+
+int
+main (void)
+{
+ f ((unsigned int *) x, y, 16);
+ for (int i = 0; i < 8; ++i)
+ if (x[i] != i + 1 || x[i + 8] != 0xffff - i)
+ __builtin_abort ();
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,127 @@
+/* { dg-options "-Os -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+typedef __UINT8_TYPE__ v8qi __attribute__((vector_size(8)));
+typedef __UINT16_TYPE__ v4hi __attribute__((vector_size(8)));
+typedef __UINT32_TYPE__ v2si __attribute__((vector_size(8)));
+
+typedef __UINT16_TYPE__ v8hi __attribute__((vector_size(16)));
+typedef __UINT32_TYPE__ v4si __attribute__((vector_size(16)));
+typedef __UINT64_TYPE__ v2di __attribute__((vector_size(16)));
+
+/*
+** f1:
+** uxtl v0\.2d, v0\.2s
+** ret
+*/
+v2di f1 (v2si x) { return __builtin_convertvector (x, v2di); }
+
+/*
+** f2:
+** uxtl v0\.4s, v0\.4h
+** ret
+*/
+v4si f2 (v4hi x) { return __builtin_convertvector (x, v4si); }
+
+/*
+** f3:
+** uxtl v0\.8h, v0\.8b
+** ret
+*/
+v8hi f3 (v8qi x) { return __builtin_convertvector (x, v8hi); }
+
+/*
+** g1:
+** uxtl v[0-9]+\.2d, v[0-9]+\.2s
+** uxtl v[0-9]+\.2d, v[0-9]+\.2s
+** stp [^\n]+
+** ret
+*/
+void
+g1 (v2di *__restrict a, v2si b, v2si c)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+}
+
+/*
+** g2:
+** uxtl v[0-9]+\.4s, v[0-9]+\.4h
+** uxtl v[0-9]+\.4s, v[0-9]+\.4h
+** stp [^\n]+
+** ret
+*/
+void
+g2 (v4si *__restrict a, v4hi b, v4hi c)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+}
+
+/*
+** g3:
+** uxtl v[0-9]+\.8h, v[0-9]+\.8b
+** uxtl v[0-9]+\.8h, v[0-9]+\.8b
+** stp [^\n]+
+** ret
+*/
+void
+g3 (v8hi *__restrict a, v8qi b, v8qi c)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+}
+
+/*
+** h1:
+** uxtl v[0-9]+\.2d, v[0-9]+\.2s
+** ...
+** uxtl v[0-9]+\.2d, v[0-9]+\.2s
+** ...
+** uxtl v[0-9]+\.2d, v[0-9]+\.2s
+** ...
+** ret
+*/
+void
+h1 (v2di *__restrict a, v2si b, v2si c, v2si d)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+ a[2] = __builtin_convertvector (d, v2di);
+}
+
+/*
+** h2:
+** uxtl v[0-9]+\.4s, v[0-9]+\.4h
+** ...
+** uxtl v[0-9]+\.4s, v[0-9]+\.4h
+** ...
+** uxtl v[0-9]+\.4s, v[0-9]+\.4h
+** ...
+** ret
+*/
+void
+h2 (v4si *__restrict a, v4hi b, v4hi c, v4hi d)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+ a[2] = __builtin_convertvector (d, v4si);
+}
+
+/*
+** h3:
+** uxtl v[0-9]+\.8h, v[0-9]+\.8b
+** ...
+** uxtl v[0-9]+\.8h, v[0-9]+\.8b
+** ...
+** uxtl v[0-9]+\.8h, v[0-9]+\.8b
+** ...
+** ret
+*/
+void
+h3 (v8hi *__restrict a, v8qi b, v8qi c, v8qi d)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+ a[2] = __builtin_convertvector (d, v8hi);
+}
new file mode 100644
@@ -0,0 +1,130 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** f1:
+** uxtl2 v0\.2d, v0\.4s
+** ret
+*/
+uint64x2_t f1 (uint32x4_t x) { return vshll_high_n_u32 (x, 0); }
+
+/*
+** f2:
+** uxtl2 v0\.4s, v0\.8h
+** ret
+*/
+uint32x4_t f2 (uint16x8_t x) { return vshll_high_n_u16 (x, 0); }
+
+/*
+** f3:
+** uxtl2 v0\.8h, v0\.16b
+** ret
+*/
+uint16x8_t f3 (uint8x16_t x) { return vshll_high_n_u8 (x, 0); }
+
+/*
+** g1:
+** movi (v[0-9]+)\.4s, #?0
+** zip2 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** zip2 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** stp [^\n]+
+** ret
+*/
+void
+g1 (uint64x2_t *__restrict a, uint32x4_t b, uint32x4_t c)
+{
+ a[0] = vshll_high_n_u32 (b, 0);
+ a[1] = vshll_high_n_u32 (c, 0);
+}
+
+/*
+** g2:
+** movi (v[0-9]+)\.4s, #?0
+** zip2 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** zip2 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** stp [^\n]+
+** ret
+*/
+void
+g2 (uint32x4_t *__restrict a, uint16x8_t b, uint16x8_t c)
+{
+ a[0] = vshll_high_n_u16 (b, 0);
+ a[1] = vshll_high_n_u16 (c, 0);
+}
+
+/*
+** g3:
+** movi (v[0-9]+)\.4s, #?0
+** zip2 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** zip2 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** stp [^\n]+
+** ret
+*/
+void
+g3 (uint16x8_t *__restrict a, uint8x16_t b, uint8x16_t c)
+{
+ a[0] = vshll_high_n_u8 (b, 0);
+ a[1] = vshll_high_n_u8 (c, 0);
+}
+
+/*
+** h1:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip2 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip2 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip2 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** ret
+*/
+void
+h1 (uint64x2_t *__restrict a, uint32x4_t b, uint32x4_t c, uint32x4_t d)
+{
+ a[0] = vshll_high_n_u32 (b, 0);
+ a[1] = vshll_high_n_u32 (c, 0);
+ a[2] = vshll_high_n_u32 (d, 0);
+}
+
+/*
+** h2:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip2 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip2 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip2 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** ret
+*/
+void
+h2 (uint32x4_t *__restrict a, uint16x8_t b, uint16x8_t c, uint16x8_t d)
+{
+ a[0] = vshll_high_n_u16 (b, 0);
+ a[1] = vshll_high_n_u16 (c, 0);
+ a[2] = vshll_high_n_u16 (d, 0);
+}
+
+/*
+** h3:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip2 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip2 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip2 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** ret
+*/
+void
+h3 (uint16x8_t *__restrict a, uint8x16_t b, uint8x16_t c, uint8x16_t d)
+{
+ a[0] = vshll_high_n_u8 (b, 0);
+ a[1] = vshll_high_n_u8 (c, 0);
+ a[2] = vshll_high_n_u8 (d, 0);
+}
new file mode 100644
@@ -0,0 +1,26 @@
+/* { dg-options "-O2" } */
+
+#include <arm_neon.h>
+
+void foo ();
+
+void
+f (uint16x8_t *__restrict a, uint8x16_t *__restrict b)
+{
+ a[0] = vshll_high_n_u8 (b[0], 0);
+ a[1] = vshll_high_n_u8 (b[1], 0);
+ a[2] = vshll_high_n_u8 (b[2], 0);
+ a[3] = vshll_high_n_u8 (b[3], 0);
+ foo ();
+ a[4] = vshll_high_n_u8 (b[4], 0);
+ a[5] = vshll_high_n_u8 (b[5], 0);
+ a[6] = vshll_high_n_u8 (b[6], 0);
+ a[7] = vshll_high_n_u8 (b[7], 0);
+}
+
+/* The zero should be rematerialized after the call to foo. */
+/* { dg-final { scan-assembler-times {\tmovi\tv[0-9]+\.4s, #?0\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tldp\tq} 4 } } */
+/* { dg-final { scan-assembler-times {\tzip2\t} 8 } } */
+/* { dg-final { scan-assembler-times {\tstp\tq} 4 } } */
+/* { dg-final { scan-assembler-not {\t[bhsdqv](?:[89]|1[0-5])} } } */
new file mode 100644
@@ -0,0 +1,136 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+typedef __UINT8_TYPE__ v8qi __attribute__((vector_size(8)));
+typedef __UINT16_TYPE__ v4hi __attribute__((vector_size(8)));
+typedef __UINT32_TYPE__ v2si __attribute__((vector_size(8)));
+
+typedef __UINT16_TYPE__ v8hi __attribute__((vector_size(16)));
+typedef __UINT32_TYPE__ v4si __attribute__((vector_size(16)));
+typedef __UINT64_TYPE__ v2di __attribute__((vector_size(16)));
+
+/*
+** f1:
+** uxtl v0\.2d, v0\.2s
+** ret
+*/
+v2di f1 (v2si x) { return __builtin_convertvector (x, v2di); }
+
+/*
+** f2:
+** uxtl v0\.4s, v0\.4h
+** ret
+*/
+v4si f2 (v4hi x) { return __builtin_convertvector (x, v4si); }
+
+/*
+** f3:
+** uxtl v0\.8h, v0\.8b
+** ret
+*/
+v8hi f3 (v8qi x) { return __builtin_convertvector (x, v8hi); }
+
+/*
+** g1:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** stp [^\n]+
+** ret
+*/
+void
+g1 (v2di *__restrict a, v2si b, v2si c)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+}
+
+/*
+** g2:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** stp [^\n]+
+** ret
+*/
+void
+g2 (v4si *__restrict a, v4hi b, v4hi c)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+}
+
+/*
+** g3:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** stp [^\n]+
+** ret
+*/
+void
+g3 (v8hi *__restrict a, v8qi b, v8qi c)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+}
+
+/*
+** h1:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** ret
+*/
+void
+h1 (v2di *__restrict a, v2si b, v2si c, v2si d)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+ a[2] = __builtin_convertvector (d, v2di);
+}
+
+/*
+** h2:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** ret
+*/
+void
+h2 (v4si *__restrict a, v4hi b, v4hi c, v4hi d)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+ a[2] = __builtin_convertvector (d, v4si);
+}
+
+/*
+** h3:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** ret
+*/
+void
+h3 (v8hi *__restrict a, v8qi b, v8qi c, v8qi d)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+ a[2] = __builtin_convertvector (d, v8hi);
+}
new file mode 100644
@@ -0,0 +1,136 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+typedef __UINT8_TYPE__ v8qi __attribute__((vector_size(8)));
+typedef __UINT16_TYPE__ v4hi __attribute__((vector_size(8)));
+typedef __UINT32_TYPE__ v2si __attribute__((vector_size(8)));
+
+typedef __UINT16_TYPE__ v8hi __attribute__((vector_size(16)));
+typedef __UINT32_TYPE__ v4si __attribute__((vector_size(16)));
+typedef __UINT64_TYPE__ v2di __attribute__((vector_size(16)));
+
+/*
+** f1:
+** uxtl v0\.2d, v0\.2s
+** ret
+*/
+v2di f1 (v2si x) { return __builtin_convertvector (x, v2di); }
+
+/*
+** f2:
+** uxtl v0\.4s, v0\.4h
+** ret
+*/
+v4si f2 (v4hi x) { return __builtin_convertvector (x, v4si); }
+
+/*
+** f3:
+** uxtl v0\.8h, v0\.8b
+** ret
+*/
+v8hi f3 (v8qi x) { return __builtin_convertvector (x, v8hi); }
+
+/*
+** g1:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** stp [^\n]+
+** ret
+*/
+void
+g1 (v2di *__restrict a, v2si b, v2si c)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+}
+
+/*
+** g2:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** stp [^\n]+
+** ret
+*/
+void
+g2 (v4si *__restrict a, v4hi b, v4hi c)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+}
+
+/*
+** g3:
+** movi (v[0-9]+)\.4s, #?0
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** stp [^\n]+
+** ret
+*/
+void
+g3 (v8hi *__restrict a, v8qi b, v8qi c)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+}
+
+/*
+** h1:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** zip1 v[0-9]+\.4s, v[0-9]+\.4s, \1\.4s
+** ...
+** ret
+*/
+void
+h1 (v2di *__restrict a, v2si b, v2si c, v2si d)
+{
+ a[0] = __builtin_convertvector (b, v2di);
+ a[1] = __builtin_convertvector (c, v2di);
+ a[2] = __builtin_convertvector (d, v2di);
+}
+
+/*
+** h2:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** zip1 v[0-9]+\.8h, v[0-9]+\.8h, \1\.8h
+** ...
+** ret
+*/
+void
+h2 (v4si *__restrict a, v4hi b, v4hi c, v4hi d)
+{
+ a[0] = __builtin_convertvector (b, v4si);
+ a[1] = __builtin_convertvector (c, v4si);
+ a[2] = __builtin_convertvector (d, v4si);
+}
+
+/*
+** h3:
+** movi (v[0-9]+)\.4s, #?0
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** zip1 v[0-9]+\.16b, v[0-9]+\.16b, \1\.16b
+** ...
+** ret
+*/
+void
+h3 (v8hi *__restrict a, v8qi b, v8qi c, v8qi d)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+ a[2] = __builtin_convertvector (d, v8hi);
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-options "-O2" } */
+/* { dg-do run } */
+
+#pragma GCC target "+nosve"
+
+typedef __UINT8_TYPE__ v8qi __attribute__((vector_size(8)));
+typedef __UINT16_TYPE__ v8hi __attribute__((vector_size(16)));
+
+void __attribute__((noipa))
+f (v8hi *__restrict a, v8qi b, v8qi c, v8qi d)
+{
+ a[0] = __builtin_convertvector (b, v8hi);
+ a[1] = __builtin_convertvector (c, v8hi);
+ a[2] = __builtin_convertvector (d, v8hi);
+}
+
+v8hi a[3];
+v8qi b = { 1, 2, 3, 4, 5, 6, 7, 8 };
+v8qi c = { -1, -2, -3, -4, -5, -6, -7, -8 };
+
+v8hi bconv = { 1, 2, 3, 4, 5, 6, 7, 8 };
+v8hi cconv = { 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8 };
+
+int
+main (void)
+{
+ f (a, b, c, b);
+ if (__builtin_memcmp (&a[0], &bconv, sizeof (bconv)) != 0
+ || __builtin_memcmp (&a[1], &cconv, sizeof (cconv)) != 0)
+ __builtin_abort ();
+ return 0;
+}
@@ -14,5 +14,5 @@ f (int16_t *x, int16_t *y, uint8_t *z, int n)
}
}
-/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.8h, v[0-9]+\.8b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b\n} 1 } } */
/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.8h,} 1 } } */
@@ -14,5 +14,5 @@ f (int64_t *x, int64_t *y, uint32_t *z, int n)
}
}
-/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.2d, v[0-9]+\.2s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.2d,} 1 } } */
@@ -14,5 +14,5 @@ f (int32_t *x, int32_t *y, uint16_t *z, int n)
}
}
-/* { dg-final { scan-assembler-times {\tuxtl\tv[0-9]+\.4s, v[0-9]+\.4h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tadd\tv[0-9]+\.4s,} 1 } } */