[pushed] aarch64: Avoid paradoxical subregs in UXTL split [PR113485]
g:74e3e839ab2d36841320 handled the UXTL{,2}->ZIP[12] optimisation
in split1. The UXTL input is a 64-bit vector of N-bit elements
and the result is a 128-bit vector of 2N-bit elements. The
corresponding ZIP1 operates on 128-bit vectors of N-bit elements.
This meant that the ZIP1 input had to be a 128-bit paradoxical subreg
of the 64-bit UXTL input. In the PRs, it wasn't possible to generate
this subreg because the inputs were already subregs of an x[234]
structure of 64-bit vectors.
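As a reference point (this is a distillation of the pr113485.c test
below, not part of the patch), the kind of source that trips over this
is a vmovl on one lane of a structure load:

  #include <arm_neon.h>

  uint16x8_t
  widen_struct_lane (const uint8_t *p)
  {
    /* pixels.val[1] is already a 64-bit subreg of the x3 structure,
       so it cannot be wrapped in a further 128-bit paradoxical subreg.  */
    uint8x8x3_t pixels = vld3_u8 (p);
    return vmovl_u8 (pixels.val[1]);
  }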
I don't think the same thing can happen for UXTL2->ZIP2 because
the UXTL2 input is a 128-bit vector rather than a 64-bit vector.
It isn't really necessary for ZIP1 to take 128-bit inputs,
since the upper 64 bits are ignored. This patch therefore adds
a pattern for 64-bit → 128-bit ZIP1s.
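To illustrate the equivalence in intrinsics rather than RTL (this
sketch is not part of the patch, and assumes little-endian lane
order): interleaving the 64-bit input with zero and reinterpreting
the bytes gives the zero-extension, and the upper halves of the ZIP1
sources never matter:

  #include <arm_neon.h>

  uint16x8_t
  uxtl_via_zip1 (uint8x8_t x)
  {
    /* ZIP1 reads only the low 64 bits of each source, so passing x
       again as the upper half is a don't-care.  */
    uint8x16_t interleaved = vzip1q_u8 (vcombine_u8 (x, x), vdupq_n_u8 (0));
    return vreinterpretq_u16_u8 (interleaved);  /* == vmovl_u8 (x)  */
  }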
In principle, we should probably use this form for all ZIP1s.
But in practice, that creates an awkward special case, and
would be quite invasive for stage 4.
Tested on aarch64-linux-gnu & pushed.
Richard
gcc/
PR target/113485
* config/aarch64/aarch64-simd.md (aarch64_zip1<mode>_low): New
pattern.
(<optab><Vnarrowq><mode>2): Use it instead of generating a
paradoxical subreg for the input.
gcc/testsuite/
PR target/113485
* gcc.target/aarch64/pr113485.c: New test.
* gcc.target/aarch64/pr113573.c: Likewise.
---
gcc/config/aarch64/aarch64-simd.md | 17 +++++++--
gcc/testsuite/gcc.target/aarch64/pr113485.c | 25 +++++++++++++
gcc/testsuite/gcc.target/aarch64/pr113573.c | 40 +++++++++++++++++++++
3 files changed, 79 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/pr113485.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/pr113573.c
@@ -8505,6 +8505,18 @@ (define_insn "aarch64_<PERMUTE:perm_insn><mode><vczle><vczbe>"
[(set_attr "type" "neon_permute<q>")]
)
+;; ZIP1 ignores the contents of the upper halves of the registers,
+;; so we can describe 128-bit operations in terms of 64-bit inputs.
+(define_insn "aarch64_zip1<mode>_low"
+ [(set (match_operand:VQ 0 "register_operand" "=w")
+ (unspec:VQ [(match_operand:<VHALF> 1 "register_operand" "w")
+ (match_operand:<VHALF> 2 "register_operand" "w")]
+ UNSPEC_ZIP1))]
+ "TARGET_SIMD"
+ "zip1\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+ [(set_attr "type" "neon_permute_q")]
+)
+
;; This instruction's pattern is generated directly by
;; aarch64_expand_vec_perm_const, so any changes to the pattern would
;; need corresponding changes there. Note that the immediate (third)
@@ -9685,9 +9697,8 @@ (define_insn_and_split "<optab><Vnarrowq><mode>2"
not sufficient uses of the zero to make the split worthwhile. */
rtx res = simplify_gen_subreg (<VNARROWQ2>mode, operands[0],
<MODE>mode, 0);
- rtx zero = aarch64_gen_shareable_zero (<VNARROWQ2>mode);
- rtx op = lowpart_subreg (<VNARROWQ2>mode, operands[1], <VNARROWQ>mode);
- emit_insn (gen_aarch64_zip1<Vnarrowq2> (res, op, zero));
+ rtx zero = aarch64_gen_shareable_zero (<VNARROWQ>mode);
+ emit_insn (gen_aarch64_zip1<Vnarrowq2>_low (res, operands[1], zero));
DONE;
}
[(set_attr "type" "neon_shift_imm_long")]
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+void test()
+{
+ while (1)
+ {
+ static const uint16_t jsimd_rgb_ycc_neon_consts[] = {19595, 0, 0, 0, 0, 0, 0, 0};
+ uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+
+ uint8_t tmp_buf[0];
+ uint8x8x3_t input_pixels = vld3_u8(tmp_buf);
+ uint16x8_t r = vmovl_u8(input_pixels.val[1]);
+ uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+
+ uint32x4_t s = vdupq_n_u32(1);
+ uint16x4_t a = vrshrn_n_u32(s, 16);
+ uint16x4_t y = vrshrn_n_u32(y_l, 16);
+ uint16x8_t ay = vcombine_u16(a, y);
+
+ unsigned char ***out_buf;
+ vst1_u8(out_buf[1][0], vmovn_u16(ay));
+ }
+}
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-options "-O2" } */
+
+#pragma GCC aarch64 "arm_neon.h"
+typedef __Uint8x8_t uint8x8_t;
+typedef __Uint16x4_t uint16x4_t;
+typedef __Int16x8_t int16x8_t;
+typedef __Uint16x8_t uint16x8_t;
+int jsimd_extbgrx_ycc_convert_neon_image_width,
+ jsimd_extbgrx_ycc_convert_neon___trans_tmp_1;
+uint16x4_t jsimd_extbgrx_ycc_convert_neon___trans_tmp_2;
+uint16x8_t vcombine_u16();
+uint16x8_t vmovl_u8(uint8x8_t __a) {
+ return __builtin_aarch64_uxtlv8hi_uu(__a);
+}
+__inline int __attribute__((__gnu_inline__)) vmull_laneq_u16();
+uint8x8x4_t vld4_u8();
+void jsimd_extbgrx_ycc_convert_neon() {
+ int scaled_128_5 = jsimd_extbgrx_ycc_convert_neon___trans_tmp_1,
+ cols_remaining = jsimd_extbgrx_ycc_convert_neon_image_width;
+ for (;;)
+ if (cols_remaining) {
+ uint8x8x4_t input_pixels = vld4_u8();
+ uint16x8_t r = vmovl_u8(input_pixels.val[2]);
+ uint16x8_t g = vmovl_u8(input_pixels.val[1]);
+ uint16x8_t b = vmovl_u8(input_pixels.val[0]);
+ int y_l = vmull_laneq_u16(r);
+ uint16x8_t __a = g;
+ jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
+ (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
+ __a = b;
+ int cb_l = scaled_128_5;
+ int cb_h = scaled_128_5;
+ int cr_l = scaled_128_5;
+ int cr_h = scaled_128_5;
+ uint16x8_t y_u16 = vcombine_u16(y_l);
+ uint16x8_t cb_u16 = vcombine_u16(cb_l, cb_h);
+ uint16x8_t cr_u16 = vcombine_u16(cr_l, cr_h);
+ __a = y_u16 = cb_u16 = cr_u16;
+ }
+}