[ARM] MVE: Implementing auto-vectorized array * scalar instructions

Message ID yw8jedqpfw6c.fsf@arm.com
State Accepted
Headers
Series [ARM] MVE: Implementing auto-vectorized array * scalar instructions |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Victor Do Nascimento Feb. 16, 2023, 3:48 p.m. UTC
  Hi all,

The back-end pattern for mapping the auto-vectorized representation of
vector * scalar to the machine instruction VMUL was missing, and
multiple instructions were needed to reproduce this behavior as a
result of failed RTL pattern match in combine pass.

RTL patterns were introduced to reproduce the behavior of the
intrinsics vmulq_n_<mode> and vmulq_n_f<mode>.

In the case of literal constants, an intermediate instruction was
added to the initial RTL expansion to ensure a general-purpose register
was allocated to store the constant, which could then be extracted
from the constant vector.

For the function

void test_vmulimm_s32x4 (int32_t * __restrict__ dest, int32_t *a) 
{ 
  int i;
  for (i=0; i<4; i++) { 
    dest[i] = a[i] * 5; 
  }
}


The GIMPLE -> RTL expansion is modified to produce:
(set (reg:SI 119)
     (const_int 5 [0x5]))
(set (reg:V4SI 118)
     (mult:V4SI (vec_duplicate:V4SI (reg:SI 119))
                (reg:V4SI 117)))

instead of:
(set (reg:V4SI 119)
     (const_vector:V4SI [
        (const_int 5 [0x5]) repeated x4
      ]))
(set (reg:V4SI 118)
     (mult:V4SI (reg:V4SI 117)
                (reg:V4SI 119)))

The final assembly for the above function now emits the following insn:
vmul.i32 q3, q3, r3

as opposed to:
vmul.i32 q3, q3, q2

All tests in gcc.target/arm/simd/mve-vmul-scalar-1.c now pass.

Added new RTL templates, amended unit test and checked for regressions on arm-none-eabi.

Thanks,
Victor

gcc:
	* config/arm/arm.cc (neon_vdup_constant): Remove static
	keyword.
	* config/arm/arm-protos.h (neon_vdup_constant): Add
	prototype.
	* config/arm/mve.md (@mve_vmulq_n_<mode>2): New.
	* config/arm/predicates.md (reg_or_mve_replicated_const_operand):
	New.
	* config/arm/vec-common.md (mul<mode>3): Modify to use
	`reg_or_mve_replicated_const_operand'.

gcc/testsuite:
	* gcc.target/arm/simd/mve-vmul-scalar-1.c: Correct typo,
	remove xfails.
---
 gcc/config/arm/arm-protos.h                        |  1 +
 gcc/config/arm/arm.cc                              |  2 +-
 gcc/config/arm/mve.md                              | 11 +++++++++++
 gcc/config/arm/predicates.md                       |  8 ++++++++
 gcc/config/arm/vec-common.md                       | 14 ++++++++++++--
 .../gcc.target/arm/simd/mve-vmul-scalar-1.c        | 13 ++++++-------
 6 files changed, 39 insertions(+), 10 deletions(-)
  

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index aea472bfbb9..4cf9fb00e01 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -199,6 +199,7 @@  extern rtx arm_load_tp (rtx);
 extern bool arm_coproc_builtin_available (enum unspecv);
 extern bool arm_coproc_ldc_stc_legitimate_address (rtx);
 extern rtx arm_stack_protect_tls_canary_mem (bool);
+extern rtx neon_vdup_constant (rtx, bool);
 
 
 #if defined TREE_CODE
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index efc48349dd3..7d9d265b0a7 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -13301,7 +13301,7 @@  neon_pairwise_reduce (rtx op0, rtx op1, machine_mode mode,
    If this is the case, and GENERATE is set, we also generate
    instructions to do this and return an RTX to assign to the register.  */
 
-static rtx
+rtx
 neon_vdup_constant (rtx vals, bool generate)
 {
   machine_mode mode = GET_MODE (vals);
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 555ad1b66c8..806c24e33aa 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1376,6 +1376,17 @@ 
   [(set_attr "type" "mve_move")
 ])
 
+(define_insn "@mve_vmulq_n_<mode>2"
+  [
+   (set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w")
+	(mult:MVE_VLD_ST (vec_duplicate:MVE_VLD_ST (match_operand:<V_elem> 1 "s_register_operand" "r"))
+						   (match_operand:MVE_VLD_ST 2 "s_register_operand" "w")))
+  ]
+  "TARGET_HAVE_MVE"
+  "vmul.%#<V_if_elem>\t%q0, %q2, %r1"
+  [(set_attr "type" "mve_move")
+])
+
 ;;
 ;; [vmulq_u, vmulq_s])
 ;;
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 3139750c606..31eadfa2d3b 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -113,6 +113,14 @@ 
           && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL));
 })
 
+(define_predicate "reg_or_mve_replicated_const_operand"
+  (if_then_else (and (match_test "TARGET_HAVE_MVE")
+		     (match_code "const_vector")
+		     (match_test "const_vec_duplicate_p (op)"))
+		(match_operand 0 "immediate_operand")
+		(match_operand 0 "s_register_operand"))
+)
+
 (define_predicate "neon_inv_logic_op2"
   (ior (match_operand 0 "imm_for_neon_inv_logic_operand")
        (match_operand 0 "s_register_operand")))
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index f06df4db636..17b67c214b4 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -102,12 +102,22 @@ 
 (define_expand "mul<mode>3"
   [(set (match_operand:VDQWH 0 "s_register_operand")
 	(mult:VDQWH (match_operand:VDQWH 1 "s_register_operand")
-		    (match_operand:VDQWH 2 "s_register_operand")))]
+		    (match_operand:VDQWH 2 "reg_or_mve_replicated_const_operand")))]
   "ARM_HAVE_<MODE>_ARITH
    && (!TARGET_REALLY_IWMMXT
        || <MODE>mode == V4HImode
        || <MODE>mode == V2SImode)"
-)
+{
+  if ((GET_CODE (operands[2]) == CONST_VECTOR) && can_create_pseudo_p ()
+       && (VALID_MVE_SI_MODE (<MODE>mode) || VALID_MVE_SF_MODE (<MODE>mode)))
+  {
+    rtx tmp = gen_reg_rtx (<V_elem>mode);
+    emit_move_insn (tmp, neon_vdup_constant (operands[2], 0));
+    emit_insn (maybe_gen_mve_vmulq_n_2 (<MODE>mode, operands[0], tmp,
+					operands[1]));
+    DONE;
+  }
+})
 
 (define_expand "smin<mode>3"
   [(set (match_operand:VALLW 0 "s_register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
index 22be452e8d9..0736847a96d 100644
--- a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
+++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
@@ -24,9 +24,9 @@  FUNC_IMM(u, uint, 8, 16, *, vmulimm)
 
 /* For the moment we do not select the T2 vmul variant operating on a scalar
    final argument.  */
-/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
 
 void test_vmul_f32 (float * dest, float * a, float * b) {
   int i;
@@ -40,16 +40,15 @@  void test_vmulimm_f32 (float * dest, float * a) {
     dest[i] = a[i] * 5.0;
   }
 }
-/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
 
 void test_vmul_f16 (__fp16 * dest, __fp16 * a, __fp16 * b) {
   int i;
   for (i=0; i<8; i++) {
-    dest[i] = a[i] * b[i];
+    dest[i] = a[i] * b[1];
   }
 }
 
-/* Note that dest[i] = a[i] * 5.0f16 is not vectorized.  */
 void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) {
   int i;
   __fp16 b = 5.0f16;
@@ -57,4 +56,4 @@  void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) {
     dest[i] = a[i] * b;
   }
 }
-/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */