RISC-V: Optimize VLA SLP with duplicate VLA shuffle indices

Message ID 20231117044319.3912782-1-juzhe.zhong@rivai.ai

Commit Message

juzhe.zhong@rivai.ai Nov. 17, 2023, 4:43 a.m. UTC
  While evaluating dynamic LMUL, I noticed we can do better on VLA SLP with duplicated VLA shuffle indices.

Consider this following case:

void
foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 8] = b[i * 8 + 3] + 1;
      a[i * 8 + 1] = b[i * 8 + 6] + 1;
      a[i * 8 + 2] = b[i * 8 + 0] + 1;
      a[i * 8 + 3] = b[i * 8 + 2] + 1;
      a[i * 8 + 4] = b[i * 8 + 1] + 1;
      a[i * 8 + 5] = b[i * 8 + 7] + 1;
      a[i * 8 + 6] = b[i * 8 + 5] + 1;
      a[i * 8 + 7] = b[i * 8 + 4] + 1;
    }
}

We want to generate the following shuffle indices:

{ 3, 6, 0, 2, 1, 7, 5, 4, 11, 14, 8, 10, 9, 15, 13, 12, 19, 22, 16, 18, 17, 23, 21, 20, ... }
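
These indices are just vid plus a repeating per-group offset:
index[i] = i + delta[i % 8] with delta = { 3, 5, -2, -1, -3, 2, -1, -3 },
so the interesting work in the prologue is materializing that small
repeating offset vector.  A tiny C sketch of the decomposition (reference
only, not part of the patch; delta is just an illustrative name):

#include <stdio.h>

int
main (void)
{
  /* Shuffled position minus linear position within each group of 8.  */
  const int delta[8] = { 3, 5, -2, -1, -3, 2, -1, -3 };
  for (int i = 0; i < 24; i++)
    /* Prints 3, 6, 0, 2, 1, 7, 5, 4, 11, 14, 8, 10, 9, 15, 13, 12, ...  */
    printf ("%d, ", i + delta[i % 8]);
  printf ("...\n");
  return 0;
}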

Before this patch, we used a chain of vmseq + vmerge pairs to build such a vector in the prologue:

https://godbolt.org/z/r919WPabK

foo:
        ble     a2,zero,.L5
        vsetvli a5,zero,e16,m8,ta,ma
        vid.v   v16
        vand.vi v24,v16,7
        vmseq.vi        v0,v24,1
        vmseq.vi        v1,v24,2
        vmv.v.i v8,3
        vmerge.vim      v8,v8,5,v0
        vmv1r.v v0,v1
        vmseq.vi        v2,v24,3
        vmerge.vim      v8,v8,-2,v0
        vmv1r.v v0,v2
        vmseq.vi        v1,v24,4
        csrr    a4,vlenb
        vmerge.vim      v8,v8,-1,v0
        csrr    a6,vlenb
        vmv1r.v v0,v1
        vmseq.vi        v2,v24,5
        slli    a3,a4,2
        vmerge.vim      v8,v8,-3,v0
        slli    a6,a6,2
        vmv1r.v v0,v2
        vmseq.vi        v1,v24,6
        vmerge.vim      v8,v8,2,v0
        addi    a6,a6,-1
        vmv1r.v v0,v1
        slli    a2,a2,3
        slli    a4,a4,3
        neg     a7,a3
        vmseq.vi        v2,v24,7
        vmerge.vim      v8,v8,-1,v0
        vmv.v.x v24,a6
        vmv1r.v v0,v2
        vmerge.vim      v8,v8,-3,v0
        vadd.vv v16,v16,v8
        vand.vv v24,v16,v24
.L3:
        minu    a5,a2,a3
        vsetvli zero,a5,e16,m8,ta,ma
        mv      a6,a2
        vle16.v v16,0(a1)
        vrgather.vv     v8,v16,v24
        vadd.vi v8,v8,1
        vse16.v v8,0(a0)
        add     a1,a1,a4
        add     a0,a0,a4
        add     a2,a2,a7
        bgtu    a6,a3,.L3
.L5:
        ret
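
What the vmseq/vmerge chain above computes, lane by lane, is the repeating
offset pattern { 3, 5, -2, -1, -3, 2, -1, -3 } that is later added to vid.
A scalar C sketch of that chain (old_delta and lane are illustrative names,
not code from the patch):

/* Scalar equivalent of the vmv.v.i + 7x (vmseq.vi + vmerge.vim) sequence:
   start from a broadcast of 3 and conditionally overwrite one lane class
   at a time.  lane corresponds to vid & 7.  */
static int
old_delta (int lane)
{
  int v = 3;              /* vmv.v.i  v8,3             */
  if (lane == 1) v = 5;   /* vmseq.vi + vmerge.vim 5   */
  if (lane == 2) v = -2;  /* vmseq.vi + vmerge.vim -2  */
  if (lane == 3) v = -1;
  if (lane == 4) v = -3;
  if (lane == 5) v = 2;
  if (lane == 6) v = -1;
  if (lane == 7) v = -3;
  return v;
}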

After this patch:

foo:
	ble	a2,zero,.L5
	li	a6,-536875008
	slli	a6,a6,4
	addi	a6,a6,3
	csrr	a4,vlenb
	csrr	a5,vlenb
	li	t1,-536850432
	slli	a6,a6,16
	slli	a3,a4,1
	addi	a6,a6,-3
	slli	a5,a5,1
	slli	t1,t1,4
	vsetvli	t3,zero,e64,m4,ta,ma
	addi	a5,a5,-1
	vmv.v.x	v4,a6
	addi	t1,t1,3
	slli	a2,a2,3
	slli	a4,a4,2
	neg	a7,a3
	vslide1up.vx	v16,v4,t1
	vsetvli	a6,zero,e16,m4,ta,ma
	vid.v	v12
	vmv.v.x	v4,a5
	vand.vi	v20,v12,7
	vrgather.vv	v8,v16,v20
	vadd.vv	v12,v12,v8
	vand.vv	v12,v12,v4
.L3:
	minu	a5,a2,a3
	vsetvli	zero,a5,e16,m4,ta,ma
	mv	a6,a2
	vle16.v	v8,0(a1)
	vrgather.vv	v4,v8,v12
	vadd.vi	v4,v4,1
	vse16.v	v4,0(a0)
	add	a1,a1,a4
	add	a0,a0,a4
	add	a2,a2,a7
	bgtu	a6,a3,.L3
.L5:
	ret
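
The new prologue instead packs the same offset pattern (element width 16)
into two 64-bit scalars via rvv_builder::merge_pattern, materializes them
with vmv.v.x + vslide1up at e64, expands the result back to e16 across the
whole register group with the first vrgather.vv on (vid & 7), and then adds
it to vid.  A rough C sketch of the packing arithmetic (pack4 is just an
illustrative helper, not code from the patch); the two values appear to
match the constants synthesized into t1 and a6 above:

#include <stdint.h>

/* Pack four 16-bit offsets into one 64-bit scalar, lowest-indexed element
   in the least significant bits (little-endian element order).  */
static uint64_t
pack4 (int16_t e0, int16_t e1, int16_t e2, int16_t e3)
{
  return ((uint64_t) (uint16_t) e0)
	 | ((uint64_t) (uint16_t) e1 << 16)
	 | ((uint64_t) (uint16_t) e2 << 32)
	 | ((uint64_t) (uint16_t) e3 << 48);
}

/* pack4 (3, 5, -2, -1)  == 0xfffffffe00050003  (built into t1)
   pack4 (-3, 2, -1, -3) == 0xfffdffff0002fffd  (built into a6)  */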

Optimizations:
1. Saves 9 dynamic instructions in the prologue (the straight-line code
   before .L3 shrinks from 36 to 27 instructions).
2. Fewer vector instructions reduce pressure on the hardware vector pipelines.

gcc/ChangeLog:

	* config/riscv/riscv-v.cc (rvv_builder::merge_pattern): New function.
	(expand_vector_init_trailing_same_elem): Adapt function.
	(expand_const_vector): Ditto.
	(expand_vec_init): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Adapt test.
	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto.
	* gcc.dg/vect/costmodel/riscv/rvv/slp-1.c: New test.

---
 gcc/config/riscv/riscv-v.cc                   | 174 +++++++++++++-----
 .../costmodel/riscv/rvv/dynamic-lmul4-6.c     |   7 +-
 .../costmodel/riscv/rvv/dynamic-lmul4-8.c     |   7 +-
 .../gcc.dg/vect/costmodel/riscv/rvv/slp-1.c   |  40 ++++
 4 files changed, 174 insertions(+), 54 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
  

Patch

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 000d0c2c721..6a2009ffb05 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -420,6 +420,7 @@  public:
 
   bool single_step_npatterns_p () const;
   bool npatterns_all_equal_p () const;
+  void merge_pattern (rtx);
 
   machine_mode new_mode () const { return m_new_mode; }
   scalar_mode inner_mode () const { return m_inner_mode; }
@@ -679,6 +680,50 @@  rvv_builder::npatterns_all_equal_p () const
   return true;
 }
 
+/* Merge the pattern to reduce the number of elements to process.
+
+   E.g. v = { 0, 1, 2, 3 }, mode = V4SI.
+   Since we can use EEW = 64 RVV instructions,
+   we can transform it into:
+
+   v = { 1 << 32 | 0, 3 << 32 | 2 }, mode = V2DI.  */
+void
+rvv_builder::merge_pattern (rtx src)
+{
+  if (this->inner_bits_size () >= GET_MODE_BITSIZE (Pmode))
+    return;
+  unsigned int ele_num = GET_MODE_BITSIZE (Pmode) / this->inner_bits_size ();
+  poly_uint64 nunits = exact_div (this->full_nelts (), ele_num);
+  machine_mode mode;
+  if (!get_vector_mode (Pmode, nunits).exists (&mode))
+    return;
+  this->new_vector (mode, this->npatterns () / ele_num,
+		    this->nelts_per_pattern ());
+  rtx imm = gen_int_mode ((1ULL << this->inner_bits_size ()) - 1, Pmode);
+  for (unsigned int i = 0; i < this->nelts_per_pattern (); i++)
+    {
+      for (unsigned int j = 0; j < this->npatterns (); j++)
+	{
+	  rtx val = const0_rtx;
+	  for (unsigned int k = 0; k < ele_num; k++)
+	    {
+	      rtx e
+		= gen_lowpart (Pmode, CONST_VECTOR_ELT (src, k + j * ele_num));
+	      e = expand_simple_binop (Pmode, AND, e, imm, NULL_RTX, false,
+				       OPTAB_DIRECT);
+	      e = expand_simple_binop (
+		Pmode, ASHIFT, e,
+		gen_int_mode (this->inner_bits_size () * k, Pmode), NULL_RTX,
+		false, OPTAB_DIRECT);
+	      val = expand_simple_binop (Pmode, IOR, e, val, NULL_RTX, false,
+					 OPTAB_DIRECT);
+	    }
+	  this->quick_push (val);
+	}
+    }
+  this->finalize ();
+}
+
 static unsigned
 get_sew (machine_mode mode)
 {
@@ -1040,6 +1085,39 @@  expand_vec_series (rtx dest, rtx base, rtx step)
     emit_move_insn (dest, result);
 }
 
+/* Subroutine of expand_vec_init and expand_const_vector to handle case
+   when all trailing elements of builder are same.
+   This works as follows:
+   (a) Use expand_insn interface to broadcast last vector element in TARGET.
+   (b) Insert remaining elements in TARGET using insr.
+
+   ??? The heuristic used is to do above if number of same trailing elements
+   is greater than leading_ndups, loosely based on
+   heuristic from mostly_zeros_p.  May need fine-tuning.  */
+
+static void
+expand_vector_init_trailing_same_elem (rtx target,
+				       const rtx_vector_builder &builder,
+				       int start, int nelts_reqd)
+{
+  machine_mode mode = GET_MODE (target);
+
+  rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
+  for (int i = start - 1; i >= 0; i--)
+    {
+      unsigned int unspec
+	= FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+      insn_code icode = code_for_pred_slide (unspec, mode);
+      rtx tmp = gen_reg_rtx (mode);
+      rtx ops[] = {tmp, dup, builder.elt (i)};
+      emit_vlmax_insn (icode, BINARY_OP, ops);
+      /* slide1up needs the source and dest to be different REGs.  */
+      dup = tmp;
+    }
+
+  emit_move_insn (target, dup);
+}
+
 static void
 expand_const_vector (rtx target, rtx src)
 {
@@ -1139,6 +1217,43 @@  expand_const_vector (rtx target, rtx src)
 	  rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
 	  emit_move_insn (target, gen_lowpart (mode, dup));
 	}
+      /* We don't apply this approach for LMUL = 8 since vrgather.vv doesn't
+	 allow the destination to overlap with any source register and a VLA
+	 repeating vector is always built via an addition, so such a constant
+	 would consume 32 registers at LMUL = 8, causing high register pressure. */
+      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && npatterns > 2)
+	{
+	  /* We handle the case that we can't find a vector container to hold
+	     element bitsize = NPATTERNS * ele_bitsize.
+
+	       NPATTERNS = 8, element width = 16
+		 v = { 3, 5, -2, -1, -3, 2, -1, -3, ... }
+	       Since NPATTERNS * element width = 128, we can't find a container
+	       to hold it.
+
+	       In this case, we materialize one copy of the pattern with
+	       vslide1up and repeat it across the vector with a vrgather.  */
+
+	  /* Generate index = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
+	  rtx vid = gen_reg_rtx (builder.int_mode ());
+	  expand_vec_series (vid, const0_rtx, const1_rtx);
+	  rtx index = gen_reg_rtx (builder.int_mode ());
+	  rtx range = gen_int_mode (builder.npatterns () - 1,
+				    GET_MODE_INNER (builder.int_mode ()));
+	  rtx and_ops[] = {index, vid, range};
+	  emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
+			   BINARY_OP, and_ops);
+
+	  /* Create vector = { 3, 5, -2, -1, -3, 2, -1, -3, X, ... , X }.  */
+	  builder.merge_pattern (src);
+	  rtx init = gen_reg_rtx (builder.mode ());
+	  expand_vector_init_trailing_same_elem (init, builder,
+						 builder.npatterns () - 1,
+						 builder.npatterns ());
+
+	  /* Shuffle vector with index.  */
+	  emit_vlmax_gather_insn (target, gen_lowpart (mode, init), index);
+	}
       else
 	{
 	  /* We handle the case that we can't find a vector containter to hold
@@ -2268,47 +2383,6 @@  expand_vector_init_merge_combine_sequence (rtx target,
   emit_vlmax_insn (icode, MERGE_OP, merge_ops);
 }
 
-/* Subroutine of expand_vec_init to handle case
-   when all trailing elements of builder are same.
-   This works as follows:
-   (a) Use expand_insn interface to broadcast last vector element in TARGET.
-   (b) Insert remaining elements in TARGET using insr.
-
-   ??? The heuristic used is to do above if number of same trailing elements
-   is greater than leading_ndups, loosely based on
-   heuristic from mostly_zeros_p.  May need fine-tuning.  */
-
-static bool
-expand_vector_init_trailing_same_elem (rtx target,
-				       const rtx_vector_builder &builder,
-				       int nelts_reqd)
-{
-  int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
-  int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
-  machine_mode mode = GET_MODE (target);
-
-  if (trailing_ndups > leading_ndups)
-    {
-      rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
-      for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
-	{
-	  unsigned int unspec
-	    = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
-	  insn_code icode = code_for_pred_slide (unspec, mode);
-	  rtx tmp = gen_reg_rtx (mode);
-	  rtx ops[] = {tmp, dup, builder.elt (i)};
-	  emit_vlmax_insn (icode, BINARY_OP, ops);
-	  /* slide1up need source and dest to be different REG.  */
-	  dup = tmp;
-	}
-
-      emit_move_insn (target, dup);
-      return true;
-    }
-
-  return false;
-}
-
 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
 
 void
@@ -2378,11 +2452,19 @@  expand_vec_init (rtx target, rtx vals)
 
   /* Optimize trailing same elements sequence:
       v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
-  if (!expand_vector_init_trailing_same_elem (target, v, nelts))
-    /* Handle common situation by vslide1down. This function can handle any
-       situation of vec_init<mode>. Only the cases that are not optimized above
-       will fall through here.  */
-    expand_vector_init_insert_elems (target, v, nelts);
+  int leading_ndups = v.count_dups (0, nelts - 1, 1);
+  int trailing_ndups = v.count_dups (nelts - 1, -1, -1);
+  if (trailing_ndups > leading_ndups)
+    {
+      expand_vector_init_trailing_same_elem (target, v, nelts - trailing_ndups,
+					     nelts);
+      return;
+    }
+
+  /* Handle common situation by vslide1down. This function can handle any
+     situation of vec_init<mode>. Only the cases that are not optimized above
+     will fall through here.  */
+  expand_vector_init_insert_elems (target, v, nelts);
 }
 
 /* Get insn code for corresponding comparison.  */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
index c7b88635923..d3149052b13 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
@@ -4,7 +4,7 @@ 
 #include <stdint-gcc.h>
 
 void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
 {
   for (int i = 0; i < n; ++i)
     {
@@ -19,9 +19,8 @@  foo (uint8_t *restrict a, uint8_t *restrict b, int n)
     }
 }
 
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
index 4d9175e86f8..4c428f33a79 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
@@ -4,7 +4,7 @@ 
 #include <stdint-gcc.h>
 
 void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
 {
   for (int i = 0; i < n; ++i)
     {
@@ -28,9 +28,8 @@  foo (uint8_t *restrict a, uint8_t *restrict b, int n)
     }
 }
 
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
new file mode 100644
index 00000000000..23bf59b0cb3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fno-schedule-insns -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+/*
+** foo:
+**   ...
+**   vslide1up\.vx\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+
+**   ...
+**   vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   ...
+**   vle16\.v\s+v8,\s*0\([a-x0-9]+\)
+**   ...
+**   vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   ...
+*/
+void
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      a[i * 8] = b[i * 8 + 3] + 1;
+      a[i * 8 + 1] = b[i * 8 + 6] + 1;
+      a[i * 8 + 2] = b[i * 8 + 0] + 1;
+      a[i * 8 + 3] = b[i * 8 + 2] + 1;
+      a[i * 8 + 4] = b[i * 8 + 1] + 1;
+      a[i * 8 + 5] = b[i * 8 + 7] + 1;
+      a[i * 8 + 6] = b[i * 8 + 5] + 1;
+      a[i * 8 + 7] = b[i * 8 + 4] + 1;
+    }
+}
+
+/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv} 2 } } */
+/* { dg-final { scan-assembler-not {vmv1r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv2r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv4r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv8r\.v} } } */