RISC-V: Optimize VLA SLP with duplicate VLA shuffle indices
When evaluating dynamic LMUL, we noticed we can do better on VLA SLP with duplicate VLA shuffle indices.
Consider the following case:
void
foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 6] + 1;
a[i * 8 + 2] = b[i * 8 + 0] + 1;
a[i * 8 + 3] = b[i * 8 + 2] + 1;
a[i * 8 + 4] = b[i * 8 + 1] + 1;
a[i * 8 + 5] = b[i * 8 + 7] + 1;
a[i * 8 + 6] = b[i * 8 + 5] + 1;
a[i * 8 + 7] = b[i * 8 + 4] + 1;
}
}
We want to generate the following indices:
{ 3, 6, 0, 2, 1, 7, 5, 4, 11, 14, 8, 10, 9, 15, 13, 12, 19, 22, 16, 18, 17, 23, 21, 20, ... }
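The indices above are just the vid series plus a repeating 8-element step pattern { 3, 5, -2, -1, -3, 2, -1, -3 } (i.e. index[i] - i for i = 0..7), wrapped to the number of vector elements. A minimal scalar sketch of that formula (function name is illustrative only):

#include <stdint.h>

/* Scalar model of the VLA shuffle indices: vid plus a repeating step
   pattern, masked to the (power-of-two) number of e16 elements per
   vector; the mask mirrors the final vand.vv in the generated prologues
   shown below.  */
void
build_shuffle_indices (uint16_t *idx, int nunits)
{
  static const int16_t step[8] = { 3, 5, -2, -1, -3, 2, -1, -3 };
  for (int i = 0; i < nunits; ++i)
    idx[i] = (uint16_t) (i + step[i & 7]) & (nunits - 1);
}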
Before this patch, we use pairs of vmseq + vmerge to create such a vector in the prologue:
https://godbolt.org/z/r919WPabK
foo:
ble a2,zero,.L5
vsetvli a5,zero,e16,m8,ta,ma
vid.v v16
vand.vi v24,v16,7
vmseq.vi v0,v24,1
vmseq.vi v1,v24,2
vmv.v.i v8,3
vmerge.vim v8,v8,5,v0
vmv1r.v v0,v1
vmseq.vi v2,v24,3
vmerge.vim v8,v8,-2,v0
vmv1r.v v0,v2
vmseq.vi v1,v24,4
csrr a4,vlenb
vmerge.vim v8,v8,-1,v0
csrr a6,vlenb
vmv1r.v v0,v1
vmseq.vi v2,v24,5
slli a3,a4,2
vmerge.vim v8,v8,-3,v0
slli a6,a6,2
vmv1r.v v0,v2
vmseq.vi v1,v24,6
vmerge.vim v8,v8,2,v0
addi a6,a6,-1
vmv1r.v v0,v1
slli a2,a2,3
slli a4,a4,3
neg a7,a3
vmseq.vi v2,v24,7
vmerge.vim v8,v8,-1,v0
vmv.v.x v24,a6
vmv1r.v v0,v2
vmerge.vim v8,v8,-3,v0
vadd.vv v16,v16,v8
vand.vv v24,v16,v24
.L3:
minu a5,a2,a3
vsetvli zero,a5,e16,m8,ta,ma
mv a6,a2
vle16.v v16,0(a1)
vrgather.vv v8,v16,v24
vadd.vi v8,v8,1
vse16.v v8,0(a0)
add a1,a1,a4
add a0,a0,a4
add a2,a2,a7
bgtu a6,a3,.L3
.L5:
ret
After this patch:
foo:
ble a2,zero,.L5
li a6,-536875008
slli a6,a6,4
addi a6,a6,3
csrr a4,vlenb
csrr a5,vlenb
li t1,-536850432
slli a6,a6,16
slli a3,a4,1
addi a6,a6,-3
slli a5,a5,1
slli t1,t1,4
vsetvli t3,zero,e64,m4,ta,ma
addi a5,a5,-1
vmv.v.x v4,a6
addi t1,t1,3
slli a2,a2,3
slli a4,a4,2
neg a7,a3
vslide1up.vx v16,v4,t1
vsetvli a6,zero,e16,m4,ta,ma
vid.v v12
vmv.v.x v4,a5
vand.vi v20,v12,7
vrgather.vv v8,v16,v20
vadd.vv v12,v12,v8
vand.vv v12,v12,v4
.L3:
minu a5,a2,a3
vsetvli zero,a5,e16,m4,ta,ma
mv a6,a2
vle16.v v8,0(a1)
vrgather.vv v4,v8,v12
vadd.vi v4,v4,1
vse16.v v4,0(a0)
add a1,a1,a4
add a0,a0,a4
add a2,a2,a7
bgtu a6,a3,.L3
.L5:
ret
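The two 64-bit constants synthesized into t1 and a6 above are the 16-bit step pattern { 3, 5, -2, -1, -3, 2, -1, -3 } folded four elements at a time into Pmode, which is what the new rvv_builder::merge_pattern does; the packed vector is then materialized with vmv.v.x + vslide1up and read back as e16 elements by the first vrgather. A scalar sketch of the packing (helper name is hypothetical):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Fold four consecutive 16-bit pattern elements into one 64-bit element
   (little-endian lane order), as rvv_builder::merge_pattern does when an
   EEW = 64 container is available.  */
static uint64_t
pack4_u16 (const uint16_t *e)
{
  uint64_t v = 0;
  for (int k = 0; k < 4; k++)
    v |= (uint64_t) e[k] << (16 * k);
  return v;
}

int
main (void)
{
  const uint16_t step[8] = { 3, 5, 0xfffe, 0xffff, 0xfffd, 2, 0xffff, 0xfffd };
  /* 0xfffffffe00050003: the value built in t1 (elements 0..3).  */
  printf ("%#" PRIx64 "\n", pack4_u16 (step));
  /* 0xfffdffff0002fffd: the value built in a6 (elements 4..7).  */
  printf ("%#" PRIx64 "\n", pack4_u16 (step + 4));
  return 0;
}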
Optimization:
1. Removes 9 dynamic instructions from the prologue.
2. Fewer vector instructions reduce hardware pipeline pressure.
gcc/ChangeLog:
* config/riscv/riscv-v.cc (rvv_builder::merge_pattern): New function.
(expand_vector_init_trailing_same_elem): Adapt function.
(expand_const_vector): Ditto.
(expand_vec_init): Ditto.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Adapt test.
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/slp-1.c: New test.
---
gcc/config/riscv/riscv-v.cc | 174 +++++++++++++-----
.../costmodel/riscv/rvv/dynamic-lmul4-6.c | 7 +-
.../costmodel/riscv/rvv/dynamic-lmul4-8.c | 7 +-
.../gcc.dg/vect/costmodel/riscv/rvv/slp-1.c | 40 ++++
4 files changed, 174 insertions(+), 54 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
@@ -420,6 +420,7 @@ public:
bool single_step_npatterns_p () const;
bool npatterns_all_equal_p () const;
+ void merge_pattern (rtx);
machine_mode new_mode () const { return m_new_mode; }
scalar_mode inner_mode () const { return m_inner_mode; }
@@ -679,6 +680,50 @@ rvv_builder::npatterns_all_equal_p () const
return true;
}
+/* Merge the pattern to reduce the number of elements we need to process.
+
+ E.g. v = { 0, 1, 2, 3 }, mode = V4SI.
+ Since we can use EEW = 64 RVV instructions, transform it into:
+ v = { 1 << 32, 3 << 32 | 2 }, mode = V2DI. */
+void
+rvv_builder::merge_pattern (rtx src)
+{
+ if (this->inner_bits_size () >= GET_MODE_BITSIZE (Pmode))
+ return;
+ unsigned int ele_num = GET_MODE_BITSIZE (Pmode) / this->inner_bits_size ();
+ poly_uint64 nunits = exact_div (this->full_nelts (), ele_num);
+ machine_mode mode;
+ if (!get_vector_mode (Pmode, nunits).exists (&mode))
+ return;
+ this->new_vector (mode, this->npatterns () / ele_num,
+ this->nelts_per_pattern ());
+ rtx imm = gen_int_mode ((1ULL << this->inner_bits_size ()) - 1, Pmode);
+ for (unsigned int i = 0; i < this->nelts_per_pattern (); i++)
+ {
+ for (unsigned int j = 0; j < this->npatterns (); j++)
+ {
+ rtx val = const0_rtx;
+ for (unsigned int k = 0; k < ele_num; k++)
+ {
+ rtx e
+ = gen_lowpart (Pmode, CONST_VECTOR_ELT (src, k + j * ele_num));
+ e = expand_simple_binop (Pmode, AND, e, imm, NULL_RTX, false,
+ OPTAB_DIRECT);
+ e = expand_simple_binop (
+ Pmode, ASHIFT, e,
+ gen_int_mode (this->inner_bits_size () * k, Pmode), NULL_RTX,
+ false, OPTAB_DIRECT);
+ val = expand_simple_binop (Pmode, IOR, e, val, NULL_RTX, false,
+ OPTAB_DIRECT);
+ }
+ this->quick_push (val);
+ }
+ }
+ this->finalize ();
+}
+
static unsigned
get_sew (machine_mode mode)
{
@@ -1040,6 +1085,39 @@ expand_vec_series (rtx dest, rtx base, rtx step)
emit_move_insn (dest, result);
}
+/* Subroutine of expand_vec_init and expand_const_vector to handle case
+ when all trailing elements of builder are same.
+ This works as follows:
+ (a) Use expand_insn interface to broadcast last vector element in TARGET.
+ (b) Insert remaining elements in TARGET using insr.
+
+ ??? The heuristic used is to do above if number of same trailing elements
+ is greater than leading_ndups, loosely based on
+ heuristic from mostly_zeros_p. May need fine-tuning. */
+
+static void
+expand_vector_init_trailing_same_elem (rtx target,
+ const rtx_vector_builder &builder,
+ int start, int nelts_reqd)
+{
+ machine_mode mode = GET_MODE (target);
+
+ rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
+ for (int i = start - 1; i >= 0; i--)
+ {
+ unsigned int unspec
+ = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+ insn_code icode = code_for_pred_slide (unspec, mode);
+ rtx tmp = gen_reg_rtx (mode);
+ rtx ops[] = {tmp, dup, builder.elt (i)};
+ emit_vlmax_insn (icode, BINARY_OP, ops);
+ /* slide1up needs the source and dest to be different REGs. */
+ dup = tmp;
+ }
+
+ emit_move_insn (target, dup);
+}
+
static void
expand_const_vector (rtx target, rtx src)
{
@@ -1139,6 +1217,43 @@ expand_const_vector (rtx target, rtx src)
rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
emit_move_insn (target, gen_lowpart (mode, dup));
}
+ /* We don't apply this approach for LMUL = 8 since vrgather.vv doesn't
+ allow the dest to overlap any source register and a VLA repeating vector
+ is always generated by an addition. So such a VLA constant vector would
+ consume 32 registers when LMUL = 8, which causes serious register
+ pressure. */
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && npatterns > 2)
+ {
+ /* We handle the case where we can't find a vector container to hold
+ element bitsize = NPATTERNS * ele_bitsize.
+
+ NPATTERNS = 8, element width = 16
+ v = { 3, 5, -2, -1, -3, 2, -1, -3, ... }
+ Since NPATTERNS * element width = 128, we can't find a container
+ to hold it.
+
+ In this case, we pack the pattern into Pmode elements, materialize it
+ with slide1up and then shuffle it into place with a vrgather. */
+
+ /* Generate index = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
+ rtx vid = gen_reg_rtx (builder.int_mode ());
+ expand_vec_series (vid, const0_rtx, const1_rtx);
+ rtx index = gen_reg_rtx (builder.int_mode ());
+ rtx range = gen_int_mode (builder.npatterns () - 1,
+ GET_MODE_INNER (builder.int_mode ()));
+ rtx and_ops[] = {index, vid, range};
+ emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
+ BINARY_OP, and_ops);
+
+ /* Create vector = { 3, 5, -2, -1, -3, 2, -1, -3, X, ... , X }. */
+ builder.merge_pattern (src);
+ rtx init = gen_reg_rtx (builder.mode ());
+ expand_vector_init_trailing_same_elem (init, builder,
+ builder.npatterns () - 1,
+ builder.npatterns ());
+
+ /* Shuffle vector with index. */
+ emit_vlmax_gather_insn (target, gen_lowpart (mode, init), index);
+ }
else
{
/* We handle the case that we can't find a vector containter to hold
@@ -2268,47 +2383,6 @@ expand_vector_init_merge_combine_sequence (rtx target,
emit_vlmax_insn (icode, MERGE_OP, merge_ops);
}
-/* Subroutine of expand_vec_init to handle case
- when all trailing elements of builder are same.
- This works as follows:
- (a) Use expand_insn interface to broadcast last vector element in TARGET.
- (b) Insert remaining elements in TARGET using insr.
-
- ??? The heuristic used is to do above if number of same trailing elements
- is greater than leading_ndups, loosely based on
- heuristic from mostly_zeros_p. May need fine-tuning. */
-
-static bool
-expand_vector_init_trailing_same_elem (rtx target,
- const rtx_vector_builder &builder,
- int nelts_reqd)
-{
- int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
- int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
- machine_mode mode = GET_MODE (target);
-
- if (trailing_ndups > leading_ndups)
- {
- rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
- for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
- {
- unsigned int unspec
- = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
- insn_code icode = code_for_pred_slide (unspec, mode);
- rtx tmp = gen_reg_rtx (mode);
- rtx ops[] = {tmp, dup, builder.elt (i)};
- emit_vlmax_insn (icode, BINARY_OP, ops);
- /* slide1up need source and dest to be different REG. */
- dup = tmp;
- }
-
- emit_move_insn (target, dup);
- return true;
- }
-
- return false;
-}
-
/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
void
@@ -2378,11 +2452,19 @@ expand_vec_init (rtx target, rtx vals)
/* Optimize trailing same elements sequence:
v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */
- if (!expand_vector_init_trailing_same_elem (target, v, nelts))
- /* Handle common situation by vslide1down. This function can handle any
- situation of vec_init<mode>. Only the cases that are not optimized above
- will fall through here. */
- expand_vector_init_insert_elems (target, v, nelts);
+ int leading_ndups = v.count_dups (0, nelts - 1, 1);
+ int trailing_ndups = v.count_dups (nelts - 1, -1, -1);
+ if (trailing_ndups > leading_ndups)
+ {
+ expand_vector_init_trailing_same_elem (target, v, nelts - trailing_ndups,
+ nelts);
+ return;
+ }
+
+ /* Handle common situation by vslide1down. This function can handle any
+ situation of vec_init<mode>. Only the cases that are not optimized above
+ will fall through here. */
+ expand_vector_init_insert_elems (target, v, nelts);
}
/* Get insn code for corresponding comparison. */
@@ -4,7 +4,7 @@
#include <stdint-gcc.h>
void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
@@ -19,9 +19,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
}
}
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
@@ -4,7 +4,7 @@
#include <stdint-gcc.h>
void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
for (int i = 0; i < n; ++i)
{
@@ -28,9 +28,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
}
}
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
new file mode 100644
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fno-schedule-insns -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+/*
+** foo:
+** ...
+** vslide1up\.vx\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+
+** ...
+** vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+** vle16\.v\s+v8,\s*0\([a-x0-9]+\)
+** ...
+** vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+void
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 6] + 1;
+ a[i * 8 + 2] = b[i * 8 + 0] + 1;
+ a[i * 8 + 3] = b[i * 8 + 2] + 1;
+ a[i * 8 + 4] = b[i * 8 + 1] + 1;
+ a[i * 8 + 5] = b[i * 8 + 7] + 1;
+ a[i * 8 + 6] = b[i * 8 + 5] + 1;
+ a[i * 8 + 7] = b[i * 8 + 4] + 1;
+ }
+}
+
+/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv} 2 } } */
+/* { dg-final { scan-assembler-not {vmv1r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv2r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv4r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv8r\.v} } } */