Consider the following case:
int
bar (int *x, int a, int b, int n)
{
x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
int sum1 = 0;
int sum2 = 0;
for (int i = 0; i < n; ++i)
{
sum1 += x[2*i] - a;
sum1 += x[2*i+1] * b;
sum2 += x[2*i] - b;
sum2 += x[2*i+1] * a;
}
return sum1 + sum2;
}
Before this patch:
bar:
ble a3,zero,.L5
csrr t0,vlenb
csrr a6,vlenb
slli t1,t0,3
vsetvli a5,zero,e32,m4,ta,ma
sub sp,sp,t1
vid.v v20
vmv.v.x v12,a1
vand.vi v4,v20,1
vmv.v.x v16,a2
vmseq.vi v4,v4,1
slli t3,a6,2
vsetvli zero,a5,e32,m4,ta,ma
vmv1r.v v0,v4
viota.m v8,v4
add a7,t3,sp
vsetvli a5,zero,e32,m4,ta,mu
vand.vi v28,v20,-2
vadd.vi v4,v28,1
vs4r.v v20,0(a7) ----- spill
vrgather.vv v24,v12,v8
vrgather.vv v20,v16,v8
vrgather.vv v24,v16,v8,v0.t
vrgather.vv v20,v12,v8,v0.t
vs4r.v v4,0(sp) ----- spill
slli a3,a3,1
addi t4,a6,-1
neg t1,a6
vmv4r.v v0,v20
vmv.v.i v4,0
j .L4
.L13:
vsetvli a5,zero,e32,m4,ta,ma
.L4:
mv a7,a3
mv a4,a3
bleu a3,a6,.L3
csrr a4,vlenb
.L3:
vmv.v.x v8,t4
vl4re32.v v12,0(sp) ---- spill
vand.vv v20,v28,v8
vand.vv v8,v12,v8
vsetvli zero,a4,e32,m4,ta,ma
vle32.v v16,0(a0)
vsetvli a5,zero,e32,m4,ta,ma
add a3,a3,t1
vrgather.vv v12,v16,v20
add a0,a0,t3
vrgather.vv v20,v16,v8
vsub.vv v12,v12,v0
vsetvli zero,a4,e32,m4,tu,ma
vadd.vv v4,v4,v12
vmacc.vv v4,v24,v20
bgtu a7,a6,.L13
csrr a1,vlenb
slli a1,a1,2
add a1,a1,sp
li a4,-1
csrr t0,vlenb
vsetvli a5,zero,e32,m4,ta,ma
vl4re32.v v12,0(a1) ---- spill
vmv.v.i v8,0
vmul.vx v0,v12,a4
li a2,0
slli t1,t0,3
vadd.vi v0,v0,-1
vand.vi v0,v0,1
vmseq.vv v0,v0,v8
vand.vi v12,v12,1
vmerge.vvm v16,v8,v4,v0
vmseq.vv v12,v12,v8
vmv.s.x v1,a2
vmv1r.v v0,v12
vredsum.vs v16,v16,v1
vmerge.vvm v8,v8,v4,v0
vmv.x.s a0,v16
vredsum.vs v8,v8,v1
vmv.x.s a5,v8
add sp,sp,t1
addw a0,a0,a5
jr ra
.L5:
li a0,0
ret
We can see there are multiple horrible register spills.
The root cause of this issue is that, for a scalar IR load:
_5 = *_4;
we didn't check whether it is a contiguous load/store or a gather/scatter
load/store, even though it will be translated into either:
1. MASK_LEN_GATHER_LOAD (..., perm indices), or
2. a contiguous load/store + VEC_PERM (..., perm indices).
In either case we end up consuming one extra vector register group (the perm
indices) that we didn't count before.
So for this case the dynamic LMUL cost model picked LMUL = 4, which is the wrong choice.
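For illustration, here is a rough GIMPLE-like sketch of the two vectorized forms
(the temporary names and the exact internal-function arguments are only
illustrative, not the real IR the vectorizer emits):

  /* Perm indices, e.g. { 0, 2, 4, ... }, needed for the whole loop body.  */
  vect_perm_indices = { 0, 2, 4, ... };

  /* Form 1: gather load driven by the perm indices.  */
  vect__5 = .MASK_LEN_GATHER_LOAD (_4, vect_perm_indices, ...);

  /* Form 2: contiguous load followed by a permute.  */
  vect_tmp = .MASK_LEN_LOAD (_4, ...);
  vect__5 = VEC_PERM_EXPR <vect_tmp, vect_tmp, vect_perm_indices>;

Either way, vect_perm_indices occupies a vector register group for the whole
loop, which the previous live-range accounting missed.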
The key change in this patch is:
if ((type == load_vec_info_type || type == store_vec_info_type)
&& !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
{
...
}
That is, count one more vector register group when the load/store is not adjacent (contiguous).
After this patch, it picks LMUL = 2, which is optimal:
bar:
ble a3,zero,.L4
csrr a6,vlenb
vsetvli a5,zero,e32,m2,ta,ma
vmv.v.x v6,a2
srli a2,a6,1
vmv.v.x v4,a1
vid.v v12
slli a3,a3,1
vand.vi v0,v12,1
addi t1,a2,-1
vmseq.vi v0,v0,1
slli a6,a6,1
vsetvli zero,a5,e32,m2,ta,ma
neg a7,a2
viota.m v2,v0
vsetvli a5,zero,e32,m2,ta,mu
vrgather.vv v16,v4,v2
vrgather.vv v14,v6,v2
vrgather.vv v16,v6,v2,v0.t
vrgather.vv v14,v4,v2,v0.t
vand.vi v18,v12,-2
vmv.v.i v2,0
vadd.vi v20,v18,1
.L3:
minu a4,a3,a2
vsetvli zero,a4,e32,m2,ta,ma
vle32.v v8,0(a0)
vsetvli a5,zero,e32,m2,ta,ma
vmv.v.x v4,t1
vand.vv v10,v18,v4
vrgather.vv v6,v8,v10
vsub.vv v6,v6,v14
vsetvli zero,a4,e32,m2,tu,ma
vadd.vv v2,v2,v6
vsetvli a1,zero,e32,m2,ta,ma
vand.vv v4,v20,v4
vrgather.vv v6,v8,v4
vsetvli zero,a4,e32,m2,tu,ma
mv a4,a3
add a0,a0,a6
add a3,a3,a7
vmacc.vv v2,v16,v6
bgtu a4,a2,.L3
vsetvli a1,zero,e32,m2,ta,ma
vand.vi v0,v12,1
vmv.v.i v4,0
li a3,-1
vmseq.vv v0,v0,v4
vmv.s.x v1,zero
vmerge.vvm v6,v4,v2,v0
vredsum.vs v6,v6,v1
vmul.vx v0,v12,a3
vadd.vi v0,v0,-1
vand.vi v0,v0,1
vmv.x.s a4,v6
vmseq.vv v0,v0,v4
vmv.s.x v1,zero
vmerge.vvm v4,v4,v2,v0
vredsum.vs v4,v4,v1
vmv.x.s a0,v4
addw a0,a0,a4
ret
.L4:
li a0,0
ret
No spills.
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (compute_local_program_points): Minor refactor.
(max_number_of_live_regs): Use safe_grow_cleared.
(get_store_value): New function.
(non_contiguous_memory_access_p): Ditto.
(update_local_live_ranges): Add perm indices register consumption for non-contiguous loads/stores to fix the big LMUL choice.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: New test.
---
gcc/config/riscv/riscv-vector-costs.cc | 93 +++++++++++++++++--
.../costmodel/riscv/rvv/dynamic-lmul2-7.c | 25 +++++
2 files changed, 109 insertions(+), 9 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see
#include "bitmap.h"
#include "ssa.h"
#include "backend.h"
+#include "tree-data-ref.h"
/* This file should be included last. */
#include "riscv-vector-costs.h"
@@ -135,8 +136,9 @@ compute_local_program_points (
|| is_gimple_call (gsi_stmt (si))))
continue;
stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
- if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
- != undef_vec_info_type)
+ enum stmt_vec_info_type type
+ = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ if (type != undef_vec_info_type)
{
stmt_point info = {point, gsi_stmt (si)};
program_points.safe_push (info);
@@ -289,9 +291,7 @@ max_number_of_live_regs (const basic_block bb,
unsigned int i;
unsigned int live_point = 0;
auto_vec<unsigned int> live_vars_vec;
- live_vars_vec.safe_grow (max_point + 1, true);
- for (i = 0; i < live_vars_vec.length (); ++i)
- live_vars_vec[i] = 0;
+ live_vars_vec.safe_grow_cleared (max_point + 1, true);
for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
iter != live_ranges.end (); ++iter)
{
@@ -360,6 +360,31 @@ get_current_lmul (class loop *loop)
return loop_autovec_infos.get (loop)->current_lmul;
}
+/* Get STORE value. */
+static tree
+get_store_value (gimple *stmt)
+{
+ if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
+ {
+ if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+ return gimple_call_arg (stmt, 3);
+ else
+ gcc_unreachable ();
+ }
+ else
+ return gimple_assign_rhs1 (stmt);
+}
+
+/* Return true if it is a non-contiguous load/store. */
+static bool
+non_contiguous_memory_access_p (stmt_vec_info stmt_info)
+{
+ enum stmt_vec_info_type type
+ = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ return ((type == load_vec_info_type || type == store_vec_info_type)
+ && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)));
+}
+
/* Update the live ranges according PHI.
Loop:
@@ -395,13 +420,15 @@ update_local_live_ranges (
unsigned int nbbs = loop->num_nodes;
unsigned int i, j;
gphi_iterator psi;
+ gimple_stmt_iterator si;
for (i = 0; i < nbbs; i++)
{
basic_block bb = bbs[i];
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
- "Update local program points for bb %d:\n", bb->index);
- for (psi = gsi_start_phis (bbs[i]); !gsi_end_p (psi); gsi_next (&psi))
+ "Update local program points for bb %d:\n",
+ bbs[i]->index);
+ for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi))
{
gphi *phi = psi.phi ();
stmt_vec_info stmt_info = vinfo->lookup_stmt (phi);
@@ -413,12 +440,23 @@ update_local_live_ranges (
{
edge e = gimple_phi_arg_edge (phi, j);
tree def = gimple_phi_arg_def (phi, j);
- auto *live_ranges = live_ranges_per_bb.get (e->src);
+ auto *live_ranges = live_ranges_per_bb.get (bb);
+ auto *live_range = live_ranges->get (def);
+ if (live_range && flow_bb_inside_loop_p (loop, e->src))
+ {
+ unsigned int start = (*live_range).first;
+ (*live_range).first = 0;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Update %T start point from %d to %d:\n",
+ def, start, (*live_range).first);
+ }
+ live_ranges = live_ranges_per_bb.get (e->src);
if (!program_points_per_bb.get (e->src))
continue;
unsigned int max_point
= (*program_points_per_bb.get (e->src)).length () - 1;
- auto *live_range = live_ranges->get (def);
+ live_range = live_ranges->get (def);
if (!live_range)
continue;
@@ -430,6 +468,43 @@ update_local_live_ranges (
end, (*live_range).second);
}
}
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+ {
+ if (!(is_gimple_assign (gsi_stmt (si))
+ || is_gimple_call (gsi_stmt (si))))
+ continue;
+ stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
+ enum stmt_vec_info_type type
+ = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+ if (non_contiguous_memory_access_p (stmt_info))
+ {
+ /* For non-adjacent load/store STMT, we will potentially
+ convert it into:
+
+ 1. MASK_LEN_GATHER_LOAD (..., perm indices).
+ 2. Contiguous load/store + VEC_PERM (..., perm indices)
+
+ We will likely be using one more vector variable. */
+ unsigned int max_point
+ = (*program_points_per_bb.get (bb)).length () - 1;
+ auto *live_ranges = live_ranges_per_bb.get (bb);
+ bool existed_p = false;
+ tree var = type == load_vec_info_type
+ ? gimple_get_lhs (gsi_stmt (si))
+ : get_store_value (gsi_stmt (si));
+ tree sel_type = build_nonstandard_integer_type (
+ TYPE_PRECISION (TREE_TYPE (var)), 1);
+ tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+ get_identifier ("vect_perm"), sel_type);
+ pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
+ gcc_assert (!existed_p);
+ live_range = pair (0, max_point);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Add perm indices %T, start = 0, end = %d\n",
+ sel, max_point);
+ }
+ }
}
}
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+int
+bar (int *x, int a, int b, int n)
+{
+ x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
+ int sum1 = 0;
+ int sum2 = 0;
+ for (int i = 0; i < n; ++i)
+ {
+ sum1 += x[2*i] - a;
+ sum1 += x[2*i+1] * b;
+ sum2 += x[2*i] - b;
+ sum2 += x[2*i+1] * a;
+ }
+ return sum1 + sum2;
+}
+
+/* { dg-final { scan-assembler {e32,m2} } } */
+/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */