[V2] RISC-V: Adjust scalar_to_vec cost accurately
Checks
Commit Message
1. This patch sets the scalar_to_vec cost to 2 instead of 1, since a scalar move
instruction is slightly more costly than a normal RVV instruction (e.g. vadd.vv).
2. Adjust the scalar_to_vec cost accurately according to the splat value. For example,
a value like 32872 needs 2 additional scalar instructions to synthesize,
so the cost = 2 (scalar instructions) + 2 (scalar move).
We adjust the cost like this because the vectorized code does need these extra
instructions, whereas they are not needed in the scalar code.
After this patch, we have optimal codegen regardless of whether
-march=rv64gcv_zvl256b or -march=rv64gcv_zvl4096b is used:
lui a5,%hi(a)
li a4,19
sb a4,%lo(a)(a5)
li a0,0
ret
PR target/113281
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost accurately.
(costs::add_stmt_cost): Ditto.
* config/riscv/riscv.cc: Ditto.
* config/riscv/t-riscv: Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test.
* gcc.target/riscv/rvv/autovec/zve32f-1.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test.
* gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test.
---
gcc/config/riscv/riscv-vector-costs.cc | 50 ++++++++++++++++++-
gcc/config/riscv/riscv.cc | 4 +-
gcc/config/riscv/t-riscv | 2 +-
.../vect/costmodel/riscv/rvv/pr113281-1.c | 18 +++++++
.../vect/costmodel/riscv/rvv/pr113281-2.c | 18 +++++++
.../gcc.target/riscv/rvv/autovec/pr113209.c | 2 +-
.../gcc.target/riscv/rvv/autovec/zve32f-1.c | 2 +-
7 files changed, 90 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c
Comments
> 1. This patch set scalar_to_vec cost as 2 instead 1 since scalar move
> instruction is slightly more costly than normal rvv instructions (e.g. vadd.vv).
We can go with 2 or 3 (if needed) for now but should later
really incorporate reg-move costs in this IMHO. Just like e.g.
static const struct cpu_regmove_cost cortexa57_regmove_cost =
{
1, /* GP2GP */
/* Avoid the use of slow int<->fp moves for spilling by setting
their cost higher than memmov_cost. */
5, /* GP2FP */
...
};
we can add V2FP, V2GP and the reverse. Then add those to
scalar_to_vec (later vec_to_scalar as well) in adjust_stmt_cost
according to the mode.
> 2. Adjust scalar_to_vec cost accurately according to the splat value, for example,
> a value like 32872, needs 2 more scalar instructions:
> so the cost = 2 (scalar instructions) + 2 (scalar move).
> We adjust the cost like this since it doesn need such many instructions in vectorized codes,
> wheras they are not needed in scalar codes.
I'm afraid the issue I mentioned (we don't count the constant
synthesis for scalar but would for vector with the change) is
still present.
Even if it does not cause any regressions or problems now it
certainly might in the future, especially with complex constants.
Basically we would not vectorize something containing several
synthesized constants (like popcount) anymore.
Therefore I would advise against it even though the given
example cannot be "solved" unconditionally then.
Regards
Robin
@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. If not see
#include "backend.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-niter.h"
+#include "emit-rtl.h"
/* This file should be included last. */
#include "riscv-vector-costs.h"
@@ -1055,6 +1056,50 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
return vector_costs::better_main_loop_than_p (other);
}
+/* Adjust vectorization cost after calling
+ targetm.vectorize.builtin_vectorization_cost. For some statement, we would
+ like to further fine-grain tweak the cost on top of
+ targetm.vectorize.builtin_vectorization_cost handling which doesn't have any
+ information on statement operation codes etc. */
+
+static unsigned
+adjust_stmt_cost (enum vect_cost_for_stmt kind,
+ struct _stmt_vec_info *stmt_info, int count, int stmt_cost)
+{
+ gimple *stmt = stmt_info->stmt;
+ switch (kind)
+ {
+ case scalar_to_vec: {
+ stmt_cost *= count;
+ gcall *call = dyn_cast<gcall *> (stmt);
+ /* Adjust cost by counting the scalar value initialization. */
+ unsigned int num
+ = call ? gimple_call_num_args (call) : gimple_num_ops (stmt);
+ unsigned int start = call ? 0 : 1;
+
+ for (unsigned int i = start; i < num; i++)
+ {
+ tree op = call ? gimple_call_arg (call, i) : gimple_op (stmt, i);
+ if (TREE_CODE (op) == INTEGER_CST)
+ {
+ HOST_WIDE_INT value = tree_fits_shwi_p (op) ? tree_to_shwi (op)
+ : tree_to_uhwi (op);
+ /* We don't need to count scalar costs if it
+ is in range of [-16, 15] since we can use
+ vmv.v.i. */
+ if (!IN_RANGE (value, -16, 15))
+ stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode));
+ }
+ /* TODO: We don't count CONST_POLY_INT value for now. */
+ }
+ return stmt_cost;
+ }
+ default:
+ break;
+ }
+ return count * stmt_cost;
+}
+
unsigned
costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, slp_tree, tree vectype,
@@ -1082,9 +1127,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
as one iteration of the VLA loop. */
if (where == vect_body && m_unrolled_vls_niters)
m_unrolled_vls_stmts += count * m_unrolled_vls_niters;
+
+ if (vectype)
+ stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost);
}
- return record_stmt_cost (stmt_info, where, count * stmt_cost);
+ return record_stmt_cost (stmt_info, where, stmt_cost);
}
void
@@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* gather_load_cost */
1, /* scatter_store_cost */
1, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
+ 2, /* scalar_to_vec_cost */
1, /* permute_cost */
1, /* align_load_cost */
1, /* align_store_cost */
@@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* gather_load_cost */
1, /* scatter_store_cost */
1, /* vec_to_scalar_cost */
- 1, /* scalar_to_vec_cost */
+ 2, /* scalar_to_vec_cost */
1, /* permute_cost */
1, /* align_load_cost */
1, /* align_store_cost */
@@ -73,7 +73,7 @@ riscv-vector-costs.o: $(srcdir)/config/riscv/riscv-vector-costs.cc \
$(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(TARGET_H) $(FUNCTION_H) \
$(TREE_H) basic-block.h $(RTL_H) gimple.h targhooks.h cfgloop.h \
fold-const.h $(TM_P_H) tree-vectorizer.h gimple-iterator.h bitmap.h \
- ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h \
+ ssa.h backend.h tree-data-ref.h tree-ssa-loop-niter.h emit-rtl.h \
$(srcdir)/config/riscv/riscv-vector-costs.h
$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
$(srcdir)/config/riscv/riscv-vector-costs.cc
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -ftree-vectorize" } */
+
+unsigned char a;
+
+int main() {
+ short b = a = 0;
+ for (; a != 19; a++)
+ if (a)
+ b = 32872 >> a;
+
+ if (b == 0)
+ return 0;
+ else
+ return 1;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvl4096b -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=m8" } */
+
+unsigned char a;
+
+int main() {
+ short b = a = 0;
+ for (; a != 19; a++)
+ if (a)
+ b = 32872 >> a;
+
+ if (b == 0)
+ return 0;
+ else
+ return 1;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */
+/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -fno-vect-cost-model" } */
int b, c, d, f, i, a;
int e[1] = {0};
@@ -3,4 +3,4 @@
#include "template-1.h"
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */