RISC-V: Add initial cost handling for segment loads/stores.
Commit Message
Hi,
This has been sitting on my local tree - I've been wanting to post it
for a while but somehow forgot.
This patch makes segment loads and stores more expensive. It adds
segment_load and segment_store cost fields to the common vector costs
and adds handling to adjust_stmt_cost. In the future we could handle
this in a more fine-grained manner but let's start somewhere.
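(For illustration only, not part of the patch: this is the kind of
interleaved access the vectorizer implements via VMAT_LOAD_STORE_LANES,
i.e. RVV segment loads/stores such as vlseg2e64.v/vsseg2e64.v. The
struct and names here are made up.)

typedef struct { double re, im; } cplx_t;

void
scale (cplx_t *restrict dst, const cplx_t *restrict src, double f, int n)
{
  for (int i = 0; i < n; i++)
    {
      /* Two interleaved fields, i.e. a segment group of NF = 2.  */
      dst[i].re = src[i].re * f;
      dst[i].im = src[i].im * f;
    }
}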
Regtested on rv64.
Regards
Robin
gcc/ChangeLog:
* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_[load/store]_cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_[load/store]_cost
to 1.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: Expect m4
instead of m2.
---
gcc/config/riscv/riscv-protos.h | 4 +
gcc/config/riscv/riscv-vector-costs.cc | 127 ++++++++++++------
gcc/config/riscv/riscv.cc | 4 +
.../vect/costmodel/riscv/rvv/pr113112-4.c | 4 +-
4 files changed, 95 insertions(+), 44 deletions(-)
Comments
This patch looks odd to me.
I don't see memrefs in the trunk code.
Also, I would prefer listing all the costs in the cost tune info for NF = 2 ~ 8 like ARM SVE does:
/* Detect cases in which a vector load or store represents an
   LD[234] or ST[234] instruction.  */
switch (aarch64_ld234_st234_vectors (kind, stmt_info))
  {
  case 2:
    stmt_cost += simd_costs->ld2_st2_permute_cost;
    break;
  case 3:
    stmt_cost += simd_costs->ld3_st3_permute_cost;
    break;
  case 4:
    stmt_cost += simd_costs->ld4_st4_permute_cost;
    break;
  }
juzhe.zhong@rivai.ai
From: Robin Dapp
Date: 2024-02-26 23:54
To: gcc-patches; palmer; Kito Cheng; juzhe.zhong@rivai.ai
CC: rdapp.gcc; jeffreyalaw
Subject: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..2e8ab9990a8 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,10 @@ struct common_vector_cost
const int gather_load_cost;
const int scatter_store_cost;
+ /* Segment load/store cost. */
+ const int segment_load_cost;
+ const int segment_store_cost;
+
/* Cost of a vector-to-scalar operation. */
const int vec_to_scalar_cost;
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..d3c12444773 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,24 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
return vector_costs::better_main_loop_than_p (other);
}
+/* Returns the group size i.e. the number of vectors to be loaded by a
+ segmented load/store instruction. Return 0 if it is not a segmented
+ load/store. */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+ if ((kind == vector_load || kind == vector_store)
+ && STMT_VINFO_DATA_REF (stmt_info))
+ {
+ stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ return DR_GROUP_SIZE (stmt_info);
+ }
+ return 0;
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1085,80 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
case vector_load:
case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
- as opposed to scalar loads and stores.
- If the address depends on a variable we need an additional
- add/sub for each load/store in the worst case. */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ int group_size;
+ if ((group_size
+ = segment_loadstore_group_size (kind, stmt_info)) > 1)
{
- tree ref;
- if (TREE_CODE (dr->ref) != MEM_REF
- || !(ref = TREE_OPERAND (dr->ref, 0))
- || TREE_CODE (ref) != SSA_NAME)
- break;
+ /* Segment loads and stores. When the group size is > 1
+ the vectorizer will add a vector load/store statement for
+ each vector in the group. Note that STMT_COST is
+ overwritten here rather than adjusted. */
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost
+ = (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
+ ? costs->vla->segment_load_cost
+ : costs->vla->segment_store_cost);
+ else
+ stmt_cost
+ = (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
+ ? costs->vls->segment_load_cost
+ : costs->vls->segment_store_cost);
+ break;
+ /* TODO: Indexed and ordered/unordered cost. */
+ }
+ else
+ {
+ /* Unit-stride vector loads and stores do not have offset
+ addressing as opposed to scalar loads and stores.
+ If the address depends on a variable we need an additional
+ add/sub for each load/store in the worst case. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ class loop *father = stmt_info->stmt->bb->loop_father;
+ if (!loop && father && !father->inner && father->superloops)
+ {
+ tree ref;
+ if (TREE_CODE (dr->ref) != MEM_REF
+ || !(ref = TREE_OPERAND (dr->ref, 0))
+ || TREE_CODE (ref) != SSA_NAME)
+ break;
- if (SSA_NAME_IS_DEFAULT_DEF (ref))
- break;
+ if (SSA_NAME_IS_DEFAULT_DEF (ref))
+ break;
- if (memrefs.contains ({ref, cst0}))
- break;
+ if (memrefs.contains ({ref, cst0}))
+ break;
- memrefs.add ({ref, cst0});
+ memrefs.add ({ref, cst0});
- /* In case we have not seen REF before and the base address
- is a pointer operation try a bit harder. */
- tree base = DR_BASE_ADDRESS (dr);
- if (TREE_CODE (base) == POINTER_PLUS_EXPR
- || TREE_CODE (base) == POINTER_DIFF_EXPR)
- {
- /* Deconstruct BASE's first operand. If it is a binary
- operation, i.e. a base and an "offset" store this
- pair. Only increase the stmt_cost if we haven't seen
- it before. */
- tree argp = TREE_OPERAND (base, 1);
- typedef std::pair<tree, tree> addr_pair;
- addr_pair pair;
- if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ /* In case we have not seen REF before and the base
+ address is a pointer operation try a bit harder. */
+ tree base = DR_BASE_ADDRESS (dr);
+ if (TREE_CODE (base) == POINTER_PLUS_EXPR
+ || TREE_CODE (base) == POINTER_DIFF_EXPR)
{
- tree argp0 = tree_strip_nop_conversions
- (TREE_OPERAND (argp, 0));
- tree argp1 = TREE_OPERAND (argp, 1);
- pair = addr_pair (argp0, argp1);
- if (memrefs.contains (pair))
- break;
-
- memrefs.add (pair);
- stmt_cost += builtin_vectorization_cost (scalar_stmt,
- NULL_TREE, 0);
+ /* Deconstruct BASE's first operand. If it is a
+ binary operation, i.e. a base and an "offset"
+ store this pair. Only increase the stmt_cost if
+ we haven't seen it before. */
+ tree argp = TREE_OPERAND (base, 1);
+ typedef std::pair<tree, tree> addr_pair;
+ addr_pair pair;
+ if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ {
+ tree argp0 = tree_strip_nop_conversions
+ (TREE_OPERAND (argp, 0));
+ tree argp1 = TREE_OPERAND (argp, 1);
+ pair = addr_pair (argp0, argp1);
+ if (memrefs.contains (pair))
+ break;
+
+ memrefs.add (pair);
+ stmt_cost
+ += builtin_vectorization_cost (scalar_stmt,
+ NULL_TREE, 0);
+ }
}
}
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..6c2f0ec34f4 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,8 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 2, /* segment_load_cost */
+ 2, /* segment_store_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
@@ -381,6 +383,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 2, /* segment_load_cost */
+ 2, /* segment_store_cost */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
index 5c55a66ed77..bdeedaf5224 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
@@ -21,8 +21,8 @@ void move_replacements (rtx *x, rtx *y, int n_replacements)
}
}
-/* { dg-final { scan-assembler {e64,m2} } } */
-/* { dg-final { scan-assembler-not {e64,m4} } } */
+/* { dg-final { scan-assembler-not {e64,m2} } } */
+/* { dg-final { scan-assembler {e64,m4} } } */
/* { dg-final { scan-assembler-not {jr} } } */
/* { dg-final { scan-assembler {ret} } } */
/* { dg-final { scan-assembler-not {sp} } } */
--
2.43.2
> This patch looks odd to me.
> I don't see memrefs in the trunk code.
It's on top of the vle/vse offset handling patch from
a while back that I haven't committed yet.
> Also, I would prefer listing all the costs in the cost tune info for NF = 2 ~ 8 like ARM SVE does:
I don't mind having separate costs for each, but I figured they
already scale with the number of vectors anyway. The attached v2
is more similar to aarch64.
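(To illustrate the scaling, with made-up numbers: assuming a base
vector load cost of 1, an NF = 4 group makes the vectorizer emit four
vector load statements. v1 would charge 4 * segment_load_cost
= 4 * 2 = 8 in total, while v2 charges 4 * (1 + segment_permute_4)
= 4 * 2 = 8, so both schemes grow with the group size.)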
Regards
Robin
Subject: [PATCH v2] RISC-V: Add initial cost handling for segment
loads/stores.
This patch makes segment loads and stores more expensive. It adds
segment_permute_2 (as well as 4 and 8) cost fields to the common vector
costs and adds handling to adjust_stmt_cost.
gcc/ChangeLog:
* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_permute cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_permute_[248] to 1.
---
gcc/config/riscv/riscv-protos.h | 5 +
gcc/config/riscv/riscv-vector-costs.cc | 139 +++++++++++++++++--------
gcc/config/riscv/riscv.cc | 6 ++
3 files changed, 108 insertions(+), 42 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..9b737aca1a3 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,11 @@ struct common_vector_cost
const int gather_load_cost;
const int scatter_store_cost;
+ /* Segment load/store permute cost. */
+ const int segment_permute_2;
+ const int segment_permute_4;
+ const int segment_permute_8;
+
/* Cost of a vector-to-scalar operation. */
const int vec_to_scalar_cost;
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..c8178d71101 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
return vector_costs::better_main_loop_than_p (other);
}
+/* Returns the group size i.e. the number of vectors to be loaded by a
+ segmented load/store instruction. Return 0 if it is not a segmented
+ load/store. */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+ if (stmt_info
+ && (kind == vector_load || kind == vector_store)
+ && STMT_VINFO_DATA_REF (stmt_info))
+ {
+ stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ return DR_GROUP_SIZE (stmt_info);
+ }
+ return 0;
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,91 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
case vector_load:
case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
- as opposed to scalar loads and stores.
- If the address depends on a variable we need an additional
- add/sub for each load/store in the worst case. */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ /* Segment loads and stores. When the group size is > 1
+ the vectorizer will add a vector load/store statement for
+ each vector in the group. Here we additionally add permute
+ costs for each. */
+ /* TODO: Indexed and ordered/unordered cost. */
+ int group_size = segment_loadstore_group_size (kind, stmt_info);
+ if (group_size > 1)
+ {
+ switch (group_size)
+ {
+ case 2:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_2;
+ else
+ stmt_cost += costs->vls->segment_permute_2;
+ break;
+ case 4:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_4;
+ else
+ stmt_cost += costs->vls->segment_permute_4;
+ break;
+ case 8:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_8;
+ else
+ stmt_cost += costs->vls->segment_permute_8;
+ break;
+ default:
+ break;
+ }
+ }
+ else
{
- tree ref;
- if (TREE_CODE (dr->ref) != MEM_REF
- || !(ref = TREE_OPERAND (dr->ref, 0))
- || TREE_CODE (ref) != SSA_NAME)
- break;
+ /* Unit-stride vector loads and stores do not have offset
+ addressing as opposed to scalar loads and stores.
+ If the address depends on a variable we need an additional
+ add/sub for each load/store in the worst case. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ class loop *father = stmt_info->stmt->bb->loop_father;
+ if (!loop && father && !father->inner && father->superloops)
+ {
+ tree ref;
+ if (TREE_CODE (dr->ref) != MEM_REF
+ || !(ref = TREE_OPERAND (dr->ref, 0))
+ || TREE_CODE (ref) != SSA_NAME)
+ break;
- if (SSA_NAME_IS_DEFAULT_DEF (ref))
- break;
+ if (SSA_NAME_IS_DEFAULT_DEF (ref))
+ break;
- if (memrefs.contains ({ref, cst0}))
- break;
+ if (memrefs.contains ({ref, cst0}))
+ break;
- memrefs.add ({ref, cst0});
+ memrefs.add ({ref, cst0});
- /* In case we have not seen REF before and the base address
- is a pointer operation try a bit harder. */
- tree base = DR_BASE_ADDRESS (dr);
- if (TREE_CODE (base) == POINTER_PLUS_EXPR
- || TREE_CODE (base) == POINTER_DIFF_EXPR)
- {
- /* Deconstruct BASE's first operand. If it is a binary
- operation, i.e. a base and an "offset" store this
- pair. Only increase the stmt_cost if we haven't seen
- it before. */
- tree argp = TREE_OPERAND (base, 1);
- typedef std::pair<tree, tree> addr_pair;
- addr_pair pair;
- if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ /* In case we have not seen REF before and the base
+ address is a pointer operation try a bit harder. */
+ tree base = DR_BASE_ADDRESS (dr);
+ if (TREE_CODE (base) == POINTER_PLUS_EXPR
+ || TREE_CODE (base) == POINTER_DIFF_EXPR)
{
- tree argp0 = tree_strip_nop_conversions
- (TREE_OPERAND (argp, 0));
- tree argp1 = TREE_OPERAND (argp, 1);
- pair = addr_pair (argp0, argp1);
- if (memrefs.contains (pair))
- break;
-
- memrefs.add (pair);
- stmt_cost += builtin_vectorization_cost (scalar_stmt,
- NULL_TREE, 0);
+ /* Deconstruct BASE's first operand. If it is a
+ binary operation, i.e. a base and an "offset"
+ store this pair. Only increase the stmt_cost if
+ we haven't seen it before. */
+ tree argp = TREE_OPERAND (base, 1);
+ typedef std::pair<tree, tree> addr_pair;
+ addr_pair pair;
+ if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ {
+ tree argp0 = tree_strip_nop_conversions
+ (TREE_OPERAND (argp, 0));
+ tree argp1 = TREE_OPERAND (argp, 1);
+ pair = addr_pair (argp0, argp1);
+ if (memrefs.contains (pair))
+ break;
+
+ memrefs.add (pair);
+ stmt_cost
+ += builtin_vectorization_cost (scalar_stmt,
+ NULL_TREE, 0);
+ }
}
}
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..141278ec35e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,9 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 1, /* segment_permute (2) */
+ 1, /* segment_permute (4) */
+ 1, /* segment_permute (8) */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
@@ -381,6 +384,9 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 1, /* segment_permute (2) */
+ 1, /* segment_permute (4) */
+ 1, /* segment_permute (8) */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
+ /* Segment load/store permute cost. */
+ const int segment_permute_2;
+ const int segment_permute_4;
+ const int segment_permute_8;
Why do we only have 2/4/8? I think we should have 2/3/4/5/6/7/8.
juzhe.zhong@rivai.ai
> + /* Segment load/store permute cost. */
> + const int segment_permute_2;
> + const int segment_permute_4;
> + const int segment_permute_8;
>
> Why do we only have 2/4/8? I think we should have 2/3/4/5/6/7/8.
No idea why I posted that (wrong) version; I used it for
some testing locally. Attached is the proper version, still
called v3...
Regards
Robin
Subject: [PATCH v3] RISC-V: Add initial cost handling for segment
loads/stores.
This patch makes segment loads and stores more expensive. It adds
segment_permute_2 through segment_permute_8 cost fields to the common
vector costs and adds handling to adjust_stmt_cost.
gcc/ChangeLog:
* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_permute cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_permute_[2-8] to 1.
---
gcc/config/riscv/riscv-protos.h | 9 ++
gcc/config/riscv/riscv-vector-costs.cc | 163 ++++++++++++++++++-------
gcc/config/riscv/riscv.cc | 14 +++
3 files changed, 144 insertions(+), 42 deletions(-)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..90d1fcbb3b1 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,15 @@ struct common_vector_cost
const int gather_load_cost;
const int scatter_store_cost;
+ /* Segment load/store permute cost. */
+ const int segment_permute_2;
+ const int segment_permute_3;
+ const int segment_permute_4;
+ const int segment_permute_5;
+ const int segment_permute_6;
+ const int segment_permute_7;
+ const int segment_permute_8;
+
/* Cost of a vector-to-scalar operation. */
const int vec_to_scalar_cost;
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..f4da213fe14 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
return vector_costs::better_main_loop_than_p (other);
}
+/* Returns the group size i.e. the number of vectors to be loaded by a
+ segmented load/store instruction. Return 0 if it is not a segmented
+ load/store. */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+ if (stmt_info
+ && (kind == vector_load || kind == vector_store)
+ && STMT_VINFO_DATA_REF (stmt_info))
+ {
+ stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+ if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+ return DR_GROUP_SIZE (stmt_info);
+ }
+ return 0;
+}
+
/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,115 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, loop_vec_info loop,
case vector_load:
case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
- as opposed to scalar loads and stores.
- If the address depends on a variable we need an additional
- add/sub for each load/store in the worst case. */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ /* Segment loads and stores. When the group size is > 1
+ the vectorizer will add a vector load/store statement for
+ each vector in the group. Here we additionally add permute
+ costs for each. */
+ /* TODO: Indexed and ordered/unordered cost. */
+ int group_size = segment_loadstore_group_size (kind, stmt_info);
+ if (group_size > 1)
+ {
+ switch (group_size)
+ {
+ case 2:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_2;
+ else
+ stmt_cost += costs->vls->segment_permute_2;
+ break;
+ case 3:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_3;
+ else
+ stmt_cost += costs->vls->segment_permute_3;
+ break;
+ case 4:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_4;
+ else
+ stmt_cost += costs->vls->segment_permute_4;
+ break;
+ case 5:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_5;
+ else
+ stmt_cost += costs->vls->segment_permute_5;
+ break;
+ case 6:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_6;
+ else
+ stmt_cost += costs->vls->segment_permute_6;
+ break;
+ case 7:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_7;
+ else
+ stmt_cost += costs->vls->segment_permute_7;
+ break;
+ case 8:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+ stmt_cost += costs->vla->segment_permute_8;
+ else
+ stmt_cost += costs->vls->segment_permute_8;
+ break;
+ default:
+ break;
+ }
+ }
+ else
{
- tree ref;
- if (TREE_CODE (dr->ref) != MEM_REF
- || !(ref = TREE_OPERAND (dr->ref, 0))
- || TREE_CODE (ref) != SSA_NAME)
- break;
+ /* Unit-stride vector loads and stores do not have offset
+ addressing as opposed to scalar loads and stores.
+ If the address depends on a variable we need an additional
+ add/sub for each load/store in the worst case. */
+ data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+ class loop *father = stmt_info->stmt->bb->loop_father;
+ if (!loop && father && !father->inner && father->superloops)
+ {
+ tree ref;
+ if (TREE_CODE (dr->ref) != MEM_REF
+ || !(ref = TREE_OPERAND (dr->ref, 0))
+ || TREE_CODE (ref) != SSA_NAME)
+ break;
- if (SSA_NAME_IS_DEFAULT_DEF (ref))
- break;
+ if (SSA_NAME_IS_DEFAULT_DEF (ref))
+ break;
- if (memrefs.contains ({ref, cst0}))
- break;
+ if (memrefs.contains ({ref, cst0}))
+ break;
- memrefs.add ({ref, cst0});
+ memrefs.add ({ref, cst0});
- /* In case we have not seen REF before and the base address
- is a pointer operation try a bit harder. */
- tree base = DR_BASE_ADDRESS (dr);
- if (TREE_CODE (base) == POINTER_PLUS_EXPR
- || TREE_CODE (base) == POINTER_DIFF_EXPR)
- {
- /* Deconstruct BASE's first operand. If it is a binary
- operation, i.e. a base and an "offset" store this
- pair. Only increase the stmt_cost if we haven't seen
- it before. */
- tree argp = TREE_OPERAND (base, 1);
- typedef std::pair<tree, tree> addr_pair;
- addr_pair pair;
- if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ /* In case we have not seen REF before and the base
+ address is a pointer operation try a bit harder. */
+ tree base = DR_BASE_ADDRESS (dr);
+ if (TREE_CODE (base) == POINTER_PLUS_EXPR
+ || TREE_CODE (base) == POINTER_DIFF_EXPR)
{
- tree argp0 = tree_strip_nop_conversions
- (TREE_OPERAND (argp, 0));
- tree argp1 = TREE_OPERAND (argp, 1);
- pair = addr_pair (argp0, argp1);
- if (memrefs.contains (pair))
- break;
-
- memrefs.add (pair);
- stmt_cost += builtin_vectorization_cost (scalar_stmt,
- NULL_TREE, 0);
+ /* Deconstruct BASE's first operand. If it is a
+ binary operation, i.e. a base and an "offset"
+ store this pair. Only increase the stmt_cost if
+ we haven't seen it before. */
+ tree argp = TREE_OPERAND (base, 1);
+ typedef std::pair<tree, tree> addr_pair;
+ addr_pair pair;
+ if (TREE_CODE_CLASS (TREE_CODE (argp)) == tcc_binary)
+ {
+ tree argp0 = tree_strip_nop_conversions
+ (TREE_OPERAND (argp, 0));
+ tree argp1 = TREE_OPERAND (argp, 1);
+ pair = addr_pair (argp0, argp1);
+ if (memrefs.contains (pair))
+ break;
+
+ memrefs.add (pair);
+ stmt_cost
+ += builtin_vectorization_cost (scalar_stmt,
+ NULL_TREE, 0);
+ }
}
}
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..874a42873d7 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -365,6 +365,13 @@ static const common_vector_cost rvv_vls_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 1, /* segment_permute (2) */
+ 1, /* segment_permute (3) */
+ 1, /* segment_permute (4) */
+ 1, /* segment_permute (5) */
+ 1, /* segment_permute (6) */
+ 1, /* segment_permute (7) */
+ 1, /* segment_permute (8) */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
@@ -381,6 +388,13 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
1, /* fp_stmt_cost */
1, /* gather_load_cost */
1, /* scatter_store_cost */
+ 1, /* segment_permute (2) */
+ 1, /* segment_permute (3) */
+ 1, /* segment_permute (4) */
+ 1, /* segment_permute (5) */
+ 1, /* segment_permute (6) */
+ 1, /* segment_permute (7) */
+ 1, /* segment_permute (8) */
1, /* vec_to_scalar_cost */
1, /* scalar_to_vec_cost */
1, /* permute_cost */
Could you rebase to trunk? I don't think the segment load/store cost depends on the previous patch you sent.
juzhe.zhong@rivai.ai