RISC-V: Count pointer type SSA into RVV regs liveness for dynamic LMUL cost model
Checks
Commit Message
This patch fixes a case where an unexpectedly large LMUL was chosen, causing register spilling.
Before this patch, choosing LMUL = 4:
addi sp,sp,-160
addiw t1,a2,-1
li a5,7
bleu t1,a5,.L16
vsetivli zero,8,e64,m4,ta,ma
vmv.v.x v4,a0
vs4r.v v4,0(sp) ---> spill to the stack.
vmv.v.x v4,a1
addi a5,sp,64
vs4r.v v4,0(a5) ---> spill to the stack.
The root cause is the following codes:
if (poly_int_tree_p (var)
|| (is_gimple_val (var)
&& !POINTER_TYPE_P (TREE_TYPE (var))))
We count the variable as consuming an RVV register group when it is not POINTER_TYPE.
It is right for load/store STMT for example:
_1 = (MEM)*addr --> addr won't be allocated an RVV vector group.
However, we find it is not right for non-load/store STMT:
_3 = _1 == x_8(D);
_1 is a pointer type too, but we do allocate an RVV register group for it.
So after this patch, we are choosing the perfect LMUL for the testcase in this patch:
ble a2,zero,.L17
addiw a7,a2,-1
li a5,3
bleu a7,a5,.L15
srliw a5,a7,2
slli a6,a5,1
add a6,a6,a5
lui a5,%hi(replacements)
addi t1,a5,%lo(replacements)
slli a6,a6,5
lui t4,%hi(.LANCHOR0)
lui t3,%hi(.LANCHOR0+8)
lui a3,%hi(.LANCHOR0+16)
lui a4,%hi(.LC1)
vsetivli zero,4,e16,mf2,ta,ma
addi t4,t4,%lo(.LANCHOR0)
addi t3,t3,%lo(.LANCHOR0+8)
addi a3,a3,%lo(.LANCHOR0+16)
addi a4,a4,%lo(.LC1)
add a6,t1,a6
addi a5,a5,%lo(replacements)
vle16.v v18,0(t4)
vle16.v v17,0(t3)
vle16.v v16,0(a3)
vmsgeu.vi v25,v18,4
vadd.vi v24,v18,-4
vmsgeu.vi v23,v17,4
vadd.vi v22,v17,-4
vlm.v v21,0(a4)
vmsgeu.vi v20,v16,4
vadd.vi v19,v16,-4
vsetvli zero,zero,e64,m2,ta,mu
vmv.v.x v12,a0
vmv.v.x v14,a1
.L4:
vlseg3e64.v v6,(a5)
vmseq.vv v2,v6,v12
vmseq.vv v0,v8,v12
vmsne.vv v1,v8,v12
vmand.mm v1,v1,v2
vmerge.vvm v2,v8,v14,v0
vmv1r.v v0,v1
addi a4,a5,24
vmerge.vvm v6,v6,v14,v0
vmerge.vim v2,v2,0,v0
vrgatherei16.vv v4,v6,v18
vmv1r.v v0,v25
vrgatherei16.vv v4,v2,v24,v0.t
vs1r.v v4,0(a5)
addi a3,a5,48
vmv1r.v v0,v21
vmv2r.v v4,v2
vcompress.vm v4,v6,v0
vs1r.v v4,0(a4)
vmv1r.v v0,v23
addi a4,a5,72
vrgatherei16.vv v4,v6,v17
vrgatherei16.vv v4,v2,v22,v0.t
vs1r.v v4,0(a3)
vmv1r.v v0,v20
vrgatherei16.vv v4,v6,v16
addi a5,a5,96
vrgatherei16.vv v4,v2,v19,v0.t
vs1r.v v4,0(a4)
bne a6,a5,.L4
No spillings, no "sp" register used.
Tested on both RV32 and RV64, no regression.
Ok for trunk ?
PR target/113112
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Fix pointer type liveness count.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: New test.
---
gcc/config/riscv/riscv-vector-costs.cc | 12 ++++++--
.../vect/costmodel/riscv/rvv/pr113112-4.c | 28 +++++++++++++++++++
2 files changed, 37 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
Comments
On 12/28/23 18:21, Juzhe-Zhong wrote:
> This patch fixes the following choosing unexpected big LMUL which cause register spillings.
>
> Before this patch, choosing LMUL = 4:
>
> addi sp,sp,-160
> addiw t1,a2,-1
> li a5,7
> bleu t1,a5,.L16
> vsetivli zero,8,e64,m4,ta,ma
> vmv.v.x v4,a0
> vs4r.v v4,0(sp) ---> spill to the stack.
> vmv.v.x v4,a1
> addi a5,sp,64
> vs4r.v v4,0(a5) ---> spill to the stack.
>
> The root cause is the following codes:
>
> if (poly_int_tree_p (var)
> || (is_gimple_val (var)
> && !POINTER_TYPE_P (TREE_TYPE (var))))
>
> We count the variable as consuming a RVV reg group when it is not POINTER_TYPE.
>
> It is right for load/store STMT for example:
>
> _1 = (MEM)*addr --> addr won't be allocated an RVV vector group.
>
> However, we find it is not right for non-load/store STMT:
>
> _3 = _1 == x_8(D);
>
> _1 is pointer type too but we does allocate a RVV register group for it.
>
> So after this patch, we are choosing the perfect LMUL for the testcase in this patch:
>
> ble a2,zero,.L17
> addiw a7,a2,-1
> li a5,3
> bleu a7,a5,.L15
> srliw a5,a7,2
> slli a6,a5,1
> add a6,a6,a5
> lui a5,%hi(replacements)
> addi t1,a5,%lo(replacements)
> slli a6,a6,5
> lui t4,%hi(.LANCHOR0)
> lui t3,%hi(.LANCHOR0+8)
> lui a3,%hi(.LANCHOR0+16)
> lui a4,%hi(.LC1)
> vsetivli zero,4,e16,mf2,ta,ma
> addi t4,t4,%lo(.LANCHOR0)
> addi t3,t3,%lo(.LANCHOR0+8)
> addi a3,a3,%lo(.LANCHOR0+16)
> addi a4,a4,%lo(.LC1)
> add a6,t1,a6
> addi a5,a5,%lo(replacements)
> vle16.v v18,0(t4)
> vle16.v v17,0(t3)
> vle16.v v16,0(a3)
> vmsgeu.vi v25,v18,4
> vadd.vi v24,v18,-4
> vmsgeu.vi v23,v17,4
> vadd.vi v22,v17,-4
> vlm.v v21,0(a4)
> vmsgeu.vi v20,v16,4
> vadd.vi v19,v16,-4
> vsetvli zero,zero,e64,m2,ta,mu
> vmv.v.x v12,a0
> vmv.v.x v14,a1
> .L4:
> vlseg3e64.v v6,(a5)
> vmseq.vv v2,v6,v12
> vmseq.vv v0,v8,v12
> vmsne.vv v1,v8,v12
> vmand.mm v1,v1,v2
> vmerge.vvm v2,v8,v14,v0
> vmv1r.v v0,v1
> addi a4,a5,24
> vmerge.vvm v6,v6,v14,v0
> vmerge.vim v2,v2,0,v0
> vrgatherei16.vv v4,v6,v18
> vmv1r.v v0,v25
> vrgatherei16.vv v4,v2,v24,v0.t
> vs1r.v v4,0(a5)
> addi a3,a5,48
> vmv1r.v v0,v21
> vmv2r.v v4,v2
> vcompress.vm v4,v6,v0
> vs1r.v v4,0(a4)
> vmv1r.v v0,v23
> addi a4,a5,72
> vrgatherei16.vv v4,v6,v17
> vrgatherei16.vv v4,v2,v22,v0.t
> vs1r.v v4,0(a3)
> vmv1r.v v0,v20
> vrgatherei16.vv v4,v6,v16
> addi a5,a5,96
> vrgatherei16.vv v4,v2,v19,v0.t
> vs1r.v v4,0(a4)
> bne a6,a5,.L4
>
> No spillings, no "sp" register used.
>
> Tested on both RV32 and RV64, no regression.
>
> Ok for trunk ?
>
> PR target/113112
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Fix pointer type liveness count.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-vector-costs.cc | 12 ++++++--
> .../vect/costmodel/riscv/rvv/pr113112-4.c | 28 +++++++++++++++++++
> 2 files changed, 37 insertions(+), 3 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c
>
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index 0c485dc4f29..b41a79429d4 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -277,9 +277,12 @@ compute_local_live_ranges (
> {
> unsigned int point = program_point.point;
> gimple *stmt = program_point.stmt;
> + stmt_vec_info stmt_info = program_point.stmt_info;
> tree lhs = gimple_get_lhs (stmt);
> if (lhs != NULL_TREE && is_gimple_reg (lhs)
> - && !POINTER_TYPE_P (TREE_TYPE (lhs)))
> + && (!POINTER_TYPE_P (TREE_TYPE (lhs))
> + || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
> + != store_vec_info_type))
> {
> biggest_mode = get_biggest_mode (biggest_mode,
> TYPE_MODE (TREE_TYPE (lhs)));
> @@ -305,7 +308,10 @@ compute_local_live_ranges (
> the future. */
> if (poly_int_tree_p (var)
> || (is_gimple_val (var)
> - && !POINTER_TYPE_P (TREE_TYPE (var))))
> + && (!POINTER_TYPE_P (TREE_TYPE (var))
> + || STMT_VINFO_TYPE (
> + vect_stmt_to_vectorize (stmt_info))
> + != load_vec_info_type)))
> {
> biggest_mode
> = get_biggest_mode (biggest_mode,
Just a nit. Why not compute vect_stmt_to_vectorize (stmt_info) into a
local to improve the bad line break? Or perhaps even compute
STMT_VINFO_TYPE (...) into a local?
OK with or without a change for that nit.
jeff
@@ -277,9 +277,12 @@ compute_local_live_ranges (
{
unsigned int point = program_point.point;
gimple *stmt = program_point.stmt;
+ stmt_vec_info stmt_info = program_point.stmt_info;
tree lhs = gimple_get_lhs (stmt);
if (lhs != NULL_TREE && is_gimple_reg (lhs)
- && !POINTER_TYPE_P (TREE_TYPE (lhs)))
+ && (!POINTER_TYPE_P (TREE_TYPE (lhs))
+ || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
+ != store_vec_info_type))
{
biggest_mode = get_biggest_mode (biggest_mode,
TYPE_MODE (TREE_TYPE (lhs)));
@@ -305,7 +308,10 @@ compute_local_live_ranges (
the future. */
if (poly_int_tree_p (var)
|| (is_gimple_val (var)
- && !POINTER_TYPE_P (TREE_TYPE (var))))
+ && (!POINTER_TYPE_P (TREE_TYPE (var))
+ || STMT_VINFO_TYPE (
+ vect_stmt_to_vectorize (stmt_info))
+ != load_vec_info_type)))
{
biggest_mode
= get_biggest_mode (biggest_mode,
@@ -374,7 +380,7 @@ compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul)
unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant ();
gcc_assert (biggest_size >= mode_size);
unsigned int ratio = biggest_size / mode_size;
- return lmul / ratio;
+ return MAX (lmul / ratio, 1);
}
/* This function helps to determine whether current LMUL will cause
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -Ofast -ftree-vectorize --param riscv-autovec-lmul=dynamic --param riscv-autovec-preference=fixed-vlmax -fno-schedule-insns -fno-schedule-insns2" } */
+
+typedef struct rtx_def *rtx;
+struct replacement {
+ rtx *where;
+ rtx *subreg_loc;
+ int mode;
+};
+static struct replacement replacements[150];
+void move_replacements (rtx *x, rtx *y, int n_replacements)
+{
+ int i;
+ for (i = 0; i < n_replacements; i++)
+ if (replacements[i].subreg_loc == x)
+ replacements[i].subreg_loc = y;
+ else if (replacements[i].where == x)
+ {
+ replacements[i].where = y;
+ replacements[i].subreg_loc = 0;
+ }
+}
+
+/* { dg-final { scan-assembler {e64,m2} } } */
+/* { dg-final { scan-assembler-not {e64,m4} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler {ret} } } */
+/* { dg-final { scan-assembler-not {sp} } } */