RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
Checks
Commit Message
Consider the following case:
foo:
ble a0,zero,.L11
lui a2,%hi(.LANCHOR0)
addi sp,sp,-128
addi a2,a2,%lo(.LANCHOR0)
mv a1,a0
vsetvli a6,zero,e32,m8,ta,ma
vid.v v8
vs8r.v v8,0(sp) ---> spill
.L3:
vl8re32.v v16,0(sp) ---> reload
vsetvli a4,a1,e8,m2,ta,ma
li a3,0
vsetvli a5,zero,e32,m8,ta,ma
vmv8r.v v0,v16
vmv.v.x v8,a4
vmv.v.i v24,0
vadd.vv v8,v16,v8
vmv8r.v v16,v24
vs8r.v v8,0(sp) ---> spill
.L4:
addiw a3,a3,1
vadd.vv v8,v0,v16
vadd.vi v16,v16,1
vadd.vv v24,v24,v8
bne a0,a3,.L4
vsetvli zero,a4,e32,m8,ta,ma
sub a1,a1,a4
vse32.v v24,0(a2)
slli a4,a4,2
add a2,a2,a4
bne a1,zero,.L3
li a0,0
addi sp,sp,128
jr ra
.L11:
li a0,0
ret
The dynamic LMUL cost model unexpectedly picks LMUL = 8 here, which causes the spill and reload marked above.
The root cause is that we didn't include the PHI initial value in the dynamic LMUL calculation:
# j_17 = PHI <j_11(9), 0(5)> ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>
We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } as consuming a vector register, but a vector register group does get allocated for it.
This patch fixes the missing count. After this patch we pick the ideal LMUL (LMUL = M4):
foo:
ble a0,zero,.L9
lui a4,%hi(.LANCHOR0)
addi a4,a4,%lo(.LANCHOR0)
mv a2,a0
vsetivli zero,16,e32,m4,ta,ma
vid.v v20
.L3:
vsetvli a3,a2,e8,m1,ta,ma
li a5,0
vsetivli zero,16,e32,m4,ta,ma
vmv4r.v v16,v20
vmv.v.i v12,0
vmv.v.x v4,a3
vmv4r.v v8,v12
vadd.vv v20,v20,v4
.L4:
addiw a5,a5,1
vmv4r.v v4,v8
vadd.vi v8,v8,1
vadd.vv v4,v16,v4
vadd.vv v12,v12,v4
bne a0,a5,.L4
slli a5,a3,2
vsetvli zero,a3,e32,m4,ta,ma
sub a2,a2,a3
vse32.v v12,0(a4)
add a4,a4,a5
bne a2,zero,.L3
.L9:
li a0,0
ret
Tested with --with-arch=gcv, no regressions. OK for trunk?
PR target/113112
gcc/ChangeLog:
* config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
(preferred_new_lmul_p): Make PHI initial value into live regs calculation.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.
---
gcc/config/riscv/riscv-vector-costs.cc | 45 ++++++++++++++++---
.../vect/costmodel/riscv/rvv/pr113112-1.c | 31 +++++++++++++
2 files changed, 71 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c
Comments
On 12/22/23 02:51, Juzhe-Zhong wrote:
> Consider this following case:
>
> foo:
> ble a0,zero,.L11
> lui a2,%hi(.LANCHOR0)
> addi sp,sp,-128
> addi a2,a2,%lo(.LANCHOR0)
> mv a1,a0
> vsetvli a6,zero,e32,m8,ta,ma
> vid.v v8
> vs8r.v v8,0(sp) ---> spill
> .L3:
> vl8re32.v v16,0(sp) ---> reload
> vsetvli a4,a1,e8,m2,ta,ma
> li a3,0
> vsetvli a5,zero,e32,m8,ta,ma
> vmv8r.v v0,v16
> vmv.v.x v8,a4
> vmv.v.i v24,0
> vadd.vv v8,v16,v8
> vmv8r.v v16,v24
> vs8r.v v8,0(sp) ---> spill
> .L4:
> addiw a3,a3,1
> vadd.vv v8,v0,v16
> vadd.vi v16,v16,1
> vadd.vv v24,v24,v8
> bne a0,a3,.L4
> vsetvli zero,a4,e32,m8,ta,ma
> sub a1,a1,a4
> vse32.v v24,0(a2)
> slli a4,a4,2
> add a2,a2,a4
> bne a1,zero,.L3
> li a0,0
> addi sp,sp,128
> jr ra
> .L11:
> li a0,0
> ret
>
> Pick unexpected LMUL = 8.
>
> The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation:
>
> # j_17 = PHI <j_11(9), 0(5)> ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>
>
> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it.
Yup. There are analogues in the scalar space. Depending on the context
we might consider the value live on the edge, at the end of e->src or at
the start of e->dest.
In the scalar space we commonly have multiple constant values and we try
to account for them as best as we can as each distinct constant can
result in a constant load. We also try to find pseudos that happen to
already have the value we want so that they participate in the
coalescing process. I doubt either of these cases are particularly
important for vector though.
>
> This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4)
>
> foo:
> ble a0,zero,.L9
> lui a4,%hi(.LANCHOR0)
> addi a4,a4,%lo(.LANCHOR0)
> mv a2,a0
> vsetivli zero,16,e32,m4,ta,ma
> vid.v v20
> .L3:
> vsetvli a3,a2,e8,m1,ta,ma
> li a5,0
> vsetivli zero,16,e32,m4,ta,ma
> vmv4r.v v16,v20
> vmv.v.i v12,0
> vmv.v.x v4,a3
> vmv4r.v v8,v12
> vadd.vv v20,v20,v4
> .L4:
> addiw a5,a5,1
> vmv4r.v v4,v8
> vadd.vi v8,v8,1
> vadd.vv v4,v16,v4
> vadd.vv v12,v12,v4
> bne a0,a5,.L4
> slli a5,a3,2
> vsetvli zero,a3,e32,m4,ta,ma
> sub a2,a2,a3
> vse32.v v12,0(a4)
> add a4,a4,a5
> bne a2,zero,.L3
> .L9:
> li a0,0
> ret
>
> Tested on --with-arch=gcv no regression. Ok for trunk ?
>
> PR target/113112
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
> (preferred_new_lmul_p): Make PHI initial value into live regs calculation.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.
OK assuming you've done the necessary regression testing.
jeff
Committed. Thanks Jeff.
juzhe.zhong@rivai.ai
From: Jeff Law
Date: 2023-12-23 00:58
To: Juzhe-Zhong; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
On 12/22/23 02:51, Juzhe-Zhong wrote:
> Consider this following case:
>
> foo:
> ble a0,zero,.L11
> lui a2,%hi(.LANCHOR0)
> addi sp,sp,-128
> addi a2,a2,%lo(.LANCHOR0)
> mv a1,a0
> vsetvli a6,zero,e32,m8,ta,ma
> vid.v v8
> vs8r.v v8,0(sp) ---> spill
> .L3:
> vl8re32.v v16,0(sp) ---> reload
> vsetvli a4,a1,e8,m2,ta,ma
> li a3,0
> vsetvli a5,zero,e32,m8,ta,ma
> vmv8r.v v0,v16
> vmv.v.x v8,a4
> vmv.v.i v24,0
> vadd.vv v8,v16,v8
> vmv8r.v v16,v24
> vs8r.v v8,0(sp) ---> spill
> .L4:
> addiw a3,a3,1
> vadd.vv v8,v0,v16
> vadd.vi v16,v16,1
> vadd.vv v24,v24,v8
> bne a0,a3,.L4
> vsetvli zero,a4,e32,m8,ta,ma
> sub a1,a1,a4
> vse32.v v24,0(a2)
> slli a4,a4,2
> add a2,a2,a4
> bne a1,zero,.L3
> li a0,0
> addi sp,sp,128
> jr ra
> .L11:
> li a0,0
> ret
>
> Pick unexpected LMUL = 8.
>
> The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation:
>
> # j_17 = PHI <j_11(9), 0(5)> ---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)>
>
> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it.
Yup. There are analogues in the scalar space. Depending on the context
we might consider the value live on the edge, at the end of e->src or at
the start of e->dest.
In the scalar space we commonly have multiple constant values and we try
to account for them as best as we can as each distinct constant can
result in a constant load. We also try to find pseudos that happen to
already have the value we want so that they participate in the
coalescing process. I doubt either of these cases are particularly
important for vector though.
>
> This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4)
>
> foo:
> ble a0,zero,.L9
> lui a4,%hi(.LANCHOR0)
> addi a4,a4,%lo(.LANCHOR0)
> mv a2,a0
> vsetivli zero,16,e32,m4,ta,ma
> vid.v v20
> .L3:
> vsetvli a3,a2,e8,m1,ta,ma
> li a5,0
> vsetivli zero,16,e32,m4,ta,ma
> vmv4r.v v16,v20
> vmv.v.i v12,0
> vmv.v.x v4,a3
> vmv4r.v v8,v12
> vadd.vv v20,v20,v4
> .L4:
> addiw a5,a5,1
> vmv4r.v v4,v8
> vadd.vi v8,v8,1
> vadd.vv v4,v16,v4
> vadd.vv v12,v12,v4
> bne a0,a5,.L4
> slli a5,a3,2
> vsetvli zero,a3,e32,m4,ta,ma
> sub a2,a2,a3
> vse32.v v12,0(a4)
> add a4,a4,a5
> bne a2,zero,.L3
> .L9:
> li a0,0
> ret
>
> Tested on --with-arch=gcv no regression. Ok for trunk ?
>
> PR target/113112
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information.
> (preferred_new_lmul_p): Make PHI initial value into live regs calculation.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test.
OK assuming you've done the necessary regression testing.
jeff
@@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb,
}
if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "Maximum lmul = %d, %d number of live V_REG at program "
- "point %d for bb %d\n",
- lmul, max_nregs, live_point, bb->index);
+ dump_printf_loc (
+ MSG_NOTE, vect_location,
+ "Maximum lmul = %d, At most %d number of live V_REG at program "
+ "point %d for bb %d\n",
+ lmul, max_nregs, live_point, bb->index);
return max_nregs;
}
@@ -472,6 +473,41 @@ update_local_live_ranges (
tree def = gimple_phi_arg_def (phi, j);
auto *live_ranges = live_ranges_per_bb.get (bb);
auto *live_range = live_ranges->get (def);
+ if (poly_int_tree_p (def))
+ {
+ /* Insert live range of INTEGER_CST since we will need to
+ allocate a vector register for it.
+
+ E.g. # j_17 = PHI <j_11(9), 0(5)> will be transformed
+ into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)>
+
+ The live range for such value is short which only lives
+ at program point 0. */
+ if (live_range)
+ {
+ unsigned int start = (*live_range).first;
+ (*live_range).first = 0;
+ if (dump_enabled_p ())
+ dump_printf_loc (
+ MSG_NOTE, vect_location,
+ "Update %T start point from %d to 0:\n", def, start);
+ }
+ else
+ {
+ live_ranges->put (def, pair (0, 1));
+ auto &program_points = (*program_points_per_bb.get (bb));
+ if (program_points.is_empty ())
+ {
+ stmt_point info = {1, phi};
+ program_points.safe_push (info);
+ }
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Add %T start point from 0 to 1:\n",
+ def);
+ }
+ continue;
+ }
if (live_range && flow_bb_inside_loop_p (loop, e->src))
{
unsigned int start = (*live_range).first;
@@ -580,7 +616,6 @@ preferred_new_lmul_p (loop_vec_info other_loop_vinfo)
biggest_mode, lmul);
if (nregs > max_nregs)
max_nregs = nregs;
- live_ranges_per_bb.empty ();
}
live_ranges_per_bb.empty ();
return max_nregs > V_REG_NUM;
new file mode 100644
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic --param riscv-autovec-preference=fixed-vlmax -fdump-tree-vect-details" } */
+
+#define N 40
+
+int a[N];
+
+__attribute__ ((noinline)) int
+foo (int n){
+ int i,j;
+ int sum,x;
+
+ for (i = 0; i < n; i++) {
+ sum = 0;
+ for (j = 0; j < n; j++) {
+ sum += (i + j);
+ }
+ a[i] = sum;
+ }
+ return 0;
+}
+
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 1 } } */
+/* { dg-final { scan-tree-dump "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-tree-dump "Maximum lmul = 4" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 0 for bb 4" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 40 number of live V_REG at program point 0 for bb 3" "vect" } } */
+/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 0 for bb 5" "vect" } } */