From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
PR 108270
Fix issue: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108270.
Consider the following testcase:
void f (void * restrict in, void * restrict out, int l, int n, int m)
{
  for (int i = 0; i < l; i++){
    for (int j = 0; j < m; j++){
      for (int k = 0; k < n; k++)
        {
          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
          __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
        }
    }
  }
}
Compile option: -O3
Before this patch:
mv a7,a2
mv a6,a0
mv t1,a1
mv a2,a3
vsetivli zero,17,e8,mf8,ta,ma
...
After this patch:
mv a7,a2
mv a6,a0
mv t1,a1
mv a2,a3
ble a7,zero,.L1
ble a4,zero,.L1
ble a3,zero,.L1
add a1,a0,a4
li a0,0
vsetivli zero,17,e8,mf8,ta,ma
...
This issue is a missed optimization caused by Phase 3 (global backward demand fusion), not by LCM.
This patch fixes the poor placement of the vsetvl: the insertion point is selected not by LCM but by Phase 3 (VL/VTYPE demand info backward fusion and propagation), which I introduced into the VSETVL PASS to enhance LCM and improve vsetvl instruction performance.
The patch suppresses Phase 3's overly aggressive backward fusion and propagation to the top of the function when there is no instruction defining the AVL (the AVL here is a 0 ~ 31 immediate, since the vsetivli instruction accepts an immediate value instead of a register), so nothing is gained by hoisting the vsetvli above the loop guards.
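As a side note, the intent of the check can be modelled with a tiny standalone C++ sketch (my illustration only, using made-up names such as toy_block and skip_backward_fusion_p; the actual implementation is the all_empty_predecessor_p hunk in the diff below):

/* Toy model, not GCC code: skip Phase 3 backward fusion out of a block when
   none of its predecessors carries any VL/VTYPE demand and the AVL is a
   5-bit immediate usable directly by vsetivli (so no AVL-defining
   instruction would have to be hoisted along with the vsetvl).  */
#include <vector>

struct toy_block
{
  std::vector<int> preds;   /* indices of predecessor blocks */
  bool has_demand = false;  /* block carries valid/dirty VL/VTYPE info?  */
};

static bool
all_empty_predecessor_p (const std::vector<toy_block> &cfg, int bb)
{
  for (int pred : cfg[bb].preds)
    if (cfg[pred].has_demand)
      return false;
  return true;
}

static bool
skip_backward_fusion_p (const std::vector<toy_block> &cfg, int bb,
                        int avl, bool avl_is_imm)
{
  return avl_is_imm && avl >= 0 && avl <= 31
         && all_empty_predecessor_p (cfg, bb);
}

int
main ()
{
  /* bb 0 (no demand) -> bb 1: fusing bb 1's demand back into bb 0 is
     skipped when the AVL is the immediate 17, as in the testcase above.  */
  std::vector<toy_block> cfg (2);
  cfg[1].preds = {0};
  return skip_backward_fusion_p (cfg, 1, 17, true) ? 0 : 1;
}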
You may want to ask why we need Phase 3 to do this job at all.
Well, there are many situations that pure LCM fails to optimize; here is a simple case to demonstrate it:
void f (void * restrict in, void * restrict out, int n, int m, int cond)
{
  size_t vl = 101;
  for (size_t j = 0; j < m; j++){
    if (cond) {
      for (size_t i = 0; i < n; i++)
        {
          vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, vl);
          __riscv_vse8_v_i8mf8 (out + i, v, vl);
        }
    } else {
      for (size_t i = 0; i < n; i++)
        {
          vint32mf2_t v = __riscv_vle32_v_i32mf2 (in + i + j, vl);
          v = __riscv_vadd_vv_i32mf2 (v, v, vl);
          __riscv_vse32_v_i32mf2 (out + i, v, vl);
        }
    }
  }
}
You can see:
The first inner loop needs vsetvli e8 mf8 for vle+vse.
The second inner loop needs vsetvli e32 mf2 for vle+vadd+vse.
If we don't have Phase 3, i.e. everything is handled by LCM (Phase 4) alone, we end up with:
outerloop:
...
vsetvli e8mf8
inner loop 1:
....
vsetvli e32mf2
inner loop 2:
....
However, if we have Phase 3, it fuses the vsetvli e32 mf2 demand of inner loop 2 into the vsetvli e8 mf8 demand of inner loop 1, and after Phase 3 we end up with this result:
outerloop:
...
inner loop 1:
vsetvli e32mf2
....
inner loop 2:
vsetvli e32mf2
....
This fused demand information is then optimized well by Phase 4 (LCM); the result after Phase 4 is:
vsetvli e32mf2
outerloop:
...
inner loop 1:
....
inner loop 2:
....
You can see this is the optimal codegen from the current VSETVL PASS (Phase 3: demand backward fusion and propagation + Phase 4: LCM). This was a known issue when I started to implement the VSETVL PASS.
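A quick standalone check of the arithmetic behind that fusion (my illustration, not part of the patch): e8,mf8 and e32,mf2 share the same SEW/LMUL ratio of 64, so for the same AVL a single vsetvli e32,mf2 computes exactly the same VL as vsetvli e8,mf8, which is what lets both inner loops above run under the fused configuration.

/* Toy check, not GCC code: the SEW/LMUL ratio of e8,mf8 equals that of
   e32,mf2, so both configurations compute the same VL from the same AVL.  */
#include <cassert>

/* LMUL is passed as the fraction lmul_num/lmul_den.  */
static int
sew_lmul_ratio (int sew, int lmul_num, int lmul_den)
{
  return sew * lmul_den / lmul_num;
}

int
main ()
{
  assert (sew_lmul_ratio (8, 1, 8) == sew_lmul_ratio (32, 1, 2));  /* 64 == 64 */
  return 0;
}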
gcc/ChangeLog:
* config/riscv/riscv-vsetvl.cc (vector_infos_manager::all_empty_predecessor_p): New function.
(pass_vsetvl::backward_demand_fusion): Ditto.
* config/riscv/riscv-vsetvl.h: Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/imm_bb_prop-1.c: Adapt testcase.
* gcc.target/riscv/rvv/vsetvl/imm_conflict-3.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/pr108270.c: New test.
---
gcc/config/riscv/riscv-vsetvl.cc | 23 +++++++++++++++++++
gcc/config/riscv/riscv-vsetvl.h | 2 ++
.../riscv/rvv/vsetvl/imm_bb_prop-1.c | 2 +-
.../riscv/rvv/vsetvl/imm_conflict-3.c | 4 ++--
.../gcc.target/riscv/rvv/vsetvl/pr108270.c | 19 +++++++++++++++
5 files changed, 47 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr108270.c
@@ -2355,6 +2355,21 @@ vector_infos_manager::get_all_available_exprs (
return available_list;
}
+bool
+vector_infos_manager::all_empty_predecessor_p (const basic_block cfg_bb) const
+{
+ hash_set<basic_block> pred_cfg_bbs = get_all_predecessors (cfg_bb);
+ for (const basic_block pred_cfg_bb : pred_cfg_bbs)
+ {
+ const auto &pred_block_info = vector_block_infos[pred_cfg_bb->index];
+ if (!pred_block_info.local_dem.valid_or_dirty_p ()
+ && !pred_block_info.reaching_out.valid_or_dirty_p ())
+ continue;
+ return false;
+ }
+ return true;
+}
+
bool
vector_infos_manager::all_same_ratio_p (sbitmap bitdata) const
{
@@ -3138,6 +3153,14 @@ pass_vsetvl::backward_demand_fusion (void)
if (!backward_propagate_worthwhile_p (cfg_bb, curr_block_info))
continue;
+ /* Fix PR108270:
+
+ bb 0 -> bb 1
+ We don't need to backward fuse VL/VTYPE info from bb 1 to bb 0
+ if bb 1 is not inside a loop and all predecessors of bb 0 are empty. */
+ if (m_vector_manager->all_empty_predecessor_p (cfg_bb))
+ continue;
+
edge e;
edge_iterator ei;
/* Backward propagate to each predecessor. */
@@ -450,6 +450,8 @@ public:
/* Return true if all expression set in bitmap are same ratio. */
bool all_same_ratio_p (sbitmap) const;
+ bool all_empty_predecessor_p (const basic_block) const;
+
void release (void);
void create_bitmap_vectors (void);
void free_bitmap_vectors (void);
@@ -29,4 +29,4 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
}
}
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -20,7 +20,7 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond)
}
}
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*5,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-funroll-loops" no-opts "-g" } } } } */
-/* { dg-final { scan-assembler-times {vsetivli} 1 { target { no-opts "-O0" no-opts "-funroll-loops" no-opts "-g" } } } } */
+/* { dg-final { scan-assembler-times {vsetivli} 2 { target { no-opts "-O0" no-opts "-funroll-loops" no-opts "-g" } } } } */
/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-funroll-loops" no-opts "-g" } } } } */
new file mode 100644
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (void * restrict in, void * restrict out, int l, int n, int m)
+{
+ for (int i = 0; i < l; i++){
+ for (int j = 0; j < m; j++){
+ for (int k = 0; k < n; k++)
+ {
+ vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + j, 17);
+ __riscv_vse8_v_i8mf8 (out + i + j, v, 17);
+ }
+ }
+ }
+}
+
+/* { dg-final { scan-assembler-not {mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+mv\s+[a-x0-9]+,[a-x0-9]+\s+vsetivli} } } */