[V2] RISC-V: Fix PR109615
Checks
Commit Message
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
This patch is to fix following case:
void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
{
size_t vl = 101;
if (cond)
vl = m * 2;
else
vl = m * 2 * vl;
for (size_t i = 0; i < n; i++)
{
vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
__riscv_vse8_v_i8mf8 (out + i, v, vl);
vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
__riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
}
for (size_t i = 0; i < n; i++)
{
vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
__riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
}
}
The value of "vl" is coming from different blocks so it will be wrapped as a PHI node of each
block.
In the first loop, the "vl" source is a PHI node from bb 4.
In the second loop, the "vl" source is a PHI node from bb 5.
since bb 5 is dominated by bb 4, the PHI input of "vl" in the second loop is the PHI node of "vl"
in bb 4.
So when 2 "vl" PHI node are both degenerate PHI node (the phi->num_inputs () == 1) and their only
input are same, it's safe for us to consider they are compatible.
This patch is only optimize degenerate PHI since it's safe and simple optimization.
non-dengerate PHI are considered as incompatible unless the PHI are the same in RTL_SSA.
TODO: non-generate PHI is complicated, we can support it when it is necessary in the future.
Before this patch:
...
.L2:
addi a4,a1,100
add t1,a0,a2
mv t0,a0
beq a2,zero,.L1
vsetvli zero,a3,e8,mf8,tu,mu
.L4:
addi a6,t0,100
addi a7,a4,-100
vle8.v v1,0(t0)
addi t0,t0,1
vse8.v v1,0(a7)
vlm.v v0,0(a6)
vle8.v v1,0(a6),v0.t
vse8.v v1,0(a4)
addi a4,a4,1
bne t0,t1,.L4
addi a0,a0,300
addi a1,a1,300
add a2,a0,a2
vsetvli zero,a3,e8,mf8,ta,ma
.L5:
vle8.v v2,0(a0)
addi a0,a0,1
vse8.v v2,0(a1)
addi a1,a1,1
bne a2,a0,.L5
.L1:
ret
After this patch:
...
.L2:
addi a4,a1,100
add t1,a0,a2
mv t0,a0
beq a2,zero,.L1
vsetvli zero,a3,e8,mf8,tu,mu
.L4:
addi a6,t0,100
addi a7,a4,-100
vle8.v v1,0(t0)
addi t0,t0,1
vse8.v v1,0(a7)
vlm.v v0,0(a6)
vle8.v v1,0(a6),v0.t
vse8.v v1,0(a4)
addi a4,a4,1
bne t0,t1,.L4
addi a0,a0,300
addi a1,a1,300
add a2,a0,a2
.L5:
vle8.v v2,0(a0)
addi a0,a0,1
vse8.v v2,0(a1)
addi a1,a1,1
bne a2,a0,.L5
.L1:
ret
PR target/109615
gcc/ChangeLog:
* config/riscv/riscv-vsetvl.cc (avl_info::multiple_source_equal_p): Add denegrate PHI optmization.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/avl_single-74.c: Adapt testcase.
* gcc.target/riscv/rvv/vsetvl/vsetvl-11.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/pr109615.c: New test.
---
gcc/config/riscv/riscv-vsetvl.cc | 81 +++++--------------
.../riscv/rvv/vsetvl/avl_single-74.c | 4 +-
.../gcc.target/riscv/rvv/vsetvl/pr109615.c | 33 ++++++++
.../gcc.target/riscv/rvv/vsetvl/vsetvl-11.c | 2 +-
4 files changed, 54 insertions(+), 66 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
Comments
LGTM, committed to trunk with few changelog adjustments and few extra comments.
On Fri, May 5, 2023 at 2:33 PM <juzhe.zhong@rivai.ai> wrote:
>
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch is to fix following case:
> void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
> {
> size_t vl = 101;
> if (cond)
> vl = m * 2;
> else
> vl = m * 2 * vl;
>
> for (size_t i = 0; i < n; i++)
> {
> vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> __riscv_vse8_v_i8mf8 (out + i, v, vl);
>
> vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
>
> vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
> __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> }
>
> for (size_t i = 0; i < n; i++)
> {
> vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
> __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
> }
> }
>
> The value of "vl" is coming from different blocks so it will be wrapped as a PHI node of each
> block.
>
> In the first loop, the "vl" source is a PHI node from bb 4.
> In the second loop, the "vl" source is a PHI node from bb 5.
> since bb 5 is dominated by bb 4, the PHI input of "vl" in the second loop is the PHI node of "vl"
> in bb 4.
> So when 2 "vl" PHI node are both degenerate PHI node (the phi->num_inputs () == 1) and their only
> input are same, it's safe for us to consider they are compatible.
>
> This patch is only optimize degenerate PHI since it's safe and simple optimization.
>
> non-dengerate PHI are considered as incompatible unless the PHI are the same in RTL_SSA.
> TODO: non-generate PHI is complicated, we can support it when it is necessary in the future.
>
> Before this patch:
>
> ...
> .L2:
> addi a4,a1,100
> add t1,a0,a2
> mv t0,a0
> beq a2,zero,.L1
> vsetvli zero,a3,e8,mf8,tu,mu
> .L4:
> addi a6,t0,100
> addi a7,a4,-100
> vle8.v v1,0(t0)
> addi t0,t0,1
> vse8.v v1,0(a7)
> vlm.v v0,0(a6)
> vle8.v v1,0(a6),v0.t
> vse8.v v1,0(a4)
> addi a4,a4,1
> bne t0,t1,.L4
> addi a0,a0,300
> addi a1,a1,300
> add a2,a0,a2
> vsetvli zero,a3,e8,mf8,ta,ma
> .L5:
> vle8.v v2,0(a0)
> addi a0,a0,1
> vse8.v v2,0(a1)
> addi a1,a1,1
> bne a2,a0,.L5
> .L1:
> ret
>
> After this patch:
>
> ...
> .L2:
> addi a4,a1,100
> add t1,a0,a2
> mv t0,a0
> beq a2,zero,.L1
> vsetvli zero,a3,e8,mf8,tu,mu
> .L4:
> addi a6,t0,100
> addi a7,a4,-100
> vle8.v v1,0(t0)
> addi t0,t0,1
> vse8.v v1,0(a7)
> vlm.v v0,0(a6)
> vle8.v v1,0(a6),v0.t
> vse8.v v1,0(a4)
> addi a4,a4,1
> bne t0,t1,.L4
> addi a0,a0,300
> addi a1,a1,300
> add a2,a0,a2
> .L5:
> vle8.v v2,0(a0)
> addi a0,a0,1
> vse8.v v2,0(a1)
> addi a1,a1,1
> bne a2,a0,.L5
> .L1:
> ret
>
> PR target/109615
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-vsetvl.cc (avl_info::multiple_source_equal_p): Add denegrate PHI optmization.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/avl_single-74.c: Adapt testcase.
> * gcc.target/riscv/rvv/vsetvl/vsetvl-11.c: Ditto.
> * gcc.target/riscv/rvv/vsetvl/pr109615.c: New test.
>
> ---
> gcc/config/riscv/riscv-vsetvl.cc | 81 +++++--------------
> .../riscv/rvv/vsetvl/avl_single-74.c | 4 +-
> .../gcc.target/riscv/rvv/vsetvl/pr109615.c | 33 ++++++++
> .../gcc.target/riscv/rvv/vsetvl/vsetvl-11.c | 2 +-
> 4 files changed, 54 insertions(+), 66 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
>
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
> index 609f86d8704..39b4d21210b 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -1676,72 +1676,27 @@ avl_info::single_source_equal_p (const avl_info &other) const
> bool
> avl_info::multiple_source_equal_p (const avl_info &other) const
> {
> - /* TODO: We don't do too much optimization here since it's
> - too complicated in case of analyzing the PHI node.
> + /* When the def info is same in RTL_SSA namespace, it's safe
> + to consider they are avl compatible. */
> + if (m_source == other.get_source ())
> + return true;
>
> - For example:
> - void f (void * restrict in, void * restrict out, int n, int m, int cond)
> - {
> - size_t vl;
> - switch (cond)
> - {
> - case 1:
> - vl = 100;
> - break;
> - case 2:
> - vl = *(size_t*)(in + 100);
> - break;
> - case 3:
> - {
> - size_t new_vl = *(size_t*)(in + 500);
> - size_t new_vl2 = *(size_t*)(in + 600);
> - vl = new_vl + new_vl2 + 777;
> - break;
> - }
> - default:
> - vl = 4000;
> - break;
> - }
> - for (size_t i = 0; i < n; i++)
> - {
> - vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> - __riscv_vse8_v_i8mf8 (out + i, v, vl);
> + /* We only consider handle PHI node. */
> + if (!m_source->insn ()->is_phi () || !other.get_source ()->insn ()->is_phi ())
> + return false;
>
> - vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100, vl);
> - __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> - }
> + phi_info *phi1 = as_a<phi_info *> (m_source);
> + phi_info *phi2 = as_a<phi_info *> (other.get_source ());
>
> - size_t vl2;
> - switch (cond)
> - {
> - case 1:
> - vl2 = 100;
> - break;
> - case 2:
> - vl2 = *(size_t*)(in + 100);
> - break;
> - case 3:
> - {
> - size_t new_vl = *(size_t*)(in + 500);
> - size_t new_vl2 = *(size_t*)(in + 600);
> - vl2 = new_vl + new_vl2 + 777;
> - break;
> - }
> - default:
> - vl2 = 4000;
> - break;
> - }
> - for (size_t i = 0; i < m; i++)
> - {
> - vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
> - __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
> - vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200, vl2);
> - __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
> - }
> - }
> - Such case may not be necessary to optimize since the codes of defining
> - vl and vl2 are redundant. */
> - return m_source == other.get_source ();
> + if (phi1->is_degenerate () && phi2->is_degenerate ())
> + {
> + /* Case 1: If both PHI nodes have the same single input in use list.
> + We consider they are AVL compatible. */
> + if (phi1->input_value (0) == phi2->input_value (0))
> + return true;
> + }
> + /* TODO: We can support more optimization cases in the future. */
> + return false;
> }
>
> avl_info &
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> index ff540ec792d..cc4f88be888 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_single-74.c
> @@ -23,5 +23,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond, size_t vl,
> }
> }
>
> -/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> -/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
> new file mode 100644
> index 00000000000..90b0bb79937
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr109615.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
> +
> +#include "riscv_vector.h"
> +
> +void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
> +{
> + size_t vl = 101;
> + if (cond)
> + vl = m * 2;
> + else
> + vl = m * 2 * vl;
> +
> + for (size_t i = 0; i < n; i++)
> + {
> + vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
> + __riscv_vse8_v_i8mf8 (out + i, v, vl);
> +
> + vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
> +
> + vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
> + __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
> + }
> +
> + for (size_t i = 0; i < n; i++)
> + {
> + vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
> + __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
> + }
> +}
> +
> +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*mu} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> index fa825f031f9..3ef0fdcb66d 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl-11.c
> @@ -18,4 +18,4 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
> }
> }
>
> -/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
> --
> 2.36.3
>
@@ -1676,72 +1676,27 @@ avl_info::single_source_equal_p (const avl_info &other) const
bool
avl_info::multiple_source_equal_p (const avl_info &other) const
{
- /* TODO: We don't do too much optimization here since it's
- too complicated in case of analyzing the PHI node.
+ /* When the def info is same in RTL_SSA namespace, it's safe
+ to consider they are avl compatible. */
+ if (m_source == other.get_source ())
+ return true;
- For example:
- void f (void * restrict in, void * restrict out, int n, int m, int cond)
- {
- size_t vl;
- switch (cond)
- {
- case 1:
- vl = 100;
- break;
- case 2:
- vl = *(size_t*)(in + 100);
- break;
- case 3:
- {
- size_t new_vl = *(size_t*)(in + 500);
- size_t new_vl2 = *(size_t*)(in + 600);
- vl = new_vl + new_vl2 + 777;
- break;
- }
- default:
- vl = 4000;
- break;
- }
- for (size_t i = 0; i < n; i++)
- {
- vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
- __riscv_vse8_v_i8mf8 (out + i, v, vl);
+ /* We only consider handle PHI node. */
+ if (!m_source->insn ()->is_phi () || !other.get_source ()->insn ()->is_phi ())
+ return false;
- vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 100, vl);
- __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
- }
+ phi_info *phi1 = as_a<phi_info *> (m_source);
+ phi_info *phi2 = as_a<phi_info *> (other.get_source ());
- size_t vl2;
- switch (cond)
- {
- case 1:
- vl2 = 100;
- break;
- case 2:
- vl2 = *(size_t*)(in + 100);
- break;
- case 3:
- {
- size_t new_vl = *(size_t*)(in + 500);
- size_t new_vl2 = *(size_t*)(in + 600);
- vl2 = new_vl + new_vl2 + 777;
- break;
- }
- default:
- vl2 = 4000;
- break;
- }
- for (size_t i = 0; i < m; i++)
- {
- vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl2);
- __riscv_vse8_v_i8mf8 (out + i + 300, v, vl2);
- vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tu (v, in + i + 200, vl2);
- __riscv_vse8_v_i8mf8 (out + i + 200, v2, vl2);
- }
- }
- Such case may not be necessary to optimize since the codes of defining
- vl and vl2 are redundant. */
- return m_source == other.get_source ();
+ if (phi1->is_degenerate () && phi2->is_degenerate ())
+ {
+ /* Case 1: If both PHI nodes have the same single input in use list.
+ We consider they are AVL compatible. */
+ if (phi1->input_value (0) == phi2->input_value (0))
+ return true;
+ }
+ /* TODO: We can support more optimization cases in the future. */
+ return false;
}
avl_info &
@@ -23,5 +23,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, int cond, size_t vl,
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-tree-vectorize -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void f (int8_t * restrict in, int8_t * restrict out, int n, int m, int cond)
+{
+ size_t vl = 101;
+ if (cond)
+ vl = m * 2;
+ else
+ vl = m * 2 * vl;
+
+ for (size_t i = 0; i < n; i++)
+ {
+ vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
+ __riscv_vse8_v_i8mf8 (out + i, v, vl);
+
+ vbool64_t mask = __riscv_vlm_v_b64 (in + i + 100, vl);
+
+ vint8mf8_t v2 = __riscv_vle8_v_i8mf8_tumu (mask, v, in + i + 100, vl);
+ __riscv_vse8_v_i8mf8 (out + i + 100, v2, vl);
+ }
+
+ for (size_t i = 0; i < n; i++)
+ {
+ vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i + 300, vl);
+ __riscv_vse8_v_i8mf8 (out + i + 300, v, vl);
+ }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*tu,\s*mu} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,4 +18,4 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
}
-/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */