RISC-V: Optimize VSETVL codegen of SELECT_VL with LEN_MASK_{LOAD, STORE}
Commit Message
This patch depends on the LEN_MASK_{LOAD,STORE} patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/622742.html
After enabling LEN_MASK_{LOAD,STORE}, I noticed a case where the VSETVL pass needs to be optimized:
void
f (int32_t *__restrict a,
int32_t *__restrict b,
int32_t *__restrict cond,
int n)
{
for (int i = 0; i < 8; i++)
if (cond[i])
a[i] = b[i];
}
Before this patch:
f:
vsetivli a5,8,e8,mf4,tu,mu --> Propagate "8" to the following vsetvl
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v0,0(a2)
vsetvli a6,zero,e32,m1,ta,ma
li a3,8
vmsne.vi v0,v0,0
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a1),v0.t
vse32.v v1,0(a0),v0.t
sub a4,a3,a5
beq a3,a5,.L6
slli a5,a5,2
add a2,a2,a5
add a1,a1,a5
add a0,a0,a5
vsetvli a5,a4,e8,mf4,tu,mu --> Propagate "a4" to the following vsetvl
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v0,0(a2)
vsetvli a6,zero,e32,m1,ta,ma
vmsne.vi v0,v0,0
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a1),v0.t
vse32.v v1,0(a0),v0.t
.L6:
ret
The current VSETVL pass only enables AVL propagation for the VLMAX AVL ("zero").
Now we also propagate immediate AVLs, and conservatively propagate non-VLMAX register AVLs. For example, the first vsetivli above sets a5 from the immediate AVL "8", so the dependent "vsetvli zero,a5,e32,m1,ta,ma" can be rewritten as "vsetivli zero,8,e32,m1,ta,ma", dropping the dependency on a5.
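To make the rule concrete, here is a minimal standalone sketch in C++ (not GCC internals; avl_state, its fields, and forward_avl_p are hypothetical names) of when an AVL is allowed to be forwarded:

#include <cstdio>

/* Model of the forwarding rule: an immediate AVL is always safe to
   propagate, VLMAX ("zero") was already handled before this patch, and a
   non-VLMAX register AVL is forwarded only under the conservative checks
   listed in the patch below.  */
struct avl_state
{
  bool is_imm;      /* AVL is an immediate such as "8".  */
  bool is_vlmax;    /* AVL is "zero" (ask for VLMAX).  */
  bool same_bb;     /* Defining vsetvl is in the same block as the RVV insn.  */
  bool single_def;  /* That vsetvl is the only def reaching the RVV insn.  */
  bool unmodified;  /* AVL register is not redefined between def and use.  */
  bool no_live_out; /* Resulting VL is not used outside the EBB.  */
};

static bool
forward_avl_p (const avl_state &a)
{
  if (a.is_imm || a.is_vlmax)
    return true;
  return a.same_bb && a.single_def && a.unmodified && a.no_live_out;
}

int
main ()
{
  /* First iteration above: AVL is the immediate 8, so
     "vsetvli zero,a5,e32,m1,ta,ma" becomes "vsetivli zero,8,e32,m1,ta,ma".  */
  avl_state imm8 = {true, false, false, false, false, false};
  printf ("%d\n", forward_avl_p (imm8)); /* prints 1 */
}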
After this patch:
f:
vsetivli a5,8,e8,mf4,ta,ma
vle32.v v0,0(a2)
vsetvli a6,zero,e32,m1,ta,ma
li a3,8
vmsne.vi v0,v0,0
vsetivli zero,8,e32,m1,ta,ma
vle32.v v1,0(a1),v0.t
vse32.v v1,0(a0),v0.t
sub a4,a3,a5
beq a3,a5,.L6
slli a5,a5,2
vsetvli a4,a4,e8,mf4,ta,ma
add a2,a2,a5
vle32.v v0,0(a2)
add a1,a1,a5
vsetvli a6,zero,e32,m1,ta,ma
add a0,a0,a5
vmsne.vi v0,v0,0
vsetvli zero,a4,e32,m1,ta,ma
vle32.v v1,0(a1),v0.t
vse32.v v1,0(a0),v0.t
.L6:
ret
gcc/ChangeLog:
* config/riscv/riscv-vsetvl.cc (vector_insn_info::parse_insn): Enhance AVL propagation.
* config/riscv/riscv-vsetvl.h: New function.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/partial/select_vl-1.c: Add dump checks.
* gcc.target/riscv/rvv/autovec/partial/select_vl-2.c: New test.
---
gcc/config/riscv/riscv-vsetvl.cc | 48 +++++++++++++++++--
gcc/config/riscv/riscv-vsetvl.h | 2 +
.../riscv/rvv/autovec/partial/select_vl-1.c | 5 +-
.../riscv/rvv/autovec/partial/select_vl-2.c | 25 ++++++++++
4 files changed, 76 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/select_vl-2.c
Comments
On 6/25/23 06:20, Juzhe-Zhong wrote:
> [patch quoted in full; trimmed]
OK
jeff
Committed, thanks Jeff.
Pan
-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Jeff Law via Gcc-patches
Sent: Sunday, June 25, 2023 8:57 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>; gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com; kito.cheng@sifive.com; palmer@dabbelt.com; palmer@rivosinc.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH] RISC-V: Optimize VSETVL codegen of SELECT_VL with LEN_MASK_{LOAD,STORE}
On 6/25/23 06:20, Juzhe-Zhong wrote:
> [patch quoted in full; trimmed]
OK
jeff
@@ -2003,9 +2003,51 @@ vector_insn_info::parse_insn (insn_info *insn)
new_info.parse_insn (def_insn);
if (!same_vlmax_p (new_info) && !scalar_move_insn_p (insn->rtl ()))
return;
- /* TODO: Currently, we don't forward AVL for non-VLMAX vsetvl. */
- if (vlmax_avl_p (new_info.get_avl ()))
- set_avl_info (avl_info (new_info.get_avl (), get_avl_source ()));
+
+ if (new_info.has_avl ())
+ {
+ if (new_info.has_avl_imm ())
+ set_avl_info (avl_info (new_info.get_avl (), nullptr));
+ else
+ {
+ if (vlmax_avl_p (new_info.get_avl ()))
+ set_avl_info (avl_info (new_info.get_avl (), get_avl_source ()));
+ else
+ {
+ /* Conservatively propagate non-VLMAX AVL of user vsetvl:
+ 1. The user vsetvl should be same block with the rvv insn.
+ 2. The user vsetvl is the only def insn of rvv insn.
+ 3. The AVL is not modified between def-use chain.
+ 4. The VL is only used by insn within EBB.
+ */
+ bool modified_p = false;
+ for (insn_info *i = def_insn->next_nondebug_insn ();
+ real_insn_and_same_bb_p (i, get_insn ()->bb ());
+ i = i->next_nondebug_insn ())
+ {
+ if (find_access (i->defs (), REGNO (new_info.get_avl ())))
+ {
+ modified_p = true;
+ break;
+ }
+ }
+
+ bool has_live_out_use = false;
+ for (use_info *use : m_avl.get_source ()->all_uses ())
+ {
+ if (use->is_live_out_use ())
+ {
+ has_live_out_use = true;
+ break;
+ }
+ }
+ if (!modified_p && !has_live_out_use
+ && def_insn == m_avl.get_source ()->insn ()
+ && m_insn->bb () == def_insn->bb ())
+ set_avl_info (new_info.get_avl_info ());
+ }
+ }
+ }
if (scalar_move_insn_p (insn->rtl ()) && m_avl.has_non_zero_avl ())
m_demands[DEMAND_NONZERO_AVL] = true;
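For illustration, a minimal standalone sketch of the scan behind condition 3 above (the insn struct, its field, and avl_unmodified_between are hypothetical names, not the GCC implementation):

#include <cstdio>
#include <vector>

struct insn
{
  int defined_reg; /* Register defined by this insn, or -1 for none.  */
};

/* Walk the insns between the defining vsetvl and the RVV insn; forwarding
   must be rejected if any of them redefines the AVL register.  */
static bool
avl_unmodified_between (const std::vector<insn> &between, int avl_reg)
{
  for (const insn &i : between)
    if (i.defined_reg == avl_reg)
      return false;
  return true;
}

int
main ()
{
  /* E.g. an "addi a4,a4,1" between "vsetvli a5,a4,..." and the use of a5
     clobbers the AVL register a4 (x14) and blocks the forwarding.  */
  std::vector<insn> between = {{14}};
  printf ("%d\n", avl_unmodified_between (between, 14)); /* prints 0 */
}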
@@ -180,6 +180,7 @@ public:
bool has_avl_reg () const { return get_value () && REG_P (get_value ()); }
bool has_avl_no_reg () const { return !get_value (); }
bool has_non_zero_avl () const;
+ bool has_avl () const { return get_value (); }
};
/* Basic structure to save VL/VTYPE information. */
@@ -219,6 +220,7 @@ public:
bool has_avl_reg () const { return m_avl.has_avl_reg (); }
bool has_avl_no_reg () const { return m_avl.has_avl_no_reg (); }
bool has_non_zero_avl () const { return m_avl.has_non_zero_avl (); };
+ bool has_avl () const { return m_avl.has_avl (); }
rtx get_avl () const { return m_avl.get_value (); }
const avl_info &get_avl_info () const { return m_avl; }
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns -fdump-tree-optimized-details" } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param riscv-autovec-preference=scalable -fno-vect-cost-model -fno-tree-loop-distribute-patterns -fdump-tree-optimized-details" } */
#include <stdint-gcc.h>
@@ -20,7 +20,10 @@
TEST_TYPE (uint32_t) \
TEST_TYPE (int64_t) \
TEST_TYPE (uint64_t) \
+ TEST_TYPE (_Float16) \
TEST_TYPE (float) \
TEST_TYPE (double)
TEST_ALL ()
+
+/* { dg-final { scan-tree-dump-times "\.SELECT_VL" 11 "optimized" } } */
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fno-schedule-insns --param riscv-autovec-lmul=m1 -O3 -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+/*
+** foo:
+** vsetivli\t[a-x0-9]+,\s*8,\s*e(8?|16?|32?|64),\s*m(1?|2?|4?|8?|f2?|f4?|f8),\s*t[au],\s*m[au]
+** vle32\.v\tv[0-9]+,0\([a-x0-9]+\)
+** ...
+** vsetvli\t[a-x0-9]+,\s*[a-x0-9]+,\s*e(8?|16?|32?|64),\s*m(1?|2?|4?|8?|f2?|f4?|f8),\s*t[au],\s*m[au]
+** add\t[a-x0-9]+,[a-x0-9]+,[a-x0-9]+
+** vle32\.v\tv[0-9]+,0\([a-x0-9]+\)
+** ...
+*/
+void
+foo (int32_t *__restrict a,
+ int32_t *__restrict b,
+ int32_t *__restrict cond)
+{
+ for (int i = 0; i < 8; i++)
+ if (cond[i])
+ a[i] = b[i];
+}