[V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction

Message ID 20231018102533.2643245-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai Oct. 18, 2023, 10:25 a.m. UTC
  Confirm dynamic LMUL algorithm works well for choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

But it generate horrible register spillings.

The root cause is that we didn't hoist the vmv.v.x outside the loop which
increase the SLP loop register pressure.

So, change the COSNT_VECTOR move into vec_duplicate splitter that we can gain better optimizations:

1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.

Before this patch:

f3:
        ble     a4,zero,.L8
        csrr    t0,vlenb
        slli    t1,t0,4
        csrr    a6,vlenb
        sub     sp,sp,t1
        csrr    a5,vlenb
        slli    a6,a6,3
        slli    a5,a5,2
        add     a6,a6,sp
        vsetvli a7,zero,e16,m8,ta,ma
        slli    a4,a4,3
        vid.v   v8
        addi    t6,a5,-1
        vand.vi v8,v8,-2
        neg     t5,a5
        vs8r.v  v8,0(sp)
        vadd.vi v8,v8,1
        vs8r.v  v8,0(a6)
        j       .L4
.L12:
        vsetvli a7,zero,e16,m8,ta,ma
.L4:
        csrr    t0,vlenb
        slli    t0,t0,3
        vl8re16.v       v16,0(sp)
        add     t0,t0,sp
        vmv.v.x v8,t6
        mv      t1,a4
        vand.vv v24,v16,v8
        mv      a6,a4
        vl8re16.v       v16,0(t0)
        vand.vv v8,v16,v8
        bleu    a4,a5,.L3
        mv      a6,a5
.L3:
        vsetvli zero,a6,e8,m4,ta,ma
        vle8.v  v20,0(a2)
        vle8.v  v16,0(a3)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v24
        vadd.vv v4,v16,v4
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a0)
        vle8.v  v20,0(a2)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v8
        vadd.vv v4,v4,v16
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a1)
        add     a4,a4,t5
        add     a0,a0,a5
        add     a3,a3,a5
        add     a1,a1,a5
        add     a2,a2,a5
        bgtu    t1,a5,.L12
        csrr    t0,vlenb
        slli    t1,t0,4
        add     sp,sp,t1
        jr      ra
.L8:
        ret

After this patch:

f3:
	ble	a4,zero,.L6
	csrr	a6,vlenb
	csrr	a5,vlenb
	slli	a6,a6,2
	slli	a5,a5,2
	addi	a6,a6,-1
	slli	a4,a4,3
	neg	t5,a5
	vsetvli	t1,zero,e16,m8,ta,ma
	vmv.v.x	v24,a6
	vid.v	v8
	vand.vi	v8,v8,-2
	vadd.vi	v16,v8,1
	vand.vv	v8,v8,v24
	vand.vv	v16,v16,v24
.L4:
	mv	t1,a4
	mv	a6,a4
	bleu	a4,a5,.L3
	mv	a6,a5
.L3:
	vsetvli	zero,a6,e8,m4,ta,ma
	vle8.v	v28,0(a2)
	vle8.v	v24,0(a3)
	vsetvli	a7,zero,e8,m4,ta,ma
	vrgatherei16.vv	v4,v28,v8
	vadd.vv	v4,v24,v4
	vsetvli	zero,a6,e8,m4,ta,ma
	vse8.v	v4,0(a0)
	vle8.v	v28,0(a2)
	vsetvli	a7,zero,e8,m4,ta,ma
	vrgatherei16.vv	v4,v28,v16
	vadd.vv	v4,v4,v24
	vsetvli	zero,a6,e8,m4,ta,ma
	vse8.v	v4,0(a1)
	add	a4,a4,t5
	add	a0,a0,a5
	add	a3,a3,a5
	add	a1,a1,a5
	add	a2,a2,a5
	bgtu	t1,a5,.L4
.L6:
	ret

Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c execution test

They failed are all because of bugs on VSETVL PASS:

10dd4:       0c707057                vsetvli zero,zero,e8,mf2,ta,ma
   10dd8:       5e06b8d7                vmv.v.i v17,13
   10ddc:       9ed030d7                vmv1r.v v1,v13
   10de0:       b21040d7                vncvt.x.x.w     v1,v1           ----> raise illegal instruction since we don't have SEW = 8 -> SEW = 4 narrowing.
   10de4:       5e0785d7                vmv.v.v v11,v15

Confirm the recent VSETVL refactor patch: https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633231.html fixed all of them.

So this patch should be committed after the VSETVL refactor patch.

	PR target/111848

gcc/ChangeLog:

	* config/riscv/riscv-selftests.cc (run_const_vector_selftests): Adapt selftest.
	* config/riscv/riscv-v.cc (expand_const_vector): Change it into vec_duplicate splitter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Adapt test.
	* gcc.dg/vect/costmodel/riscv/rvv/pr111848.c: New test.

---
 gcc/config/riscv/riscv-selftests.cc           | 14 ++++----
 gcc/config/riscv/riscv-v.cc                   | 27 ++++++++++++--
 .../costmodel/riscv/rvv/dynamic-lmul2-7.c     |  3 +-
 .../vect/costmodel/riscv/rvv/pr111848.c       | 35 +++++++++++++++++++
 4 files changed, 68 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
  

Comments

juzhe.zhong@rivai.ai Oct. 18, 2023, 10:51 a.m. UTC | #1
More details of VSETVL bug:

Loop:
   10ddc:       9ed030d7                vmv1r.v v1,v13
   10de0:       b21040d7                vncvt.x.x.w     v1,v1
   10de4:       5e0785d7                vmv.v.v v11,v15
   10de8:       b700a5d7                vmacc.vv        v11,v1,v16
   10dec:       a6e8a0d7                vmadd.vv        v1,v17,v14
   10df0:       26b7b5d7                vand.vi v11,v11,15
   10df4:       0c75f7d7                vsetvli a5,a1,e8,mf2,ta,ma
   10df8:       0c707557                vsetvli a0,zero,e8,mf2,ta,ma
   10dfc:       2617b0d7                vand.vi v1,v1,15
   10e00:       0c75f057                vsetvli zero,a1,e8,mf2,ta,ma
   10e04:       8d9d                    sub     a1,a1,a5
   10e06:       020705a7                vse8.v  v11,(a4)
   10e0a:       0c77f057                vsetvli zero,a5,e8,mf2,ta,ma
   10e0e:       020685a7                vse8.v  v11,(a3)
   10e12:       020600a7                vse8.v  v1,(a2)
   10e16:       973e                    add     a4,a4,a5
   10e18:       0c807557                vsetvli a0,zero,e16,m1,ta,ma
   10e1c:       96be                    add     a3,a3,a5
   10e1e:       963e                    add     a2,a2,a5
   10e20:       02d606d7                vadd.vv v13,v13,v12
   10e24:       fdc5                    bnez    a1,10ddc <main+0xca0>

The vncvt.x.x.w consume e16m1 VTYPE vsetvl but it shouldn't, it should be e8mf2.
This issue is fixed by recent refactor patch.


juzhe.zhong@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-10-18 18:25
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction
Confirm dynamic LMUL algorithm works well for choosing LMUL = 4 for the PR:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848
 
But it generate horrible register spillings.
 
The root cause is that we didn't hoist the vmv.v.x outside the loop which
increase the SLP loop register pressure.
 
So, change the COSNT_VECTOR move into vec_duplicate splitter that we can gain better optimizations:
 
1. better LICM.
2. More opportunities of transforming 'vv' into 'vx' in the future.
 
Before this patch:
 
f3:
        ble     a4,zero,.L8
        csrr    t0,vlenb
        slli    t1,t0,4
        csrr    a6,vlenb
        sub     sp,sp,t1
        csrr    a5,vlenb
        slli    a6,a6,3
        slli    a5,a5,2
        add     a6,a6,sp
        vsetvli a7,zero,e16,m8,ta,ma
        slli    a4,a4,3
        vid.v   v8
        addi    t6,a5,-1
        vand.vi v8,v8,-2
        neg     t5,a5
        vs8r.v  v8,0(sp)
        vadd.vi v8,v8,1
        vs8r.v  v8,0(a6)
        j       .L4
.L12:
        vsetvli a7,zero,e16,m8,ta,ma
.L4:
        csrr    t0,vlenb
        slli    t0,t0,3
        vl8re16.v       v16,0(sp)
        add     t0,t0,sp
        vmv.v.x v8,t6
        mv      t1,a4
        vand.vv v24,v16,v8
        mv      a6,a4
        vl8re16.v       v16,0(t0)
        vand.vv v8,v16,v8
        bleu    a4,a5,.L3
        mv      a6,a5
.L3:
        vsetvli zero,a6,e8,m4,ta,ma
        vle8.v  v20,0(a2)
        vle8.v  v16,0(a3)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v24
        vadd.vv v4,v16,v4
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a0)
        vle8.v  v20,0(a2)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v8
        vadd.vv v4,v4,v16
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a1)
        add     a4,a4,t5
        add     a0,a0,a5
        add     a3,a3,a5
        add     a1,a1,a5
        add     a2,a2,a5
        bgtu    t1,a5,.L12
        csrr    t0,vlenb
        slli    t1,t0,4
        add     sp,sp,t1
        jr      ra
.L8:
        ret
 
After this patch:
 
f3:
ble a4,zero,.L6
csrr a6,vlenb
csrr a5,vlenb
slli a6,a6,2
slli a5,a5,2
addi a6,a6,-1
slli a4,a4,3
neg t5,a5
vsetvli t1,zero,e16,m8,ta,ma
vmv.v.x v24,a6
vid.v v8
vand.vi v8,v8,-2
vadd.vi v16,v8,1
vand.vv v8,v8,v24
vand.vv v16,v16,v24
.L4:
mv t1,a4
mv a6,a4
bleu a4,a5,.L3
mv a6,a5
.L3:
vsetvli zero,a6,e8,m4,ta,ma
vle8.v v28,0(a2)
vle8.v v24,0(a3)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v8
vadd.vv v4,v24,v4
vsetvli zero,a6,e8,m4,ta,ma
vse8.v v4,0(a0)
vle8.v v28,0(a2)
vsetvli a7,zero,e8,m4,ta,ma
vrgatherei16.vv v4,v28,v16
vadd.vv v4,v4,v24
vsetvli zero,a6,e8,m4,ta,ma
vse8.v v4,0(a1)
add a4,a4,t5
add a0,a0,a5
add a3,a3,a5
add a1,a1,a5
add a2,a2,a5
bgtu t1,a5,.L4
.L6:
ret
 
Note that this patch triggers multiple FAILs:
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-3.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-4.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/cond/cond_arith_run-8.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_load_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-1.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c execution test
FAIL: gcc.target/riscv/rvv/autovec/gather-scatter/strided_store_run-2.c execution test
 
They failed are all because of bugs on VSETVL PASS:
 
10dd4:       0c707057                vsetvli zero,zero,e8,mf2,ta,ma
   10dd8:       5e06b8d7                vmv.v.i v17,13
   10ddc:       9ed030d7                vmv1r.v v1,v13
   10de0:       b21040d7                vncvt.x.x.w     v1,v1           ----> raise illegal instruction since we don't have SEW = 8 -> SEW = 4 narrowing.
   10de4:       5e0785d7                vmv.v.v v11,v15
 
Confirm the recent VSETVL refactor patch: https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633231.html fixed all of them.
 
So this patch should be committed after the VSETVL refactor patch.
 
PR target/111848
 
gcc/ChangeLog:
 
* config/riscv/riscv-selftests.cc (run_const_vector_selftests): Adapt selftest.
* config/riscv/riscv-v.cc (expand_const_vector): Change it into vec_duplicate splitter.
 
gcc/testsuite/ChangeLog:
 
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Adapt test.
* gcc.dg/vect/costmodel/riscv/rvv/pr111848.c: New test.
 
---
gcc/config/riscv/riscv-selftests.cc           | 14 ++++----
gcc/config/riscv/riscv-v.cc                   | 27 ++++++++++++--
.../costmodel/riscv/rvv/dynamic-lmul2-7.c     |  3 +-
.../vect/costmodel/riscv/rvv/pr111848.c       | 35 +++++++++++++++++++
4 files changed, 68 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
 
diff --git a/gcc/config/riscv/riscv-selftests.cc b/gcc/config/riscv/riscv-selftests.cc
index cdc863ee4f7..0ac17fb70a1 100644
--- a/gcc/config/riscv/riscv-selftests.cc
+++ b/gcc/config/riscv/riscv-selftests.cc
@@ -267,15 +267,14 @@ run_const_vector_selftests (void)
      rtx dup = gen_const_vec_duplicate (mode, GEN_INT (val));
      emit_move_insn (dest, dup);
      rtx_insn *insn = get_last_insn ();
-       rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+       rtx src = SET_SRC (PATTERN (insn));
      /* 1. Should be vmv.v.i for in rang of -16 ~ 15.
2. Should be vmv.v.x for exceed -16 ~ 15.  */
      if (IN_RANGE (val, -16, 15))
- ASSERT_TRUE (rtx_equal_p (src, dup));
-       else
ASSERT_TRUE (
-   rtx_equal_p (src,
-        gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+   rtx_equal_p (XEXP (SET_SRC (PATTERN (insn)), 1), dup));
+       else
+ ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
      end_sequence ();
    }
}
@@ -294,10 +293,9 @@ run_const_vector_selftests (void)
  rtx dup = gen_const_vec_duplicate (mode, ele);
  emit_move_insn (dest, dup);
  rtx_insn *insn = get_last_insn ();
-   rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+   rtx src = SET_SRC (PATTERN (insn));
  /* Should always be vfmv.v.f.  */
-   ASSERT_TRUE (
-     rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+   ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
  end_sequence ();
}
     }
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 895c11d13fc..6116f5df504 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1001,8 +1001,31 @@ expand_const_vector (rtx target, rtx src)
}
       else
{
-   rtx ops[] = {tmp, elt};
-   emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+   /* Emit vec_duplicate<mode> split pattern before RA so that
+      we could have a better optimization opportunity in LICM
+      which will hoist vmv.v.x outside the loop and in fwprop && combine
+      which will transform 'vv' into 'vx' instruction.
+
+      The reason we don't emit vec_duplicate<mode> split pattern during
+      RA since the split stage after RA is a too late stage to generate
+      RVV instruction which need an additional register (We can't
+      allocate a new register after RA) for VL operand of vsetvl
+      instruction (vsetvl a5, zero).  */
+   if (lra_in_progress)
+     {
+       rtx ops[] = {tmp, elt};
+       emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+     }
+   else
+     {
+       struct expand_operand ops[2];
+       enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+       gcc_assert (icode != CODE_FOR_nothing);
+       create_output_operand (&ops[0], tmp, mode);
+       create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
+       expand_insn (icode, 2, ops);
+       tmp = ops[0].value;
+     }
}
       if (tmp != target)
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
index 3dfc6f16a25..2a735d8c6b6 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -18,7 +18,8 @@ bar (int *x, int a, int b, int n)
}
/* { dg-final { scan-assembler {e32,m2} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 2 } } *
/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
new file mode 100644
index 00000000000..b203ca907fa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+void
+f3 (uint8_t *restrict a, uint8_t *restrict b,
+   uint8_t *restrict c, uint8_t *restrict d,
+   int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      a[i * 8] = c[i * 8] + d[i * 8];
+      a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+      a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+      a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+      a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+      a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+      a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+      a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+      b[i * 8] = c[i * 8 + 1] + d[i * 8];
+      b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+      b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+      b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+      b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+      b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+      b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+      b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+    }
+}
+
+/* { dg-final { scan-assembler {e8,m4} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 1 } } *
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
-- 
2.36.3
  
Robin Dapp Oct. 19, 2023, 8:43 a.m. UTC | #2
Hi Juzhe,

as discussed off-list this approach generally makes sense to me so
the patch LGTM once the vsetvl rework is upstream and settled.

Independently, we still need to understand why the more complex
broadcast pattern is not hoisted out of the loop.

Regards
 Robin
  
juzhe.zhong@rivai.ai Oct. 19, 2023, 8:53 a.m. UTC | #3
May be it is COST issue of RVV instruction ?

  /* TODO: We set RVV instruction cost as 1 by default.
     Cost Model need to be well analyzed and supported in the future. */
  if (riscv_v_ext_mode_p (mode))
    {
      *total = COSTS_N_INSNS (1);
      return true;
    }

Since all RVV instructions are considered as very cheap (COST = 1).


juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-10-19 16:43
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH V2] RISC-V: Fix failed hoist in LICM of vmv.v.x instruction
Hi Juzhe,
 
as discussed off-list this approach generally makes sense to me so
the patch LGTM once the vsetvl rework is upstream and settled.
 
Independently, we still need to understand why the more complex
broadcast pattern is not hoisted out of the loop.
 
Regards
Robin
  
Lehua Ding Oct. 20, 2023, 3:59 a.m. UTC | #4
Committed after the commited of the vsetvl pass refactor patch, thanks 
Robin.

On 2023/10/19 16:43, Robin Dapp wrote:
> Hi Juzhe,
> 
> as discussed off-list this approach generally makes sense to me so
> the patch LGTM once the vsetvl rework is upstream and settled.
> 
> Independently, we still need to understand why the more complex
> broadcast pattern is not hoisted out of the loop.
> 
> Regards
>   Robin
  

Patch

diff --git a/gcc/config/riscv/riscv-selftests.cc b/gcc/config/riscv/riscv-selftests.cc
index cdc863ee4f7..0ac17fb70a1 100644
--- a/gcc/config/riscv/riscv-selftests.cc
+++ b/gcc/config/riscv/riscv-selftests.cc
@@ -267,15 +267,14 @@  run_const_vector_selftests (void)
 	      rtx dup = gen_const_vec_duplicate (mode, GEN_INT (val));
 	      emit_move_insn (dest, dup);
 	      rtx_insn *insn = get_last_insn ();
-	      rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+	      rtx src = SET_SRC (PATTERN (insn));
 	      /* 1. Should be vmv.v.i for in rang of -16 ~ 15.
 		 2. Should be vmv.v.x for exceed -16 ~ 15.  */
 	      if (IN_RANGE (val, -16, 15))
-		ASSERT_TRUE (rtx_equal_p (src, dup));
-	      else
 		ASSERT_TRUE (
-		  rtx_equal_p (src,
-			       gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+		  rtx_equal_p (XEXP (SET_SRC (PATTERN (insn)), 1), dup));
+	      else
+		ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
 	      end_sequence ();
 	    }
 	}
@@ -294,10 +293,9 @@  run_const_vector_selftests (void)
 	  rtx dup = gen_const_vec_duplicate (mode, ele);
 	  emit_move_insn (dest, dup);
 	  rtx_insn *insn = get_last_insn ();
-	  rtx src = XEXP (SET_SRC (PATTERN (insn)), 1);
+	  rtx src = SET_SRC (PATTERN (insn));
 	  /* Should always be vfmv.v.f.  */
-	  ASSERT_TRUE (
-	    rtx_equal_p (src, gen_rtx_VEC_DUPLICATE (mode, XEXP (src, 0))));
+	  ASSERT_TRUE (GET_CODE (src) == VEC_DUPLICATE);
 	  end_sequence ();
 	}
     }
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 895c11d13fc..6116f5df504 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1001,8 +1001,31 @@  expand_const_vector (rtx target, rtx src)
 	}
       else
 	{
-	  rtx ops[] = {tmp, elt};
-	  emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+	  /* Emit vec_duplicate<mode> split pattern before RA so that
+	     we could have a better optimization opportunity in LICM
+	     which will hoist vmv.v.x outside the loop and in fwprop && combine
+	     which will transform 'vv' into 'vx' instruction.
+
+	     The reason we don't emit vec_duplicate<mode> split pattern during
+	     RA since the split stage after RA is a too late stage to generate
+	     RVV instruction which need an additional register (We can't
+	     allocate a new register after RA) for VL operand of vsetvl
+	     instruction (vsetvl a5, zero).  */
+	  if (lra_in_progress)
+	    {
+	      rtx ops[] = {tmp, elt};
+	      emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
+	    }
+	  else
+	    {
+	      struct expand_operand ops[2];
+	      enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+	      gcc_assert (icode != CODE_FOR_nothing);
+	      create_output_operand (&ops[0], tmp, mode);
+	      create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
+	      expand_insn (icode, 2, ops);
+	      tmp = ops[0].value;
+	    }
 	}
 
       if (tmp != target)
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
index 3dfc6f16a25..2a735d8c6b6 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -18,7 +18,8 @@  bar (int *x, int a, int b, int n)
 }
 
 /* { dg-final { scan-assembler {e32,m2} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 2 } } *
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
new file mode 100644
index 00000000000..b203ca907fa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111848.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+void
+f3 (uint8_t *restrict a, uint8_t *restrict b,
+   uint8_t *restrict c, uint8_t *restrict d,
+   int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      a[i * 8] = c[i * 8] + d[i * 8];
+      a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+      a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+      a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+      a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+      a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+      a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+      a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+      b[i * 8] = c[i * 8 + 1] + d[i * 8];
+      b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+      b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+      b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+      b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+      b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+      b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+      b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+    }
+}
+
+/* { dg-final { scan-assembler {e8,m4} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-times {ret} 1 } } *
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */