RISC-V: Allow dest operand and accumulator operand overlap of widen reduction instruction[PR112327]
Checks
Commit Message
Consider the following intrinsic code:
void rvv_dot_prod(int16_t *pSrcA, int16_t *pSrcB, uint32_t n, int64_t *result)
{
size_t vl;
vint16m4_t vSrcA, vSrcB;
vint64m1_t vSum = __riscv_vmv_s_x_i64m1(0, 1);
while (n > 0) {
vl = __riscv_vsetvl_e16m4(n);
vSrcA = __riscv_vle16_v_i16m4(pSrcA, vl);
vSrcB = __riscv_vle16_v_i16m4(pSrcB, vl);
vSum = __riscv_vwredsum_vs_i32m8_i64m1(__riscv_vwmul_vv_i32m8(vSrcA, vSrcB, vl), vSum, vl);
pSrcA += vl;
pSrcB += vl;
n -= vl;
}
*result = __riscv_vmv_x_s_i64m1_i64(vSum);
}
https://godbolt.org/z/vWd35W7G6
Before this patch:
...
Loop:
...
vmv1r.v v2,v1
...
vwredsum.vs v1,v8,v2
...
After this patch:
...
Loop:
...
vwredsum.vs v1,v8,v1
...
PR target/112327
gcc/ChangeLog:
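* config/riscv/vector.md: Add '0' matching constraint to the accumulator operand (operand 4) of the integer and floating-point widening reduction patterns.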
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/pr112327-1.c: New test.
* gcc.target/riscv/rvv/base/pr112327-2.c: New test.
---
gcc/config/riscv/vector.md | 4 +--
.../gcc.target/riscv/rvv/base/pr112327-1.c | 27 +++++++++++++++++++
.../gcc.target/riscv/rvv/base/pr112327-2.c | 27 +++++++++++++++++++
3 files changed, 56 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112327-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112327-2.c
Comments
On 11/1/23 00:56, Juzhe-Zhong wrote:
>
> Consider this following intrinsic code:
>
> void rvv_dot_prod(int16_t *pSrcA, int16_t *pSrcB, uint32_t n, int64_t *result)
> {
> size_t vl;
> vint16m4_t vSrcA, vSrcB;
> vint64m1_t vSum = __riscv_vmv_s_x_i64m1(0, 1);
> while (n > 0) {
> vl = __riscv_vsetvl_e16m4(n);
> vSrcA = __riscv_vle16_v_i16m4(pSrcA, vl);
> vSrcB = __riscv_vle16_v_i16m4(pSrcB, vl);
> vSum = __riscv_vwredsum_vs_i32m8_i64m1(__riscv_vwmul_vv_i32m8(vSrcA, vSrcB, vl), vSum, vl);
> pSrcA += vl;
> pSrcB += vl;
> n -= vl;
> }
> *result = __riscv_vmv_x_s_i64m1_i64(vSum);
> }
>
> https://godbolt.org/z/vWd35W7G6
>
> Before this patch:
>
> ...
> Loop:
> ...
> vmv1r.v v2,v1
> ...
> vwredsum.vs v1,v8,v2
> ...
>
> After this patch:
>
> ...
> Loop:
> ...
> vwredsum.vs v1,v8,v1
> ...
>
> PR target/112327
>
> gcc/ChangeLog:
>
> * config/riscv/vector.md: Add '0'.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/pr112327-1.c: New test.
> * gcc.target/riscv/rvv/base/pr112327-2.c: New test.
OK
jeff
Committed, thanks Jeff.
Pan
-----Original Message-----
From: Jeff Law <jeffreyalaw@gmail.com>
Sent: Thursday, November 2, 2023 3:02 AM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>; gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com; kito.cheng@sifive.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH] RISC-V: Allow dest operand and accumulator operand overlap of widen reduction instruction[PR112327]
On 11/1/23 00:56, Juzhe-Zhong wrote:
>
> Consider this following intrinsic code:
>
> void rvv_dot_prod(int16_t *pSrcA, int16_t *pSrcB, uint32_t n, int64_t *result)
> {
> size_t vl;
> vint16m4_t vSrcA, vSrcB;
> vint64m1_t vSum = __riscv_vmv_s_x_i64m1(0, 1);
> while (n > 0) {
> vl = __riscv_vsetvl_e16m4(n);
> vSrcA = __riscv_vle16_v_i16m4(pSrcA, vl);
> vSrcB = __riscv_vle16_v_i16m4(pSrcB, vl);
> vSum = __riscv_vwredsum_vs_i32m8_i64m1(__riscv_vwmul_vv_i32m8(vSrcA, vSrcB, vl), vSum, vl);
> pSrcA += vl;
> pSrcB += vl;
> n -= vl;
> }
> *result = __riscv_vmv_x_s_i64m1_i64(vSum);
> }
>
> https://godbolt.org/z/vWd35W7G6
>
> Before this patch:
>
> ...
> Loop:
> ...
> vmv1r.v v2,v1
> ...
> vwredsum.vs v1,v8,v2
> ...
>
> After this patch:
>
> ...
> Loop:
> ...
> vwredsum.vs v1,v8,v1
> ...
>
> PR target/112327
>
> gcc/ChangeLog:
>
> * config/riscv/vector.md: Add '0'.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/pr112327-1.c: New test.
> * gcc.target/riscv/rvv/base/pr112327-2.c: New test.
OK
jeff
@@ -7765,7 +7765,7 @@
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(unspec:<V_EXT_LMUL1> [
(match_operand:VI_QHS 3 "register_operand" " vr, vr")
- (match_operand:<V_EXT_LMUL1> 4 "register_operand" " vr, vr")
+ (match_operand:<V_EXT_LMUL1> 4 "register_operand" " vr0, vr0")
] ANY_WREDUC)
(match_operand:<V_EXT_LMUL1> 2 "vector_merge_operand" " vu, 0")] UNSPEC_REDUC))]
"TARGET_VECTOR"
@@ -7834,7 +7834,7 @@
(reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
(unspec:<V_EXT_LMUL1> [
(match_operand:VF_HS 3 "register_operand" " vr, vr")
- (match_operand:<V_EXT_LMUL1> 4 "register_operand" " vr, vr")
+ (match_operand:<V_EXT_LMUL1> 4 "register_operand" " vr0, vr0")
] ANY_FWREDUC_SUM)
(match_operand:<V_EXT_LMUL1> 2 "vector_merge_operand" " vu, 0")] UNSPEC_REDUC))]
"TARGET_VECTOR"
new file mode 100644
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void
+foo (int16_t *pSrcA, int16_t *pSrcB, uint32_t n, int64_t *result)
+{
+ size_t vl;
+ vint16m4_t vSrcA, vSrcB;
+ vint64m1_t vSum = __riscv_vmv_s_x_i64m1 (0, 1);
+ while (n > 0)
+ {
+ vl = __riscv_vsetvl_e16m4 (n);
+ vSrcA = __riscv_vle16_v_i16m4 (pSrcA, vl);
+ vSrcB = __riscv_vle16_v_i16m4 (pSrcB, vl);
+ vSum = __riscv_vwredsum_vs_i32m8_i64m1 (
+ __riscv_vwmul_vv_i32m8 (vSrcA, vSrcB, vl), vSum, vl);
+ pSrcA += vl;
+ pSrcB += vl;
+ n -= vl;
+ }
+ *result = __riscv_vmv_x_s_i64m1_i64 (vSum);
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv\.v\.v} } } */
new file mode 100644
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zfh -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+void
+foo (_Float16 *pSrcA, _Float16 *pSrcB, uint32_t n, double *result)
+{
+ size_t vl;
+ vfloat16m4_t vSrcA, vSrcB;
+ vfloat64m1_t vSum = __riscv_vfmv_s_f_f64m1 (0, 1);
+ while (n > 0)
+ {
+ vl = __riscv_vsetvl_e16m4 (n);
+ vSrcA = __riscv_vle16_v_f16m4 (pSrcA, vl);
+ vSrcB = __riscv_vle16_v_f16m4 (pSrcB, vl);
+ vSum = __riscv_vfwredusum_vs_f32m8_f64m1 (
+ __riscv_vfwmul_vv_f32m8 (vSrcA, vSrcB, vl), vSum, vl);
+ pSrcA += vl;
+ pSrcB += vl;
+ n -= vl;
+ }
+ *result = __riscv_vfmv_f_s_f64m1_f64 (vSum);
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv\.v\.v} } } */