[V3] MATCH: Optimize COND_LEN_ADD reduction pattern
Checks
Commit Message
This patch leverages this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
to optimize the COND_LEN_ADD reduction pattern.
It folds VEC_COND_EXPR + COND_LEN_ADD into a single COND_LEN_ADD.
Consider the following case:
#include <stdint.h>
void
pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
{
uint64_t result = 0;
for (int i = 0; i < loop_size; i++)
{
if (b[i] <= a[i])
{
result += a[i];
}
}
a[0] = result;
}
Before this patch:
vsetvli a7,zero,e64,m1,ta,ma
vmv.v.i v2,0
vmv1r.v v3,v2 --- redundant
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v1,0(a3)
vle64.v v0,0(a1)
slli a6,a5,3
vsetvli a7,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv v0,v0,v1
add a1,a1,a6
vmerge.vvm v1,v3,v1,v0 ---- redundant.
add a3,a3,a6
vsetvli zero,a5,e64,m1,tu,ma
vadd.vv v2,v2,v1
bne a2,zero,.L3
li a5,0
vsetvli a4,zero,e64,m1,ta,ma
vmv.s.x v1,a5
vredsum.vs v2,v2,v1
vmv.x.s a5,v2
sd a5,0(a0)
ret
After this patch:
vsetvli a6,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a2,e64,m1,ta,ma
vle64.v v2,0(a4)
vle64.v v0,0(a1)
slli a3,a5,3
vsetvli a6,zero,e64,m1,ta,ma
sub a2,a2,a5
vmsleu.vv v0,v0,v2
add a1,a1,a3
vsetvli zero,a5,e64,m1,tu,mu
add a4,a4,a3
vadd.vv v1,v1,v2,v0.t
bne a2,zero,.L3
li a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs v1,v1,v2
vmv.x.s a5,v1
sd a5,0(a0)
ret
Bootstrap and regression testing are running.
OK for trunk when testing passes?
PR tree-optimization/111594
PR tree-optimization/110660
gcc/ChangeLog:
* match.pd: Optimize COND_LEN_ADD reduction.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
---
gcc/match.pd | 15 ++++++++++
.../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++++++++++++++++++
.../riscv/rvv/autovec/cond/pr111594.c | 22 ++++++++++++++
3 files changed, 66 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
Comments
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:
> This patch leverage this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include <stdint.h>
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
> uint64_t result = 0;
>
> for (int i = 0; i < loop_size; i++)
> {
> if (b[i] <= a[i])
> {
> result += a[i];
> }
> }
>
> a[0] = result;
> }
>
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2 --- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> slli a6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv v0,v0,v1
> add a1,a1,a6
> vmerge.vvm v1,v3,v1,v0 ---- redundant.
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs v2,v2,v1
> vmv.x.s a5,v2
> sd a5,0(a0)
> ret
>
> After this patch:
>
> vsetvli a6,zero,e64,m1,ta,ma
> vmv.v.i v1,0
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v2,0(a4)
> vle64.v v0,0(a1)
> slli a3,a5,3
> vsetvli a6,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv v0,v0,v2
> add a1,a1,a3
> vsetvli zero,a5,e64,m1,tu,mu
> add a4,a4,a3
> vadd.vv v1,v1,v2,v0.t
> bne a2,zero,.L3
> li a5,0
> vsetivli zero,1,e64,m1,ta,ma
> vmv.s.x v2,a5
> vsetvli a5,zero,e64,m1,ta,ma
> vredsum.vs v1,v1,v2
> vmv.x.s a5,v1
> sd a5,0(a0)
> ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
OK
> PR tree-optimization/111594
> PR tree-optimization/110660
>
> gcc/ChangeLog:
>
> * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
> gcc/match.pd | 15 ++++++++++
> .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++++++++++++++++++
> .../riscv/rvv/autovec/cond/pr111594.c | 22 ++++++++++++++
> 3 files changed, 66 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..3ce90c3333b 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,21 @@ and,
> (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> + a = mask ? b : 0
> + c = i < len + bias ? d + a : d
> +
> + is turned into
> +
> + c = mask && i < len + bias ? d + b : d. */
> +(simplify
> + (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
> + (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
> + (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
> /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 00000000000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define COND_REDUCTION(TYPE) \
> + TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size) \
> + { \
> + TYPE result = 0; \
> + for (int i = 0; i < loop_size; i++) \
> + if (b[i] <= a[i]) \
> + result += a[i]; \
> + return result; \
> + }
> +
> +COND_REDUCTION (int8_t)
> +COND_REDUCTION (int16_t)
> +COND_REDUCTION (int32_t)
> +COND_REDUCTION (int64_t)
> +COND_REDUCTION (uint8_t)
> +COND_REDUCTION (uint16_t)
> +COND_REDUCTION (uint32_t)
> +COND_REDUCTION (uint64_t)
> +COND_REDUCTION (_Float16)
> +COND_REDUCTION (float)
> +COND_REDUCTION (double)
> +
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> new file mode 100644
> index 00000000000..6d81b26fbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
> +
> +#include <stdint-gcc.h>
> +
> +void
> +pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> +{
> + uint64_t result = 0;
> +
> + for (int i = 0; i < loop_size; i++)
> + {
> + if (b[i] <= a[i])
> + {
> + result += a[i];
> + }
> + }
> +
> + a[0] = result;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmerge} } } */
>
Committed, as it passed x86 bootstrap and regression tests. Thanks, Richard.
Pan
-----Original Message-----
From: Richard Biener <rguenther@suse.de>
Sent: Tuesday, September 26, 2023 7:35 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: gcc-patches@gcc.gnu.org; richard.sandiford@arm.com
Subject: Re: [PATCH V3] MATCH: Optimize COND_ADD_LEN reduction pattern
On Tue, 26 Sep 2023, Juzhe-Zhong wrote:
> This patch leverage this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include <stdint.h>
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
> uint64_t result = 0;
>
> for (int i = 0; i < loop_size; i++)
> {
> if (b[i] <= a[i])
> {
> result += a[i];
> }
> }
>
> a[0] = result;
> }
>
> Before this patch:
> vsetvli a7,zero,e64,m1,ta,ma
> vmv.v.i v2,0
> vmv1r.v v3,v2 --- redundant
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v1,0(a3)
> vle64.v v0,0(a1)
> slli a6,a5,3
> vsetvli a7,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv v0,v0,v1
> add a1,a1,a6
> vmerge.vvm v1,v3,v1,v0 ---- redundant.
> add a3,a3,a6
> vsetvli zero,a5,e64,m1,tu,ma
> vadd.vv v2,v2,v1
> bne a2,zero,.L3
> li a5,0
> vsetvli a4,zero,e64,m1,ta,ma
> vmv.s.x v1,a5
> vredsum.vs v2,v2,v1
> vmv.x.s a5,v2
> sd a5,0(a0)
> ret
>
> After this patch:
>
> vsetvli a6,zero,e64,m1,ta,ma
> vmv.v.i v1,0
> .L3:
> vsetvli a5,a2,e64,m1,ta,ma
> vle64.v v2,0(a4)
> vle64.v v0,0(a1)
> slli a3,a5,3
> vsetvli a6,zero,e64,m1,ta,ma
> sub a2,a2,a5
> vmsleu.vv v0,v0,v2
> add a1,a1,a3
> vsetvli zero,a5,e64,m1,tu,mu
> add a4,a4,a3
> vadd.vv v1,v1,v2,v0.t
> bne a2,zero,.L3
> li a5,0
> vsetivli zero,1,e64,m1,ta,ma
> vmv.s.x v2,a5
> vsetvli a5,zero,e64,m1,ta,ma
> vredsum.vs v1,v1,v2
> vmv.x.s a5,v1
> sd a5,0(a0)
> ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
OK
> PR tree-optimization/111594
> PR tree-optimization/110660
>
> gcc/ChangeLog:
>
> * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
> gcc/match.pd | 15 ++++++++++
> .../riscv/rvv/autovec/cond/cond_reduc-1.c | 29 +++++++++++++++++++
> .../riscv/rvv/autovec/cond/pr111594.c | 22 ++++++++++++++
> 3 files changed, 66 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..3ce90c3333b 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,21 @@ and,
> (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> + a = mask ? b : 0
> + c = i < len + bias ? d + a : d
> +
> + is turned into
> +
> + c = mask && i < len + bias ? d + b : d. */
> +(simplify
> + (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
> + (if (ANY_INTEGRAL_TYPE_P (type)
> + || (FLOAT_TYPE_P (type)
> + && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
> + (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
> /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 00000000000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define COND_REDUCTION(TYPE) \
> + TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size) \
> + { \
> + TYPE result = 0; \
> + for (int i = 0; i < loop_size; i++) \
> + if (b[i] <= a[i]) \
> + result += a[i]; \
> + return result; \
> + }
> +
> +COND_REDUCTION (int8_t)
> +COND_REDUCTION (int16_t)
> +COND_REDUCTION (int32_t)
> +COND_REDUCTION (int64_t)
> +COND_REDUCTION (uint8_t)
> +COND_REDUCTION (uint16_t)
> +COND_REDUCTION (uint32_t)
> +COND_REDUCTION (uint64_t)
> +COND_REDUCTION (_Float16)
> +COND_REDUCTION (float)
> +COND_REDUCTION (double)
> +
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> new file mode 100644
> index 00000000000..6d81b26fbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
> +
> +#include <stdint-gcc.h>
> +
> +void
> +pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> +{
> + uint64_t result = 0;
> +
> + for (int i = 0; i < loop_size; i++)
> + {
> + if (b[i] <= a[i])
> + {
> + result += a[i];
> + }
> + }
> +
> + a[0] = result;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmerge} } } */
>
@@ -8866,6 +8866,21 @@ and,
(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
(IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
+/* Detect simplification for a conditional length reduction where
+
+ a = mask ? b : 0
+ c = i < len + bias ? d + a : d
+
+ is turned into
+
+ c = mask && i < len + bias ? d + b : d. */
+(simplify
+ (IFN_COND_LEN_ADD integer_truep @0 (vec_cond @1 @2 zerop@5) @0 @3 @4)
+ (if (ANY_INTEGRAL_TYPE_P (type)
+ || (FLOAT_TYPE_P (type)
+ && fold_real_zero_addition_p (type, NULL_TREE, @5, 0)))
+ (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
+
/* For pointers @0 and @2 and nonnegative constant offset @1, look for
expressions like:
new file mode 100644
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
+
+#include <stdint-gcc.h>
+
+#define COND_REDUCTION(TYPE) \
+ TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size) \
+ { \
+ TYPE result = 0; \
+ for (int i = 0; i < loop_size; i++) \
+ if (b[i] <= a[i]) \
+ result += a[i]; \
+ return result; \
+ }
+
+COND_REDUCTION (int8_t)
+COND_REDUCTION (int16_t)
+COND_REDUCTION (int32_t)
+COND_REDUCTION (int64_t)
+COND_REDUCTION (uint8_t)
+COND_REDUCTION (uint16_t)
+COND_REDUCTION (uint32_t)
+COND_REDUCTION (uint64_t)
+COND_REDUCTION (_Float16)
+COND_REDUCTION (float)
+COND_REDUCTION (double)
+
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
+
+#include <stdint-gcc.h>
+
+void
+pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
+{
+ uint64_t result = 0;
+
+ for (int i = 0; i < loop_size; i++)
+ {
+ if (b[i] <= a[i])
+ {
+ result += a[i];
+ }
+ }
+
+ a[0] = result;
+}
+
+/* { dg-final { scan-assembler-not {vmerge} } } */