MATCH: Optimize COND_ADD_LEN reduction pattern

Message ID 20230926071257.129536-1-juzhe.zhong@rivai.ai
State Accepted
Headers
Series MATCH: Optimize COND_ADD_LEN reduction pattern |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai Sept. 26, 2023, 7:12 a.m. UTC
  This patch leverages this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
to optimize the COND_LEN_ADD reduction pattern.

We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.

Consider the following case:

#include <stdint.h>

/* Conditional sum reduction used as the motivating example: add up every
   a[i] for which b[i] <= a[i], for i in [0, loop_size), and store the
   total in a[0].  Both pointers are restrict-qualified.  */
void
pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
{
  uint64_t result = 0;

  for (int i = 0; i < loop_size; i++)
    {
      /* Only elements that pass the comparison contribute to the sum.  */
      if (b[i] <= a[i])
	{
	  result += a[i];
	}
    }

  /* The accumulated sum overwrites the first element of A.  */
  a[0] = result;
}

Before this patch:
        vsetvli a7,zero,e64,m1,ta,ma
        vmv.v.i v2,0
        vmv1r.v v3,v2                    --- redundant
.L3:
        vsetvli a5,a2,e64,m1,ta,ma
        vle64.v v1,0(a3)
        vle64.v v0,0(a1)
        slli    a6,a5,3
        vsetvli a7,zero,e64,m1,ta,ma
        sub     a2,a2,a5
        vmsleu.vv       v0,v0,v1
        add     a1,a1,a6
        vmerge.vvm      v1,v3,v1,v0     ---- redundant.
        add     a3,a3,a6
        vsetvli zero,a5,e64,m1,tu,ma
        vadd.vv v2,v2,v1
        bne     a2,zero,.L3
        li      a5,0
        vsetvli a4,zero,e64,m1,ta,ma
        vmv.s.x v1,a5
        vredsum.vs      v2,v2,v1
        vmv.x.s a5,v2
        sd      a5,0(a0)
        ret

After this patch:

	vsetvli	a6,zero,e64,m1,ta,ma
	vmv.v.i	v1,0
.L3:
	vsetvli	a5,a2,e64,m1,ta,ma
	vle64.v	v2,0(a4)
	vle64.v	v0,0(a1)
	slli	a3,a5,3
	vsetvli	a6,zero,e64,m1,ta,ma
	sub	a2,a2,a5
	vmsleu.vv	v0,v0,v2
	add	a1,a1,a3
	vsetvli	zero,a5,e64,m1,tu,mu
	add	a4,a4,a3
	vadd.vv	v1,v1,v2,v0.t
	bne	a2,zero,.L3
	li	a5,0
	vsetivli	zero,1,e64,m1,ta,ma
	vmv.s.x	v2,a5
	vsetvli	a5,zero,e64,m1,ta,ma
	vredsum.vs	v1,v1,v2
	vmv.x.s	a5,v1
	sd	a5,0(a0)
	ret

Bootstrap and regression testing are running.

OK for trunk when testing passes?

	PR tree-optimization/111594
        PR tree-optimization/110660

gcc/ChangeLog:

	* match.pd: Optimize COND_LEN_ADD reduction.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.

---
 gcc/match.pd                                  | 13 +++++++++
 .../riscv/rvv/autovec/cond/cond_reduc-1.c     | 29 +++++++++++++++++++
 .../riscv/rvv/autovec/cond/pr111594.c         | 22 ++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
  

Comments

Richard Biener Sept. 26, 2023, 7:46 a.m. UTC | #1
On Tue, Sep 26, 2023 at 9:13 AM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
>
> This patch leverage this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include <stdint.h>
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
>
>   for (int i = 0; i < loop_size; i++)
>     {
>       if (b[i] <= a[i])
>         {
>           result += a[i];
>         }
>     }
>
>   a[0] = result;
> }
>
> Before this patch:
>         vsetvli a7,zero,e64,m1,ta,ma
>         vmv.v.i v2,0
>         vmv1r.v v3,v2                    --- redundant
> .L3:
>         vsetvli a5,a2,e64,m1,ta,ma
>         vle64.v v1,0(a3)
>         vle64.v v0,0(a1)
>         slli    a6,a5,3
>         vsetvli a7,zero,e64,m1,ta,ma
>         sub     a2,a2,a5
>         vmsleu.vv       v0,v0,v1
>         add     a1,a1,a6
>         vmerge.vvm      v1,v3,v1,v0     ---- redundant.
>         add     a3,a3,a6
>         vsetvli zero,a5,e64,m1,tu,ma
>         vadd.vv v2,v2,v1
>         bne     a2,zero,.L3
>         li      a5,0
>         vsetvli a4,zero,e64,m1,ta,ma
>         vmv.s.x v1,a5
>         vredsum.vs      v2,v2,v1
>         vmv.x.s a5,v2
>         sd      a5,0(a0)
>         ret
>
> After this patch:
>
>         vsetvli a6,zero,e64,m1,ta,ma
>         vmv.v.i v1,0
> .L3:
>         vsetvli a5,a2,e64,m1,ta,ma
>         vle64.v v2,0(a4)
>         vle64.v v0,0(a1)
>         slli    a3,a5,3
>         vsetvli a6,zero,e64,m1,ta,ma
>         sub     a2,a2,a5
>         vmsleu.vv       v0,v0,v2
>         add     a1,a1,a3
>         vsetvli zero,a5,e64,m1,tu,mu
>         add     a4,a4,a3
>         vadd.vv v1,v1,v2,v0.t
>         bne     a2,zero,.L3
>         li      a5,0
>         vsetivli        zero,1,e64,m1,ta,ma
>         vmv.s.x v2,a5
>         vsetvli a5,zero,e64,m1,ta,ma
>         vredsum.vs      v1,v1,v2
>         vmv.x.s a5,v1
>         sd      a5,0(a0)
>         ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
>
>         PR tree-optimization/111594
>         PR tree-optimization/110660
>
> gcc/ChangeLog:
>
>         * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
>  gcc/match.pd                                  | 13 +++++++++
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c     | 29 +++++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111594.c         | 22 ++++++++++++++
>  3 files changed, 64 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..af8d12c138e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,19 @@ and,
>    (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
>     (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)

I think you want integer_truep instead of integer_minus_onep for
readability.  Since you
use zerop here, can you also adjust the preceding pattern?

> +   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))

It might be better to check ANY_INTEGRAL_TYPE_P (type) ||
fold_real_zero_addition_p (type, NULL_TREE, @5, 0);
I think your change misses HONOR_SIGN_DEPENDENT_ROUNDING.

> +    (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +



>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
>     expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 00000000000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define COND_REDUCTION(TYPE)                                                   \
> +  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)           \
> +  {                                                                            \
> +    TYPE result = 0;                                                           \
> +    for (int i = 0; i < loop_size; i++)                                        \
> +      if (b[i] <= a[i])                                                        \
> +       result += a[i];                                                        \
> +    return result;                                                             \
> +  }
> +
> +COND_REDUCTION (int8_t)
> +COND_REDUCTION (int16_t)
> +COND_REDUCTION (int32_t)
> +COND_REDUCTION (int64_t)
> +COND_REDUCTION (uint8_t)
> +COND_REDUCTION (uint16_t)
> +COND_REDUCTION (uint32_t)
> +COND_REDUCTION (uint64_t)
> +COND_REDUCTION (_Float16)
> +COND_REDUCTION (float)
> +COND_REDUCTION (double)
> +
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> new file mode 100644
> index 00000000000..6d81b26fbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
> +
> +#include <stdint-gcc.h>
> +
> +void
> +pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> +{
> +  uint64_t result = 0;
> +
> +  for (int i = 0; i < loop_size; i++)
> +    {
> +      if (b[i] <= a[i])
> +       {
> +         result += a[i];
> +       }
> +    }
> +
> +  a[0] = result;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> --
> 2.36.3
>
  
juzhe.zhong@rivai.ai Sept. 26, 2023, 8:29 a.m. UTC | #2
Hi, Richi.

Addressed comments.

One is the V2 patch for the COND_LEN_ADD reduction:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631340.html 

The second one optimizes the COND_ADD reduction:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/631341.html 




juzhe.zhong@rivai.ai
 
From: Richard Biener
Date: 2023-09-26 15:46
To: Juzhe-Zhong
CC: gcc-patches; richard.sandiford; rguenther; pinskia
Subject: Re: [PATCH] MATCH: Optimize COND_ADD_LEN reduction pattern
On Tue, Sep 26, 2023 at 9:13 AM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
>
> This patch leverage this commit: https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=62b505a4d5fc89
> to optimize COND_LEN_ADD reduction pattern.
>
> We are doing optimization of VEC_COND_EXPR + COND_LEN_ADD -> COND_LEN_ADD.
>
> Consider thsi following case:
>
> #include <stdint.h>
>
> void
> pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> {
>   uint64_t result = 0;
>
>   for (int i = 0; i < loop_size; i++)
>     {
>       if (b[i] <= a[i])
>         {
>           result += a[i];
>         }
>     }
>
>   a[0] = result;
> }
>
> Before this patch:
>         vsetvli a7,zero,e64,m1,ta,ma
>         vmv.v.i v2,0
>         vmv1r.v v3,v2                    --- redundant
> .L3:
>         vsetvli a5,a2,e64,m1,ta,ma
>         vle64.v v1,0(a3)
>         vle64.v v0,0(a1)
>         slli    a6,a5,3
>         vsetvli a7,zero,e64,m1,ta,ma
>         sub     a2,a2,a5
>         vmsleu.vv       v0,v0,v1
>         add     a1,a1,a6
>         vmerge.vvm      v1,v3,v1,v0     ---- redundant.
>         add     a3,a3,a6
>         vsetvli zero,a5,e64,m1,tu,ma
>         vadd.vv v2,v2,v1
>         bne     a2,zero,.L3
>         li      a5,0
>         vsetvli a4,zero,e64,m1,ta,ma
>         vmv.s.x v1,a5
>         vredsum.vs      v2,v2,v1
>         vmv.x.s a5,v2
>         sd      a5,0(a0)
>         ret
>
> After this patch:
>
>         vsetvli a6,zero,e64,m1,ta,ma
>         vmv.v.i v1,0
> .L3:
>         vsetvli a5,a2,e64,m1,ta,ma
>         vle64.v v2,0(a4)
>         vle64.v v0,0(a1)
>         slli    a3,a5,3
>         vsetvli a6,zero,e64,m1,ta,ma
>         sub     a2,a2,a5
>         vmsleu.vv       v0,v0,v2
>         add     a1,a1,a3
>         vsetvli zero,a5,e64,m1,tu,mu
>         add     a4,a4,a3
>         vadd.vv v1,v1,v2,v0.t
>         bne     a2,zero,.L3
>         li      a5,0
>         vsetivli        zero,1,e64,m1,ta,ma
>         vmv.s.x v2,a5
>         vsetvli a5,zero,e64,m1,ta,ma
>         vredsum.vs      v1,v1,v2
>         vmv.x.s a5,v1
>         sd      a5,0(a0)
>         ret
>
> Bootstrap && Regression is running.
>
> Ok for trunk when testing passes ?
>
>         PR tree-optimization/111594
>         PR tree-optimization/110660
>
> gcc/ChangeLog:
>
>         * match.pd: Optimize COND_LEN_ADD reduction.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/cond/pr111594.c: New test.
>
> ---
>  gcc/match.pd                                  | 13 +++++++++
>  .../riscv/rvv/autovec/cond/cond_reduc-1.c     | 29 +++++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111594.c         | 22 ++++++++++++++
>  3 files changed, 64 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index a17778fbaa6..af8d12c138e 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8866,6 +8866,19 @@ and,
>    (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
>     (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>
> +/* Detect simplication for a conditional length reduction where
> +
> +   a = mask ? b : 0
> +   c = i < len + bias ? d + a : d
> +
> +   is turned into
> +
> +   c = mask && i < len ? d + b : d.  */
> +(simplify
> +  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)
 
I think you want integer_truep instead of integer_minus_onep for
readability.  Since you
use zerop here, can you also adjust the preceding pattern?
 
> +   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))
 
It might be better to check ANY_INTEGRAL_TYPE_P (type) ||
fold_real_zero_addition_p (type, NULL_TREE, @5, 0);
I think your change misses HONOR_SIGN_DEPENDENT_ROUNDING.
 
> +    (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
> +
 
 
 
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
>     expressions like:
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> new file mode 100644
> index 00000000000..db6f9d1ec6c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define COND_REDUCTION(TYPE)                                                   \
> +  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)           \
> +  {                                                                            \
> +    TYPE result = 0;                                                           \
> +    for (int i = 0; i < loop_size; i++)                                        \
> +      if (b[i] <= a[i])                                                        \
> +       result += a[i];                                                        \
> +    return result;                                                             \
> +  }
> +
> +COND_REDUCTION (int8_t)
> +COND_REDUCTION (int16_t)
> +COND_REDUCTION (int32_t)
> +COND_REDUCTION (int64_t)
> +COND_REDUCTION (uint8_t)
> +COND_REDUCTION (uint16_t)
> +COND_REDUCTION (uint32_t)
> +COND_REDUCTION (uint64_t)
> +COND_REDUCTION (_Float16)
> +COND_REDUCTION (float)
> +COND_REDUCTION (double)
> +
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> new file mode 100644
> index 00000000000..6d81b26fbd0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
> +
> +#include <stdint-gcc.h>
> +
> +void
> +pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
> +{
> +  uint64_t result = 0;
> +
> +  for (int i = 0; i < loop_size; i++)
> +    {
> +      if (b[i] <= a[i])
> +       {
> +         result += a[i];
> +       }
> +    }
> +
> +  a[0] = result;
> +}
> +
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> --
> 2.36.3
>
  

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index a17778fbaa6..af8d12c138e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8866,6 +8866,19 @@  and,
   (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
    (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
 
+/* Detect simplification for a conditional length reduction where
+
+   a = mask ? b : 0
+   c = i < len + bias ? d + a : d
+
+   is turned into
+
+   c = mask && i < len + bias ? d + b : d.  */
+(simplify
+  (IFN_COND_LEN_ADD integer_minus_onep @0 (vec_cond @1 @2 zerop) @0 @3 @4)
+   (if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))
+    (IFN_COND_LEN_ADD @1 @0 @2 @0 @3 @4)))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
    expressions like:
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
new file mode 100644
index 00000000000..db6f9d1ec6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/cond_reduc-1.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvfh -mabi=lp64d -fno-vect-cost-model -ffast-math -fdump-tree-optimized" } */
+
+#include <stdint-gcc.h>
+
+#define COND_REDUCTION(TYPE)                                                   \
+  TYPE foo##TYPE (TYPE *restrict a, TYPE *restrict b, int loop_size)           \
+  {                                                                            \
+    TYPE result = 0;                                                           \
+    for (int i = 0; i < loop_size; i++)                                        \
+      if (b[i] <= a[i])                                                        \
+	result += a[i];                                                        \
+    return result;                                                             \
+  }
+
+COND_REDUCTION (int8_t)
+COND_REDUCTION (int16_t)
+COND_REDUCTION (int32_t)
+COND_REDUCTION (int64_t)
+COND_REDUCTION (uint8_t)
+COND_REDUCTION (uint16_t)
+COND_REDUCTION (uint32_t)
+COND_REDUCTION (uint64_t)
+COND_REDUCTION (_Float16)
+COND_REDUCTION (float)
+COND_REDUCTION (double)
+
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "optimized" } } */
+/* { dg-final { scan-tree-dump-times "COND_LEN_ADD" 11 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
new file mode 100644
index 00000000000..6d81b26fbd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111594.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-vect-cost-model -ffast-math" } */
+
+#include <stdint-gcc.h>
+
+void
+pr11594 (uint64_t *restrict a, uint64_t *restrict b, int loop_size)
+{
+  uint64_t result = 0;
+
+  for (int i = 0; i < loop_size; i++)
+    {
+      if (b[i] <= a[i])
+	{
+	  result += a[i];
+	}
+    }
+
+  a[0] = result;
+}
+
+/* { dg-final { scan-assembler-not {vmerge} } } */