[V2] RISC-V: Fix unexpected big LMUL choosing in dynamic LMUL model for non-adjacent load/store

Message ID 20231016113108.877163-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V2] RISC-V: Fix unexpected big LMUL choosing in dynamic LMUL model for non-adjacent load/store |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai Oct. 16, 2023, 11:31 a.m. UTC
  Consider this following case:
int
bar (int *x, int a, int b, int n)
{
  x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
  int sum1 = 0;
  int sum2 = 0;
  for (int i = 0; i < n; ++i)
    {
      sum1 += x[2*i] - a;
      sum1 += x[2*i+1] * b;
      sum2 += x[2*i] - b;
      sum2 += x[2*i+1] * a;
    }
  return sum1 + sum2;
}

Before this patch:

bar:
        ble     a3,zero,.L5
        csrr    t0,vlenb
        csrr    a6,vlenb
        slli    t1,t0,3
        vsetvli a5,zero,e32,m4,ta,ma
        sub     sp,sp,t1
        vid.v   v20
        vmv.v.x v12,a1
        vand.vi v4,v20,1
        vmv.v.x v16,a2
        vmseq.vi        v4,v4,1
        slli    t3,a6,2
        vsetvli zero,a5,e32,m4,ta,ma
        vmv1r.v v0,v4
        viota.m v8,v4
        add     a7,t3,sp
        vsetvli a5,zero,e32,m4,ta,mu
        vand.vi v28,v20,-2
        vadd.vi v4,v28,1
        vs4r.v  v20,0(a7)                        -----  spill
        vrgather.vv     v24,v12,v8
        vrgather.vv     v20,v16,v8
        vrgather.vv     v24,v16,v8,v0.t
        vrgather.vv     v20,v12,v8,v0.t
        vs4r.v  v4,0(sp)                          ----- spill
        slli    a3,a3,1
        addi    t4,a6,-1
        neg     t1,a6
        vmv4r.v v0,v20
        vmv.v.i v4,0
        j       .L4
.L13:
        vsetvli a5,zero,e32,m4,ta,ma
.L4:
        mv      a7,a3
        mv      a4,a3
        bleu    a3,a6,.L3
        csrr    a4,vlenb
.L3:
        vmv.v.x v8,t4
        vl4re32.v       v12,0(sp)                ---- spill
        vand.vv v20,v28,v8
        vand.vv v8,v12,v8
        vsetvli zero,a4,e32,m4,ta,ma
        vle32.v v16,0(a0)
        vsetvli a5,zero,e32,m4,ta,ma
        add     a3,a3,t1
        vrgather.vv     v12,v16,v20
        add     a0,a0,t3
        vrgather.vv     v20,v16,v8
        vsub.vv v12,v12,v0
        vsetvli zero,a4,e32,m4,tu,ma
        vadd.vv v4,v4,v12
        vmacc.vv        v4,v24,v20
        bgtu    a7,a6,.L13
        csrr    a1,vlenb
        slli    a1,a1,2
        add     a1,a1,sp
        li      a4,-1
        csrr    t0,vlenb
        vsetvli a5,zero,e32,m4,ta,ma
        vl4re32.v       v12,0(a1)               ---- spill
        vmv.v.i v8,0
        vmul.vx v0,v12,a4
        li      a2,0
        slli    t1,t0,3
        vadd.vi v0,v0,-1
        vand.vi v0,v0,1
        vmseq.vv        v0,v0,v8
        vand.vi v12,v12,1
        vmerge.vvm      v16,v8,v4,v0
        vmseq.vv        v12,v12,v8
        vmv.s.x v1,a2
        vmv1r.v v0,v12
        vredsum.vs      v16,v16,v1
        vmerge.vvm      v8,v8,v4,v0
        vmv.x.s a0,v16
        vredsum.vs      v8,v8,v1
        vmv.x.s a5,v8
        add     sp,sp,t1
        addw    a0,a0,a5
        jr      ra
.L5:
        li      a0,0
        ret

We can there are multiple horrible register spillings.
The root cause of this issue is for a scalar IR load:

_5 = *_4;

We didn't check whether it is a continguous load/store or gather/scatter load/store

Since it will be translate into:

		   1. MASK_LEN_GATHER_LOAD (..., perm indice).
		   2. Continguous load/store + VEC_PERM (..., perm indice)

It's obvious that no matter which situation, we will end up with consuming one vector register group (perm indice)
that we didn't count it before.

So this case we pick LMUL = 4 which is incorrect choice for dynamic LMUL cost model.

The key of this patch is:

	  if ((type == load_vec_info_type || type == store_vec_info_type)
	      && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
	    {
               ...
            }

Add one more register consumption if it is not an adjacent load/store.

After this patch, it pick LMUL = 2 which is optimal:

bar:
	ble	a3,zero,.L4
	csrr	a6,vlenb
	vsetvli	a5,zero,e32,m2,ta,ma
	vmv.v.x	v6,a2
	srli	a2,a6,1
	vmv.v.x	v4,a1
	vid.v	v12
	slli	a3,a3,1
	vand.vi	v0,v12,1
	addi	t1,a2,-1
	vmseq.vi	v0,v0,1
	slli	a6,a6,1
	vsetvli	zero,a5,e32,m2,ta,ma
	neg	a7,a2
	viota.m	v2,v0
	vsetvli	a5,zero,e32,m2,ta,mu
	vrgather.vv	v16,v4,v2
	vrgather.vv	v14,v6,v2
	vrgather.vv	v16,v6,v2,v0.t
	vrgather.vv	v14,v4,v2,v0.t
	vand.vi	v18,v12,-2
	vmv.v.i	v2,0
	vadd.vi	v20,v18,1
.L3:
	minu	a4,a3,a2
	vsetvli	zero,a4,e32,m2,ta,ma
	vle32.v	v8,0(a0)
	vsetvli	a5,zero,e32,m2,ta,ma
	vmv.v.x	v4,t1
	vand.vv	v10,v18,v4
	vrgather.vv	v6,v8,v10
	vsub.vv	v6,v6,v14
	vsetvli	zero,a4,e32,m2,tu,ma
	vadd.vv	v2,v2,v6
	vsetvli	a1,zero,e32,m2,ta,ma
	vand.vv	v4,v20,v4
	vrgather.vv	v6,v8,v4
	vsetvli	zero,a4,e32,m2,tu,ma
	mv	a4,a3
	add	a0,a0,a6
	add	a3,a3,a7
	vmacc.vv	v2,v16,v6
	bgtu	a4,a2,.L3
	vsetvli	a1,zero,e32,m2,ta,ma
	vand.vi	v0,v12,1
	vmv.v.i	v4,0
	li	a3,-1
	vmseq.vv	v0,v0,v4
	vmv.s.x	v1,zero
	vmerge.vvm	v6,v4,v2,v0
	vredsum.vs	v6,v6,v1
	vmul.vx	v0,v12,a3
	vadd.vi	v0,v0,-1
	vand.vi	v0,v0,1
	vmv.x.s	a4,v6
	vmseq.vv	v0,v0,v4
	vmv.s.x	v1,zero
	vmerge.vvm	v4,v4,v2,v0
	vredsum.vs	v4,v4,v1
	vmv.x.s	a0,v4
	addw	a0,a0,a4
	ret
.L4:
	li	a0,0
	ret

No spillings.

gcc/ChangeLog:

	* config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Fix unexpected LMUL.
	(get_store_value): New function.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 103 ++++++++++++++++--
 .../costmodel/riscv/rvv/dynamic-lmul2-7.c     |  25 +++++
 2 files changed, 120 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
  

Comments

Robin Dapp Oct. 16, 2023, 12:33 p.m. UTC | #1
Hi Juzhe,

> +/* Get STORE value.  */
> +static tree
> +get_store_value (gimple *stmt)
> +{
> +  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
> +    {
> +      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> +	return gimple_call_arg (stmt, 3);
> +      else
> +	gcc_unreachable ();
> +    }
> +  else
> +    return gimple_assign_rhs1 (stmt);
> +}

This was something I was about to mention in the review of v1
that I already started.  This is better now.

> +	      basic_block bb = e->src;

Rename to pred or so?  And then keep the original bb.

>  	      if (!live_range)
> -		continue;
> +		{
> +		  if (single_succ_p (e->src))
> +		    {
> +		      /*
> +			 <bb 7> [local count: 850510900]:
> +			 goto <bb 3>; [100.00%]
> +
> +			 Handle this case, we should extend live range of bb 3.
> +		      */

/* If the predecessor is an extended basic block extend it and look for
   DEF's definition again.  */

> +		      bb = single_succ (e->src);
> +		      if (!program_points_per_bb.get (bb))
> +			continue;
> +		      live_ranges = live_ranges_per_bb.get (bb);
> +		      max_point
> +			= (*program_points_per_bb.get (bb)).length () - 1;
> +		      live_range = live_ranges->get (def);
> +		      if (!live_range)
> +			continue;
> +		    }
> +		  else
> +		    continue;
> +		}

We're approaching a complexity where reverse postorder would have
helped ;)  Maybe split out the live range into a separate function
get_live_range_for_bb ()?

> +      for (si = gsi_start_bb (bbs[i]); !gsi_end_p (si); gsi_next (&si))
> +	{
> +	  if (!(is_gimple_assign (gsi_stmt (si))
> +		|| is_gimple_call (gsi_stmt (si))))
> +	    continue;
> +	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
> +	  enum stmt_vec_info_type type
> +	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
> +	  if ((type == load_vec_info_type || type == store_vec_info_type)
> +	      && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
> +	    {
> +	      /* For non-adjacent load/store STMT, we will potentially
> +		 convert it into:
> +
> +		   1. MASK_LEN_GATHER_LOAD (..., perm indice).
> +		   2. Continguous load/store + VEC_PERM (..., perm indice)
> +
> +		We will be likely using one more vector variable.  */
> +	      unsigned int max_point
> +		= (*program_points_per_bb.get (bbs[i])).length () - 1;
> +	      auto *live_ranges = live_ranges_per_bb.get (bbs[i]);
> +	      bool existed_p = false;
> +	      tree var = type == load_vec_info_type
> +			   ? gimple_get_lhs (gsi_stmt (si))
> +			   : get_store_value (gsi_stmt (si));
> +	      tree sel_type = build_nonstandard_integer_type (
> +		TYPE_PRECISION (TREE_TYPE (var)), 1);
> +	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
> +				     get_identifier ("vect_perm"), sel_type);
> +	      pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
> +	      gcc_assert (!existed_p);
> +	      live_range = pair (0, max_point);
> +	      if (dump_enabled_p ())
> +		dump_printf_loc (MSG_NOTE, vect_location,
> +				 "Add perm indice %T, start = 0, end = %d\n",
> +				 sel, max_point);
> +	    }
> +	}
>      }
>  }

So we're inserting a dummy vect_perm element (that's live from the start?).
Would it make sense to instead increase the number of needed registers for
a load/store and handle this similarly to compute_nregs_for_mode?
Maybe also do it directly in compute_local_live_ranges and extend live_range
by an nregs?

Regards
 Robin
  
juzhe.zhong@rivai.ai Oct. 16, 2023, 2:18 p.m. UTC | #2
>> So we're inserting a dummy vect_perm element (that's live from the start?).
>> Would it make sense to instead increase the number of needed registers for
>> a load/store and handle this similarly to compute_nregs_for_mode?
>> Maybe also do it directly in compute_local_live_ranges and extend live_range
>> by an nregs?

Yeah,  we're inserting a dummy vect_perm element which has live range from 0 to max_point of the bb.
I have tried it in 'compute_nregs_for_mode' and 'compute_local_live_ranges' since we don't know the maximum point of bb yet.

Address other comments in V3:
[PATCH V3] RISC-V: Fix unexpected big LMUL choosing in dynamic LMUL model for non-adjacent load/store (gnu.org)



juzhe.zhong@rivai.ai
 
From: Robin Dapp
Date: 2023-10-16 20:33
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH V2] RISC-V: Fix unexpected big LMUL choosing in dynamic LMUL model for non-adjacent load/store
Hi Juzhe,
 
> +/* Get STORE value.  */
> +static tree
> +get_store_value (gimple *stmt)
> +{
> +  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
> +    {
> +      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> + return gimple_call_arg (stmt, 3);
> +      else
> + gcc_unreachable ();
> +    }
> +  else
> +    return gimple_assign_rhs1 (stmt);
> +}
 
This was something I was about to mention in the review of v1
that I already started.  This is better now.
 
> +       basic_block bb = e->src;
 
Rename to pred or so?  And then keep the original bb.
 
>        if (!live_range)
> - continue;
> + {
> +   if (single_succ_p (e->src))
> +     {
> +       /*
> + <bb 7> [local count: 850510900]:
> + goto <bb 3>; [100.00%]
> +
> + Handle this case, we should extend live range of bb 3.
> +       */
 
/* If the predecessor is an extended basic block extend it and look for
   DEF's definition again.  */
 
> +       bb = single_succ (e->src);
> +       if (!program_points_per_bb.get (bb))
> + continue;
> +       live_ranges = live_ranges_per_bb.get (bb);
> +       max_point
> + = (*program_points_per_bb.get (bb)).length () - 1;
> +       live_range = live_ranges->get (def);
> +       if (!live_range)
> + continue;
> +     }
> +   else
> +     continue;
> + }
 
We're approaching a complexity where reverse postorder would have
helped ;)  Maybe split out the live range into a separate function
get_live_range_for_bb ()?
 
> +      for (si = gsi_start_bb (bbs[i]); !gsi_end_p (si); gsi_next (&si))
> + {
> +   if (!(is_gimple_assign (gsi_stmt (si))
> + || is_gimple_call (gsi_stmt (si))))
> +     continue;
> +   stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
> +   enum stmt_vec_info_type type
> +     = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
> +   if ((type == load_vec_info_type || type == store_vec_info_type)
> +       && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
> +     {
> +       /* For non-adjacent load/store STMT, we will potentially
> + convert it into:
> +
> +    1. MASK_LEN_GATHER_LOAD (..., perm indice).
> +    2. Continguous load/store + VEC_PERM (..., perm indice)
> +
> + We will be likely using one more vector variable.  */
> +       unsigned int max_point
> + = (*program_points_per_bb.get (bbs[i])).length () - 1;
> +       auto *live_ranges = live_ranges_per_bb.get (bbs[i]);
> +       bool existed_p = false;
> +       tree var = type == load_vec_info_type
> +    ? gimple_get_lhs (gsi_stmt (si))
> +    : get_store_value (gsi_stmt (si));
> +       tree sel_type = build_nonstandard_integer_type (
> + TYPE_PRECISION (TREE_TYPE (var)), 1);
> +       tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
> +      get_identifier ("vect_perm"), sel_type);
> +       pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
> +       gcc_assert (!existed_p);
> +       live_range = pair (0, max_point);
> +       if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> + "Add perm indice %T, start = 0, end = %d\n",
> + sel, max_point);
> +     }
> + }
>      }
>  }
 
So we're inserting a dummy vect_perm element (that's live from the start?).
Would it make sense to instead increase the number of needed registers for
a load/store and handle this similarly to compute_nregs_for_mode?
Maybe also do it directly in compute_local_live_ranges and extend live_range
by an nregs?
 
Regards
Robin
  

Patch

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 0b890396535..4e8842cd2a6 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -40,6 +40,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "bitmap.h"
 #include "ssa.h"
 #include "backend.h"
+#include "tree-data-ref.h"
 
 /* This file should be included last.  */
 #include "riscv-vector-costs.h"
@@ -135,8 +136,9 @@  compute_local_program_points (
 		    || is_gimple_call (gsi_stmt (si))))
 		continue;
 	      stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
-	      if (STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info))
-		  != undef_vec_info_type)
+	      enum stmt_vec_info_type type
+		= STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+	      if (type != undef_vec_info_type)
 		{
 		  stmt_point info = {point, gsi_stmt (si)};
 		  program_points.safe_push (info);
@@ -289,9 +291,7 @@  max_number_of_live_regs (const basic_block bb,
   unsigned int i;
   unsigned int live_point = 0;
   auto_vec<unsigned int> live_vars_vec;
-  live_vars_vec.safe_grow (max_point + 1, true);
-  for (i = 0; i < live_vars_vec.length (); ++i)
-    live_vars_vec[i] = 0;
+  live_vars_vec.safe_grow_cleared (max_point + 1, true);
   for (hash_map<tree, pair>::iterator iter = live_ranges.begin ();
        iter != live_ranges.end (); ++iter)
     {
@@ -360,6 +360,21 @@  get_current_lmul (class loop *loop)
   return loop_autovec_infos.get (loop)->current_lmul;
 }
 
+/* Get STORE value.  */
+static tree
+get_store_value (gimple *stmt)
+{
+  if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
+    {
+      if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	return gimple_call_arg (stmt, 3);
+      else
+	gcc_unreachable ();
+    }
+  else
+    return gimple_assign_rhs1 (stmt);
+}
+
 /* Update the live ranges according PHI.
 
    Loop:
@@ -395,12 +410,13 @@  update_local_live_ranges (
   unsigned int nbbs = loop->num_nodes;
   unsigned int i, j;
   gphi_iterator psi;
+  gimple_stmt_iterator si;
   for (i = 0; i < nbbs; i++)
     {
-      basic_block bb = bbs[i];
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
-			 "Update local program points for bb %d:\n", bb->index);
+			 "Update local program points for bb %d:\n",
+			 bbs[i]->index);
       for (psi = gsi_start_phis (bbs[i]); !gsi_end_p (psi); gsi_next (&psi))
 	{
 	  gphi *phi = psi.phi ();
@@ -419,9 +435,42 @@  update_local_live_ranges (
 	      unsigned int max_point
 		= (*program_points_per_bb.get (e->src)).length () - 1;
 	      auto *live_range = live_ranges->get (def);
+	      basic_block bb = e->src;
 	      if (!live_range)
-		continue;
+		{
+		  if (single_succ_p (e->src))
+		    {
+		      /*
+			 <bb 7> [local count: 850510900]:
+			 goto <bb 3>; [100.00%]
+
+			 Handle this case, we should extend live range of bb 3.
+		      */
+		      bb = single_succ (e->src);
+		      if (!program_points_per_bb.get (bb))
+			continue;
+		      live_ranges = live_ranges_per_bb.get (bb);
+		      max_point
+			= (*program_points_per_bb.get (bb)).length () - 1;
+		      live_range = live_ranges->get (def);
+		      if (!live_range)
+			continue;
+		    }
+		  else
+		    continue;
+		}
 
+	      gcc_assert (live_range);
+	      if (bbs[i] == bb)
+		{
+		  gcc_assert (live_range);
+		  unsigned int start = (*live_range).first;
+		  (*live_range).first = 0;
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "Update %T start point from %d to %d:\n",
+				     def, start, (*live_range).first);
+		}
 	      unsigned int end = (*live_range).second;
 	      (*live_range).second = max_point;
 	      if (dump_enabled_p ())
@@ -430,6 +479,44 @@  update_local_live_ranges (
 				 end, (*live_range).second);
 	    }
 	}
+      for (si = gsi_start_bb (bbs[i]); !gsi_end_p (si); gsi_next (&si))
+	{
+	  if (!(is_gimple_assign (gsi_stmt (si))
+		|| is_gimple_call (gsi_stmt (si))))
+	    continue;
+	  stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
+	  enum stmt_vec_info_type type
+	    = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info));
+	  if ((type == load_vec_info_type || type == store_vec_info_type)
+	      && !adjacent_dr_p (STMT_VINFO_DATA_REF (stmt_info)))
+	    {
+	      /* For non-adjacent load/store STMT, we will potentially
+		 convert it into:
+
+		   1. MASK_LEN_GATHER_LOAD (..., perm indice).
+		   2. Continguous load/store + VEC_PERM (..., perm indice)
+
+		We will be likely using one more vector variable.  */
+	      unsigned int max_point
+		= (*program_points_per_bb.get (bbs[i])).length () - 1;
+	      auto *live_ranges = live_ranges_per_bb.get (bbs[i]);
+	      bool existed_p = false;
+	      tree var = type == load_vec_info_type
+			   ? gimple_get_lhs (gsi_stmt (si))
+			   : get_store_value (gsi_stmt (si));
+	      tree sel_type = build_nonstandard_integer_type (
+		TYPE_PRECISION (TREE_TYPE (var)), 1);
+	      tree sel = build_decl (UNKNOWN_LOCATION, VAR_DECL,
+				     get_identifier ("vect_perm"), sel_type);
+	      pair &live_range = live_ranges->get_or_insert (sel, &existed_p);
+	      gcc_assert (!existed_p);
+	      live_range = pair (0, max_point);
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Add perm indice %T, start = 0, end = %d\n",
+				 sel, max_point);
+	    }
+	}
     }
 }
 
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
new file mode 100644
index 00000000000..3dfc6f16a25
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -fdump-tree-vect-details" } */
+
+int
+bar (int *x, int a, int b, int n)
+{
+  x = __builtin_assume_aligned (x, __BIGGEST_ALIGNMENT__);
+  int sum1 = 0;
+  int sum2 = 0;
+  for (int i = 0; i < n; ++i)
+    {
+      sum1 += x[2*i] - a;
+      sum1 += x[2*i+1] * b;
+      sum2 += x[2*i] - b;
+      sum2 += x[2*i+1] * a;
+    }
+  return sum1 + sum2;
+}
+
+/* { dg-final { scan-assembler {e32,m2} } } */
+/* { dg-final { scan-assembler-times {csrr} 1 } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */