RISC-V: Support highpart register overlap for vwcvt

Message ID 20231129083208.2417668-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series RISC-V: Support highpart register overlap for vwcvt |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai Nov. 29, 2023, 8:32 a.m. UTC
  Since Richard supports register filters recently, we are able to support highpart register
overlap for widening RVV instructions.

This patch support it for vwcvt intrinsics.

I leverage real application user codes for vwcvt:
https://github.com/riscv/riscv-v-spec/issues/929
https://godbolt.org/z/xoeGnzd8q

This is the real application codes that using LMUL = 8 with unrolling to gain optimal
performance for specific libraury.

You can see in the codegen, GCC has optimal codegen for such since we supported register
lowpart overlap for narrowing instructions (dest EEW < source EEW).

Now, we start to support highpart register overlap from this patch for widening instructions (dest EEW > source EEW).

Leverage this intrinsic codes above but for vwcvt:

https://godbolt.org/z/1TMPE5Wfr

size_t
foo (char const *buf, size_t len)
{
  size_t sum = 0;
  size_t vl = __riscv_vsetvlmax_e8m8 ();
  size_t step = vl * 4;
  const char *it = buf, *end = buf + len;
  for (; it + step <= end;)
    {
      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;
      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
      it += vl;

      asm volatile("nop" ::: "memory");
      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);

      asm volatile("nop" ::: "memory");
      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);

      sum += sumation (sum0, sum1, sum2, sum3);
    }
  return sum;
}

Before this patch:

...
csrr    t0,vlenb
...
        vwcvt.x.x.v     v16,v8
        vwcvt.x.x.v     v8,v28
        vs8r.v  v16,0(sp)               ---> spill
        vwcvt.x.x.v     v16,v24
        vwcvt.x.x.v     v24,v4
        nop
        vsetvli zero,zero,e16,m8,ta,ma
        vmv.x.s a2,v16
        vl8re16.v       v16,0(sp)      --->  reload
...
csrr    t0,vlenb
...

You can see heavy spill && reload inside the loop body.

After this patch:

...
	vwcvt.x.x.v	v8,v12
	vwcvt.x.x.v	v16,v20
	vwcvt.x.x.v	v24,v28
	vwcvt.x.x.v	v0,v4
...

Optimal codegen after this patch.

Tested on zvl128b no regression.

I am gonna to test zve64d/zvl256b/zvl512b/zvl1024b.

Ok for trunk if no regression on the testing above ?

	PR target/112431

gcc/ChangeLog:

	* config/riscv/constraints.md (TARGET_VECTOR ? V_REGS : NO_REGS): New register filters.
	* config/riscv/riscv.md (no,W21,W42,W84,W41,W81,W82): Ditto.
	(no,yes): Ditto.
	* config/riscv/vector.md: Support highpart register overlap for vwcvt.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/pr112431-1.c: New test.
	* gcc.target/riscv/rvv/base/pr112431-2.c: New test.
	* gcc.target/riscv/rvv/base/pr112431-3.c: New test.

---
 gcc/config/riscv/constraints.md               |  23 ++++
 gcc/config/riscv/riscv.md                     |  24 ++++
 gcc/config/riscv/vector.md                    |  19 ++--
 .../gcc.target/riscv/rvv/base/pr112431-1.c    | 104 ++++++++++++++++++
 .../gcc.target/riscv/rvv/base/pr112431-2.c    |  68 ++++++++++++
 .../gcc.target/riscv/rvv/base/pr112431-3.c    |  51 +++++++++
 6 files changed, 280 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
  

Patch

diff --git a/gcc/config/riscv/constraints.md b/gcc/config/riscv/constraints.md
index 68be4515c04..19bb36616bf 100644
--- a/gcc/config/riscv/constraints.md
+++ b/gcc/config/riscv/constraints.md
@@ -169,6 +169,29 @@ 
 (define_register_constraint "vm" "TARGET_VECTOR ? VM_REGS : NO_REGS"
   "A vector mask register (if available).")
 
+;; These following constraints are used by RVV instructions with dest EEW > src EEW.
+;; RISC-V 'V' Spec 5.2. Vector Operands:
+;; The destination EEW is greater than the source EEW, the source EMUL is at least 1,
+;; and the overlap is in the highest-numbered part of the destination register group.
+;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or v4 is not).
+(define_register_constraint "W21" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 2 == 1." "regno % 2 == 1")
+
+(define_register_constraint "W42" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 2." "regno % 4 == 2")
+
+(define_register_constraint "W84" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 4." "regno % 8 == 4")
+
+(define_register_constraint "W41" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 4 == 1." "regno % 4 == 1")
+
+(define_register_constraint "W81" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 1." "regno % 8 == 1")
+
+(define_register_constraint "W82" "TARGET_VECTOR ? V_REGS : NO_REGS"
+  "A vector register has register number % 8 == 2." "regno % 8 == 2")
+
 ;; This constraint is used to match instruction "csrr %0, vlenb" which is generated in "mov<mode>".
 ;; VLENB is a run-time constant which represent the vector register length in bytes.
 ;; BYTES_PER_RISCV_VECTOR represent runtime invariant of vector register length in bytes.
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 935eeb7fd8e..6bf2dfdf9b4 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -501,6 +501,27 @@ 
   ]
   (const_string "no")))
 
+(define_attr "vconstraint" "no,W21,W42,W84,W41,W81,W82"
+  (const_string "no"))
+
+(define_attr "vconstraint_enabled" "no,yes"
+  (cond [(eq_attr "vconstraint" "no")
+         (const_string "yes")
+
+         (and (eq_attr "vconstraint" "W21")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 2"))
+	 (const_string "no")
+
+         (and (eq_attr "vconstraint" "W42,W41")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4"))
+	 (const_string "no")
+
+         (and (eq_attr "vconstraint" "W84,W81,W82")
+	      (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8"))
+	 (const_string "no")
+        ]
+       (const_string "yes")))
+
 ;; Attribute to control enable or disable instructions.
 (define_attr "enabled" "no,yes"
   (cond [
@@ -509,6 +530,9 @@ 
 
     (eq_attr "fp_vector_disabled" "yes")
     (const_string "no")
+
+    (eq_attr "vconstraint_enabled" "no")
+    (const_string "no")
   ]
   (const_string "yes")))
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index ba9c9e5a9b6..bace900fee5 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3898,22 +3898,22 @@ 
 
 ;; vwcvt<u>.x.x.v
 (define_insn "@pred_<optab><mode>"
-  [(set (match_operand:VWEXTI 0 "register_operand"                  "=&vr,&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand"                   "=vr,   vr,   vr,   vr,  vr,    vr, ?&vr, ?&vr")
 	(if_then_else:VWEXTI
 	  (unspec:<VM>
-	    [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1")
-	     (match_operand 4 "vector_length_operand"              "   rK,   rK")
-	     (match_operand 5 "const_int_operand"                  "    i,    i")
-	     (match_operand 6 "const_int_operand"                  "    i,    i")
-	     (match_operand 7 "const_int_operand"                  "    i,    i")
+	    [(match_operand:<VM> 1 "vector_mask_operand"           "vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1,vmWc1")
+	     (match_operand 4 "vector_length_operand"              "   rK,   rK,   rK,   rK,   rK,   rK,   rK,   rK")
+	     (match_operand 5 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+	     (match_operand 6 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
+	     (match_operand 7 "const_int_operand"                  "    i,    i,    i,    i,    i,    i,    i,    i")
 	     (reg:SI VL_REGNUM)
 	     (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
 	  (plus:VWEXTI
 	    (any_extend:VWEXTI
-	      (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,   vr"))
+	      (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "  W21,  W21,  W42,  W42,  W84,  W84,   vr,   vr"))
 	    (vec_duplicate:VWEXTI
 	      (reg:<VEL> X0_REGNUM)))
-	  (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
+	  (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0,   vu,    0,   vu,    0,   vu,    0")))]
   "TARGET_VECTOR"
   "vwcvt<u>.x.x.v\t%0,%3%p1"
   [(set_attr "type" "viwalu")
@@ -3921,7 +3921,8 @@ 
    (set_attr "vl_op_idx" "4")
    (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[5])"))
    (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[6])"))
-   (set (attr "avl_type_idx") (const_int 7))])
+   (set (attr "avl_type_idx") (const_int 7))
+   (set_attr "vconstraint" "W21,W21,W42,W42,W84,W84,no,no")])
 
 ;; -------------------------------------------------------------------------------
 ;; ---- Predicated integer Narrowing operations
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
new file mode 100644
index 00000000000..6b9a7c448f0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-1.c
@@ -0,0 +1,104 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+	  size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
+	  size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
+	  size_t sum15)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
+	 + sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m1_t v0 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v1 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v2 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v3 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v4 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v5 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v6 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v7 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v8 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v9 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v10 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v11 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v12 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v13 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v14 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      vint8m1_t v15 = __riscv_vle8_v_i8m1 ((void *) it, vl);
+      it += vl;
+      
+      asm volatile("nop" ::: "memory");
+      vint16m2_t vw0 = __riscv_vwcvt_x_x_v_i16m2 (v0, vl);
+      vint16m2_t vw1 = __riscv_vwcvt_x_x_v_i16m2 (v1, vl);
+      vint16m2_t vw2 = __riscv_vwcvt_x_x_v_i16m2 (v2, vl);
+      vint16m2_t vw3 = __riscv_vwcvt_x_x_v_i16m2 (v3, vl);
+      vint16m2_t vw4 = __riscv_vwcvt_x_x_v_i16m2 (v4, vl);
+      vint16m2_t vw5 = __riscv_vwcvt_x_x_v_i16m2 (v5, vl);
+      vint16m2_t vw6 = __riscv_vwcvt_x_x_v_i16m2 (v6, vl);
+      vint16m2_t vw7 = __riscv_vwcvt_x_x_v_i16m2 (v7, vl);
+      vint16m2_t vw8 = __riscv_vwcvt_x_x_v_i16m2 (v8, vl);
+      vint16m2_t vw9 = __riscv_vwcvt_x_x_v_i16m2 (v9, vl);
+      vint16m2_t vw10 = __riscv_vwcvt_x_x_v_i16m2 (v10, vl);
+      vint16m2_t vw11 = __riscv_vwcvt_x_x_v_i16m2 (v11, vl);
+      vint16m2_t vw12 = __riscv_vwcvt_x_x_v_i16m2 (v12, vl);
+      vint16m2_t vw13 = __riscv_vwcvt_x_x_v_i16m2 (v13, vl);
+      vint16m2_t vw14 = __riscv_vwcvt_x_x_v_i16m2 (v14, vl);
+      vint16m2_t vw15 = __riscv_vwcvt_x_x_v_i16m2 (v15, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m2_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m2_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m2_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m2_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m2_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m2_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m2_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m2_i16 (vw7);
+      size_t sum8 = __riscv_vmv_x_s_i16m2_i16 (vw8);
+      size_t sum9 = __riscv_vmv_x_s_i16m2_i16 (vw9);
+      size_t sum10 = __riscv_vmv_x_s_i16m2_i16 (vw10);
+      size_t sum11 = __riscv_vmv_x_s_i16m2_i16 (vw11);
+      size_t sum12 = __riscv_vmv_x_s_i16m2_i16 (vw12);
+      size_t sum13 = __riscv_vmv_x_s_i16m2_i16 (vw13);
+      size_t sum14 = __riscv_vmv_x_s_i16m2_i16 (vw14);
+      size_t sum15 = __riscv_vmv_x_s_i16m2_i16 (vw15);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8,
+		       sum9, sum10, sum11, sum12, sum13, sum14, sum15);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
new file mode 100644
index 00000000000..da92d59406f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-2.c
@@ -0,0 +1,68 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+	  size_t sum5, size_t sum6, size_t sum7)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m2_t v0 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v1 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v2 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v3 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v4 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v5 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v6 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+      vint8m2_t v7 = __riscv_vle8_v_i8m2 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m4_t vw0 = __riscv_vwcvt_x_x_v_i16m4 (v0, vl);
+      vint16m4_t vw1 = __riscv_vwcvt_x_x_v_i16m4 (v1, vl);
+      vint16m4_t vw2 = __riscv_vwcvt_x_x_v_i16m4 (v2, vl);
+      vint16m4_t vw3 = __riscv_vwcvt_x_x_v_i16m4 (v3, vl);
+      vint16m4_t vw4 = __riscv_vwcvt_x_x_v_i16m4 (v4, vl);
+      vint16m4_t vw5 = __riscv_vwcvt_x_x_v_i16m4 (v5, vl);
+      vint16m4_t vw6 = __riscv_vwcvt_x_x_v_i16m4 (v6, vl);
+      vint16m4_t vw7 = __riscv_vwcvt_x_x_v_i16m4 (v7, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m4_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m4_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m4_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m4_i16 (vw3);
+      size_t sum4 = __riscv_vmv_x_s_i16m4_i16 (vw4);
+      size_t sum5 = __riscv_vmv_x_s_i16m4_i16 (vw5);
+      size_t sum6 = __riscv_vmv_x_s_i16m4_i16 (vw6);
+      size_t sum7 = __riscv_vmv_x_s_i16m4_i16 (vw7);
+
+      sum += sumation (sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
new file mode 100644
index 00000000000..46f93a9049b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-3.c
@@ -0,0 +1,51 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3)
+{
+  return sum0 + sum1 + sum2 + sum3;
+}
+
+size_t
+foo (char const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = __riscv_vsetvlmax_e8m8 ();
+  size_t step = vl * 4;
+  const char *it = buf, *end = buf + len;
+  for (; it + step <= end;)
+    {
+      vint8m4_t v0 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v1 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v2 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+      vint8m4_t v3 = __riscv_vle8_v_i8m4 ((void *) it, vl);
+      it += vl;
+
+      asm volatile("nop" ::: "memory");
+      vint16m8_t vw0 = __riscv_vwcvt_x_x_v_i16m8 (v0, vl);
+      vint16m8_t vw1 = __riscv_vwcvt_x_x_v_i16m8 (v1, vl);
+      vint16m8_t vw2 = __riscv_vwcvt_x_x_v_i16m8 (v2, vl);
+      vint16m8_t vw3 = __riscv_vwcvt_x_x_v_i16m8 (v3, vl);
+
+      asm volatile("nop" ::: "memory");
+      size_t sum0 = __riscv_vmv_x_s_i16m8_i16 (vw0);
+      size_t sum1 = __riscv_vmv_x_s_i16m8_i16 (vw1);
+      size_t sum2 = __riscv_vmv_x_s_i16m8_i16 (vw2);
+      size_t sum3 = __riscv_vmv_x_s_i16m8_i16 (vw3);
+
+      sum += sumation (sum0, sum1, sum2, sum3);
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {vmv1r} } } */
+/* { dg-final { scan-assembler-not {vmv2r} } } */
+/* { dg-final { scan-assembler-not {vmv4r} } } */
+/* { dg-final { scan-assembler-not {vmv8r} } } */
+/* { dg-final { scan-assembler-not {csrr} } } */