RISC-V: Add vwadd.wv/vwsub.wv auto-vectorization lowering optimization
Commit Message
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
1. This patch optimizes the codegen of the following auto-vectorization case:
void foo (int32_t *__restrict a, int64_t *__restrict b, int64_t *__restrict c, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = (int64_t) a[i] + b[i];
}
The combine pass now rewrites the instruction sequence from:
...
vsext.vf2
vadd.vv
...
into:
...
vwadd.wv
...
Since for a PLUS operation, GCC prefers the following RTL operand order when combining:

(plus: (sign_extend:...)
       (reg:...))

instead of

(plus: (reg:...)
       (sign_extend:...))

which is different from the MINUS pattern.

I split the vwadd/vwsub patterns and added a dedicated pattern for each.
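Concretely, the two dedicated patterns match the following RTL shapes (a sketch only; the full predicated forms are in the vector.md hunks below, and the names "wide"/"narrow" are illustrative):

;; vwadd<u>.wv: the extended narrow operand comes first, matching the
;; canonical PLUS operand order that combine produces.
(plus:VWEXTI (any_extend:VWEXTI (reg:<V_DOUBLE_TRUNC> narrow))
             (reg:VWEXTI wide))

;; vwsub<u>.wv: MINUS is not commutative, so no canonicalization applies;
;; the wide operand stays first and the extended operand second.
(minus:VWEXTI (reg:VWEXTI wide)
              (any_extend:VWEXTI (reg:<V_DOUBLE_TRUNC> narrow)))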
2. Besides the case mentioned in (1), this patch also enhances the vwadd.vv/vwsub.vv
optimization for more complicated PLUS/MINUS code; consider the following:
__attribute__ ((noipa)) void
vwadd_int16_t_int8_t (int16_t *__restrict dst, int16_t *__restrict dst2,
                      int16_t *__restrict dst3, int8_t *__restrict a,
                      int8_t *__restrict b, int8_t *__restrict a2,
                      int8_t *__restrict b2, int n)
{
  for (int i = 0; i < n; i++)
    {
      dst[i] = (int16_t) a[i] + (int16_t) b[i];
      dst2[i] = (int16_t) a2[i] + (int16_t) b[i];
      dst3[i] = (int16_t) a2[i] + (int16_t) a[i];
    }
}
Before this patch:
...
vsetvli zero,a6,e8,mf2,ta,ma
vle8.v v2,0(a3)
vle8.v v1,0(a4)
vsetvli t1,zero,e16,m1,ta,ma
vsext.vf2 v3,v2
vsext.vf2 v2,v1
vadd.vv v1,v2,v3
vsetvli zero,a6,e16,m1,ta,ma
vse16.v v1,0(a0)
vle8.v v4,0(a5)
vsetvli t1,zero,e16,m1,ta,ma
vsext.vf2 v1,v4
vadd.vv v2,v1,v2
...
After this patch:
...
vsetvli zero,a6,e8,mf2,ta,ma
vle8.v v3,0(a4)
vle8.v v1,0(a3)
vsetvli t4,zero,e8,mf2,ta,ma
vwadd.vv v2,v1,v3
vsetvli zero,a6,e16,m1,ta,ma
vse16.v v2,0(a0)
vle8.v v2,0(a5)
vsetvli t4,zero,e8,mf2,ta,ma
vwadd.vv v4,v3,v2
vsetvli zero,a6,e16,m1,ta,ma
vse16.v v4,0(a1)
vsetvli t4,zero,e8,mf2,ta,ma
sub a7,a7,a6
vwadd.vv v3,v2,v1
vsetvli zero,a6,e16,m1,ta,ma
vse16.v v3,0(a2)
...
The reason current upstream GCC cannot thoroughly optimize such code into vwadd.vv is that
the combine pass needs an intermediate RTL pattern in which only one operand is extended
(the vwadd.wv pattern); starting from that intermediate pattern, it can then extend the
other operand and match vwadd.vv.
So the vwadd.wv/vwsub.wv patterns directly help the vwadd.vv/vwsub.vv optimizations.
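Roughly, combine reaches vwadd.vv in two steps (a sketch; modes and pseudo names are illustrative, not taken from the patch):

;; Step 1: fold one extend into the add; this now matches the dedicated
;; vwadd.wv pattern and becomes the intermediate insn:
(plus:VWEXTI (sign_extend:VWEXTI (reg:<V_DOUBLE_TRUNC> a))
             (reg:VWEXTI b_wide))

;; Step 2: when b_wide is itself defined as an extension of a narrow
;; register, combine substitutes that definition and the result matches
;; the dual-widen vwadd.vv pattern:
(plus:VWEXTI (sign_extend:VWEXTI (reg:<V_DOUBLE_TRUNC> a))
             (sign_extend:VWEXTI (reg:<V_DOUBLE_TRUNC> b)))

Without the vwadd.wv pattern, step 1 has no insn to match, so combine never reaches step 2.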
gcc/ChangeLog:
* config/riscv/riscv-vector-builtins-bases.cc: Change the vwadd.wv/vwsub.wv intrinsic API expander.
* config/riscv/vector.md (@pred_single_widen_<plus_minus:optab><any_extend:su><mode>): Remove it.
(@pred_single_widen_sub<any_extend:su><mode>): New pattern.
(@pred_single_widen_add<any_extend:su><mode>): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/widen/widen-5.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen-6.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen-complicate-1.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen-complicate-2.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen_run-5.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen_run-6.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc | 8 +++--
gcc/config/riscv/vector.md | 29 +++++++++++++---
.../riscv/rvv/autovec/widen/widen-5.c | 27 +++++++++++++++
.../riscv/rvv/autovec/widen/widen-6.c | 27 +++++++++++++++
.../rvv/autovec/widen/widen-complicate-1.c | 31 +++++++++++++++++
.../rvv/autovec/widen/widen-complicate-2.c | 31 +++++++++++++++++
.../riscv/rvv/autovec/widen/widen_run-5.c | 34 +++++++++++++++++++
.../riscv/rvv/autovec/widen/widen_run-6.c | 34 +++++++++++++++++++
8 files changed, 215 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-5.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-6.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-5.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-6.c
Comments
On 5/31/23 21:48, juzhe.zhong@rivai.ai wrote:
> [...]
> Since for a PLUS operation, GCC prefers the following RTL operand order
> when combining:
>
> (plus: (sign_extend:...)
>        (reg:...))
>
> instead of
>
> (plus: (reg:...)
>        (sign_extend:...))
>
> which is different from the MINUS pattern.
Right. Canonicalization rules will have the sign_extend as the first
operand when the opcode is associative.
> [...]
OK
jeff
Committed, thanks Jeff.
Pan
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -361,8 +361,12 @@ public:
return e.use_exact_insn (
code_for_pred_dual_widen_scalar (CODE1, CODE2, e.vector_mode ()));
case OP_TYPE_wv:
- return e.use_exact_insn (
- code_for_pred_single_widen (CODE1, CODE2, e.vector_mode ()));
+ if (CODE1 == PLUS)
+ return e.use_exact_insn (
+ code_for_pred_single_widen_add (CODE2, e.vector_mode ()));
+ else
+ return e.use_exact_insn (
+ code_for_pred_single_widen_sub (CODE2, e.vector_mode ()));
case OP_TYPE_wx:
return e.use_exact_insn (
code_for_pred_single_widen_scalar (CODE1, CODE2, e.vector_mode ()));
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3131,7 +3131,7 @@
[(set_attr "type" "vi<widen_binop_insn_type>")
(set_attr "mode" "<V_DOUBLE_TRUNC>")])
-(define_insn "@pred_single_widen_<plus_minus:optab><any_extend:su><mode>"
+(define_insn "@pred_single_widen_sub<any_extend:su><mode>"
[(set (match_operand:VWEXTI 0 "register_operand" "=&vr,&vr")
(if_then_else:VWEXTI
(unspec:<VM>
@@ -3142,14 +3142,35 @@
(match_operand 8 "const_int_operand" " i, i")
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
- (plus_minus:VWEXTI
+ (minus:VWEXTI
(match_operand:VWEXTI 3 "register_operand" " vr, vr")
(any_extend:VWEXTI
(match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr")))
(match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
"TARGET_VECTOR"
- "vw<plus_minus:insn><any_extend:u>.wv\t%0,%3,%4%p1"
- [(set_attr "type" "vi<widen_binop_insn_type>")
+ "vwsub<any_extend:u>.wv\t%0,%3,%4%p1"
+ [(set_attr "type" "viwalu")
+ (set_attr "mode" "<V_DOUBLE_TRUNC>")])
+
+(define_insn "@pred_single_widen_add<any_extend:su><mode>"
+ [(set (match_operand:VWEXTI 0 "register_operand" "=&vr,&vr")
+ (if_then_else:VWEXTI
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1,vmWc1")
+ (match_operand 5 "vector_length_operand" " rK, rK")
+ (match_operand 6 "const_int_operand" " i, i")
+ (match_operand 7 "const_int_operand" " i, i")
+ (match_operand 8 "const_int_operand" " i, i")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (plus:VWEXTI
+ (any_extend:VWEXTI
+ (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" " vr, vr"))
+ (match_operand:VWEXTI 3 "register_operand" " vr, vr"))
+ (match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
+ "TARGET_VECTOR"
+ "vwadd<any_extend:u>.wv\t%0,%3,%4%p1"
+ [(set_attr "type" "viwalu")
(set_attr "mode" "<V_DOUBLE_TRUNC>")])
(define_insn "@pred_single_widen_<plus_minus:optab><any_extend:su><mode>_scalar"
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-5.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2) \
+ __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 (TYPE1 *__restrict dst, \
+ TYPE2 *__restrict a, \
+ TYPE1 *__restrict b, \
+ int n) \
+ { \
+ for (int i = 0; i < n; i++) \
+ dst[i] = (TYPE1) a[i] + b[i]; \
+ }
+
+#define TEST_ALL() \
+ TEST_TYPE (int16_t, int8_t) \
+ TEST_TYPE (uint16_t, uint8_t) \
+ TEST_TYPE (int32_t, int16_t) \
+ TEST_TYPE (uint32_t, uint16_t) \
+ TEST_TYPE (int64_t, int32_t) \
+ TEST_TYPE (uint64_t, uint32_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvwadd\.wv} 3 } } */
+/* { dg-final { scan-assembler-times {\tvwaddu\.wv} 3 } } */
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-6.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2) \
+ __attribute__ ((noipa)) void vwsub_##TYPE1_##TYPE2 (TYPE1 *__restrict dst, \
+ TYPE1 *__restrict a, \
+ TYPE2 *__restrict b, \
+ int n) \
+ { \
+ for (int i = 0; i < n; i++) \
+ dst[i] = a[i] - (TYPE1) b[i]; \
+ }
+
+#define TEST_ALL() \
+ TEST_TYPE (int16_t, int8_t) \
+ TEST_TYPE (uint16_t, uint8_t) \
+ TEST_TYPE (int32_t, int16_t) \
+ TEST_TYPE (uint32_t, uint16_t) \
+ TEST_TYPE (int64_t, int32_t) \
+ TEST_TYPE (uint64_t, uint32_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvwsub\.wv} 3 } } */
+/* { dg-final { scan-assembler-times {\tvwsubu\.wv} 3 } } */
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2) \
+ __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( \
+ TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, \
+ TYPE2 *__restrict a, TYPE2 *__restrict b, TYPE2 *__restrict a2, \
+ TYPE2 *__restrict b2, int n) \
+ { \
+ for (int i = 0; i < n; i++) \
+ { \
+ dst[i] = (TYPE1) a[i] + (TYPE1) b[i]; \
+ dst2[i] = (TYPE1) a2[i] + (TYPE1) b[i]; \
+ dst3[i] = (TYPE1) a2[i] + (TYPE1) a[i]; \
+ } \
+ }
+
+#define TEST_ALL() \
+ TEST_TYPE (int16_t, int8_t) \
+ TEST_TYPE (uint16_t, uint8_t) \
+ TEST_TYPE (int32_t, int16_t) \
+ TEST_TYPE (uint32_t, uint16_t) \
+ TEST_TYPE (int64_t, int32_t) \
+ TEST_TYPE (uint64_t, uint32_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvwadd\.vv} 9 } } */
+/* { dg-final { scan-assembler-times {\tvwaddu\.vv} 9 } } */
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_TYPE(TYPE1, TYPE2) \
+ __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( \
+ TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, \
+ TYPE2 *__restrict a, TYPE2 *__restrict b, TYPE2 *__restrict a2, \
+ TYPE2 *__restrict b2, int n) \
+ { \
+ for (int i = 0; i < n; i++) \
+ { \
+ dst[i] = (TYPE1) a[i] - (TYPE1) b[i]; \
+ dst2[i] = (TYPE1) a2[i] - (TYPE1) b[i]; \
+ dst3[i] = (TYPE1) a2[i] - (TYPE1) a[i]; \
+ } \
+ }
+
+#define TEST_ALL() \
+ TEST_TYPE (int16_t, int8_t) \
+ TEST_TYPE (uint16_t, uint8_t) \
+ TEST_TYPE (int32_t, int16_t) \
+ TEST_TYPE (uint32_t, uint16_t) \
+ TEST_TYPE (int64_t, int32_t) \
+ TEST_TYPE (uint64_t, uint32_t)
+
+TEST_ALL ()
+
+/* { dg-final { scan-assembler-times {\tvwsub\.vv} 9 } } */
+/* { dg-final { scan-assembler-times {\tvwsubu\.vv} 9 } } */
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-5.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include <assert.h>
+#include "widen-5.c"
+
+#define SZ 512
+
+#define RUN(TYPE1, TYPE2, LIMIT) \
+ TYPE2 a##TYPE2[SZ]; \
+ TYPE1 b##TYPE1[SZ]; \
+ TYPE1 dst##TYPE1[SZ]; \
+ for (int i = 0; i < SZ; i++) \
+ { \
+ a##TYPE2[i] = LIMIT + i % 8723; \
+ b##TYPE1[i] = LIMIT + i & 1964; \
+ } \
+ vwadd_##TYPE1_##TYPE2 (dst##TYPE1, a##TYPE2, b##TYPE1, SZ); \
+ for (int i = 0; i < SZ; i++) \
+ assert (dst##TYPE1[i] == ((TYPE1) a##TYPE2[i] + (TYPE1) b##TYPE1[i]));
+
+#define RUN_ALL() \
+ RUN (int16_t, int8_t, -128) \
+ RUN (uint16_t, uint8_t, 255) \
+ RUN (int32_t, int16_t, -32768) \
+ RUN (uint32_t, uint16_t, 65535) \
+ RUN (int64_t, int32_t, -2147483648) \
+ RUN (uint64_t, uint32_t, 4294967295)
+
+int
+main ()
+{
+ RUN_ALL ()
+}
new file mode 100644
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-6.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include <assert.h>
+#include "widen-6.c"
+
+#define SZ 512
+
+#define RUN(TYPE1, TYPE2, LIMIT) \
+ TYPE1 a##TYPE1[SZ]; \
+ TYPE2 b##TYPE2[SZ]; \
+ TYPE1 dst##TYPE1[SZ]; \
+ for (int i = 0; i < SZ; i++) \
+ { \
+ a##TYPE1[i] = LIMIT + i % 8723; \
+ b##TYPE2[i] = LIMIT + i & 1964; \
+ } \
+ vwsub_##TYPE1_##TYPE2 (dst##TYPE1, a##TYPE1, b##TYPE2, SZ); \
+ for (int i = 0; i < SZ; i++) \
+ assert (dst##TYPE1[i] == ((TYPE1) a##TYPE1[i] - (TYPE1) b##TYPE2[i]));
+
+#define RUN_ALL() \
+ RUN (int16_t, int8_t, -128) \
+ RUN (uint16_t, uint8_t, 255) \
+ RUN (int32_t, int16_t, -32768) \
+ RUN (uint32_t, uint16_t, 65535) \
+ RUN (int64_t, int32_t, -2147483648) \
+ RUN (uint64_t, uint32_t, 4294967295)
+
+int
+main ()
+{
+ RUN_ALL ()
+}