[V2] RISC-V: Support non-SLP unordered reduction
Checks
Commit Message
This patch adds reduc_*_scal patterns to support reduction auto-vectorization.
Use COND_LEN_* + reduc_*_scal to support unordered non-SLP auto-vectorization.
Consider the following case:
int __attribute__((noipa))
and_loop (int32_t * __restrict x,
int32_t n, int res)
{
for (int i = 0; i < n; ++i)
res &= x[i];
return res;
}
ASM:
and_loop:
ble a1,zero,.L4
vsetvli a3,zero,e32,m1,ta,ma
vmv.v.i v1,-1
.L3:
vsetvli a5,a1,e32,m1,tu,ma ------------> MUST BE "TU".
slli a4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
add a0,a0,a4
vand.vv v1,v2,v1
bne a1,zero,.L3
vsetivli zero,1,e32,m1,ta,ma
vmv.v.i v2,-1
vsetvli a3,zero,e32,m1,ta,ma
vredand.vs v1,v1,v2
vmv.x.s a5,v1
and a0,a2,a5
ret
.L4:
mv a0,a2
ret
This patch also fixes a bug in the VSETVL pass that is exposed by the reduction testcases.
SLP reduction and floating-point in-order reduction are not supported yet.
gcc/ChangeLog:
* config/riscv/autovec.md (reduc_plus_scal_<mode>): New pattern.
(reduc_smax_scal_<mode>): Ditto.
(reduc_umax_scal_<mode>): Ditto.
(reduc_smin_scal_<mode>): Ditto.
(reduc_umin_scal_<mode>): Ditto.
(reduc_and_scal_<mode>): Ditto.
(reduc_ior_scal_<mode>): Ditto.
(reduc_xor_scal_<mode>): Ditto.
* config/riscv/riscv-protos.h (enum insn_type): Add reduction.
(expand_reduction): New function.
* config/riscv/riscv-v.cc (emit_vlmax_reduction_insn): Ditto.
(emit_vlmax_fp_reduction_insn): Ditto.
(get_m1_mode): Ditto.
(expand_cond_len_binop): Fix name.
(expand_reduction): New function.
* config/riscv/riscv-vsetvl.cc (gen_vsetvl_pat): Fix VSETVL BUG.
(validate_change_or_fail): New function.
(change_insn): Fix VSETVL BUG.
(change_vsetvl_insn): Ditto.
(pass_vsetvl::backward_demand_fusion): Ditto.
(pass_vsetvl::df_post_optimization): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/rvv.exp: Add reduction tests.
* gcc.target/riscv/rvv/autovec/reduc/reduc-1.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc-2.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc-3.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc-4.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c: New test.
* gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c: New test.
---
gcc/config/riscv/autovec.md | 138 ++++++++++++++++++
gcc/config/riscv/riscv-protos.h | 2 +
gcc/config/riscv/riscv-v.cc | 84 ++++++++++-
gcc/config/riscv/riscv-vsetvl.cc | 57 ++++++--
.../riscv/rvv/autovec/reduc/reduc-1.c | 118 +++++++++++++++
.../riscv/rvv/autovec/reduc/reduc-2.c | 129 ++++++++++++++++
.../riscv/rvv/autovec/reduc/reduc-3.c | 65 +++++++++
.../riscv/rvv/autovec/reduc/reduc-4.c | 59 ++++++++
.../riscv/rvv/autovec/reduc/reduc_run-1.c | 56 +++++++
.../riscv/rvv/autovec/reduc/reduc_run-2.c | 79 ++++++++++
.../riscv/rvv/autovec/reduc/reduc_run-3.c | 49 +++++++
.../riscv/rvv/autovec/reduc/reduc_run-4.c | 66 +++++++++
gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 +
13 files changed, 887 insertions(+), 17 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
Comments
LGTM, thanks :)
On Mon, Jul 17, 2023 at 4:20 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch adds reduc_*_scal patterns to support reduction auto-vectorization.
>
> Use COND_LEN_* + reduc_*_scal to support unordered non-SLP auto-vectorization.
>
> Consider the following case:
> int __attribute__((noipa))
> and_loop (int32_t * __restrict x,
> int32_t n, int res)
> {
> for (int i = 0; i < n; ++i)
> res &= x[i];
> return res;
> }
>
> ASM:
> and_loop:
> ble a1,zero,.L4
> vsetvli a3,zero,e32,m1,ta,ma
> vmv.v.i v1,-1
> .L3:
> vsetvli a5,a1,e32,m1,tu,ma ------------> MUST BE "TU".
> slli a4,a5,2
> sub a1,a1,a5
> vle32.v v2,0(a0)
> add a0,a0,a4
> vand.vv v1,v2,v1
> bne a1,zero,.L3
> vsetivli zero,1,e32,m1,ta,ma
> vmv.v.i v2,-1
> vsetvli a3,zero,e32,m1,ta,ma
> vredand.vs v1,v1,v2
> vmv.x.s a5,v1
> and a0,a2,a5
> ret
> .L4:
> mv a0,a2
> ret
>
> This patch also fixes a bug in the VSETVL pass that is exposed by the reduction testcases.
>
> SLP reduction and floating-point in-order reduction are not supported yet.
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md (reduc_plus_scal_<mode>): New pattern.
> (reduc_smax_scal_<mode>): Ditto.
> (reduc_umax_scal_<mode>): Ditto.
> (reduc_smin_scal_<mode>): Ditto.
> (reduc_umin_scal_<mode>): Ditto.
> (reduc_and_scal_<mode>): Ditto.
> (reduc_ior_scal_<mode>): Ditto.
> (reduc_xor_scal_<mode>): Ditto.
> * config/riscv/riscv-protos.h (enum insn_type): Add reduction.
> (expand_reduction): New function.
> * config/riscv/riscv-v.cc (emit_vlmax_reduction_insn): Ditto.
> (emit_vlmax_fp_reduction_insn): Ditto.
> (get_m1_mode): Ditto.
> (expand_cond_len_binop): Fix name.
> (expand_reduction): New function.
> * config/riscv/riscv-vsetvl.cc (gen_vsetvl_pat): Fix VSETVL BUG.
> (validate_change_or_fail): New function.
> (change_insn): Fix VSETVL BUG.
> (change_vsetvl_insn): Ditto.
> (pass_vsetvl::backward_demand_fusion): Ditto.
> (pass_vsetvl::df_post_optimization): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/rvv.exp: Add reduction tests.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-2.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-3.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-4.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c: New test.
>
> ---
> gcc/config/riscv/autovec.md | 138 ++++++++++++++++++
> gcc/config/riscv/riscv-protos.h | 2 +
> gcc/config/riscv/riscv-v.cc | 84 ++++++++++-
> gcc/config/riscv/riscv-vsetvl.cc | 57 ++++++--
> .../riscv/rvv/autovec/reduc/reduc-1.c | 118 +++++++++++++++
> .../riscv/rvv/autovec/reduc/reduc-2.c | 129 ++++++++++++++++
> .../riscv/rvv/autovec/reduc/reduc-3.c | 65 +++++++++
> .../riscv/rvv/autovec/reduc/reduc-4.c | 59 ++++++++
> .../riscv/rvv/autovec/reduc/reduc_run-1.c | 56 +++++++
> .../riscv/rvv/autovec/reduc/reduc_run-2.c | 79 ++++++++++
> .../riscv/rvv/autovec/reduc/reduc_run-3.c | 49 +++++++
> .../riscv/rvv/autovec/reduc/reduc_run-4.c | 66 +++++++++
> gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 +
> 13 files changed, 887 insertions(+), 17 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 64a41bd7101..8cdec75bacf 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -1554,3 +1554,141 @@
> riscv_vector::expand_cond_len_ternop (icode, operands);
> DONE;
> })
> +
> +;; =========================================================================
> +;; == Reductions
> +;; =========================================================================
> +
> +;; -------------------------------------------------------------------------
> +;; ---- [INT] Tree reductions
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - vredsum.vs
> +;; - vredmaxu.vs
> +;; - vredmax.vs
> +;; - vredminu.vs
> +;; - vredmin.vs
> +;; - vredand.vs
> +;; - vredor.vs
> +;; - vredxor.vs
> +;; -------------------------------------------------------------------------
> +
> +(define_expand "reduc_plus_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx min = immed_wide_int_const (wi::min_value (prec, SIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (SMAX, operands, min);
> + DONE;
> +})
> +
> +(define_expand "reduc_umax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (UMAX, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx max = immed_wide_int_const (wi::max_value (prec, SIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (SMIN, operands, max);
> + DONE;
> +})
> +
> +(define_expand "reduc_umin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx max = immed_wide_int_const (wi::max_value (prec, UNSIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (UMIN, operands, max);
> + DONE;
> +})
> +
> +(define_expand "reduc_and_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (AND, operands, CONSTM1_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_ior_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (IOR, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_xor_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (XOR, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +;; -------------------------------------------------------------------------
> +;; ---- [FP] Tree reductions
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - vfredusum.vs
> +;; - vfredmax.vs
> +;; - vfredmin.vs
> +;; -------------------------------------------------------------------------
> +
> +(define_expand "reduc_plus_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + REAL_VALUE_TYPE rv;
> + real_inf (&rv, true);
> + rtx f = const_double_from_real_value (rv, <VEL>mode);
> + riscv_vector::expand_reduction (SMAX, operands, f);
> + DONE;
> +})
> +
> +(define_expand "reduc_smin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + REAL_VALUE_TYPE rv;
> + real_inf (&rv, false);
> + rtx f = const_double_from_real_value (rv, <VEL>mode);
> + riscv_vector::expand_reduction (SMIN, operands, f);
> + DONE;
> +})
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index f91c2d51c3c..16fb8dabca0 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -198,6 +198,7 @@ enum insn_type
> RVV_COMPRESS_OP = 4,
> RVV_GATHER_M_OP = 5,
> RVV_SCATTER_M_OP = 4,
> + RVV_REDUCTION_OP = 3,
> };
> enum vlmul_type
> {
> @@ -281,6 +282,7 @@ bool has_vi_variant_p (rtx_code, rtx);
> void expand_vec_cmp (rtx, rtx_code, rtx, rtx);
> bool expand_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
> void expand_cond_len_binop (rtx_code, rtx *);
> +void expand_reduction (rtx_code, rtx *, rtx);
> #endif
> bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
> bool, void (*)(rtx *, rtx));
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c3fd4a1b03b..b4884a30872 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -1159,6 +1159,43 @@ emit_vlmax_compress_insn (unsigned icode, rtx *ops)
> e.emit_insn ((enum insn_code) icode, ops);
> }
>
> +/* Emit reduction instruction. */
> +static void
> +emit_vlmax_reduction_insn (unsigned icode, int op_num, rtx *ops)
> +{
> + machine_mode dest_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ true,
> + /* USE_REAL_MERGE_P */ false,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ true, dest_mode,
> + mask_mode);
> +
> + e.set_policy (TAIL_ANY);
> + e.emit_insn ((enum insn_code) icode, ops);
> +}
> +
> +/* Emit floating-point reduction instruction (sets the dynamic rounding mode). */
> +static void
> +emit_vlmax_fp_reduction_insn (unsigned icode, int op_num, rtx *ops)
> +{
> + machine_mode dest_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ true,
> + /* USE_REAL_MERGE_P */ false,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ true, dest_mode,
> + mask_mode);
> +
> + e.set_policy (TAIL_ANY);
> + e.set_rounding_mode (FRM_DYN);
> + e.emit_insn ((enum insn_code) icode, ops);
> +}
> +
> /* Emit merge instruction. */
>
> static machine_mode
> @@ -1651,6 +1688,17 @@ get_mask_mode (machine_mode mode)
> return get_vector_mode (BImode, GET_MODE_NUNITS (mode));
> }
>
> +/* Return the appropriate M1 mode for MODE. */
> +
> +static opt_machine_mode
> +get_m1_mode (machine_mode mode)
> +{
> + scalar_mode smode = GET_MODE_INNER (mode);
> + unsigned int bytes = GET_MODE_SIZE (smode);
> + poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
> + return get_vector_mode (smode, m1_nunits);
> +}
> +
> /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
> This function is not only used by builtins, but also will be used by
> auto-vectorization in the future. */
> @@ -3121,9 +3169,9 @@ expand_cond_len_binop (rtx_code code, rtx *ops)
> rtx ops[] = {dest, mask, merge, src1, src2};
> insn_code icode = code_for_pred (code, mode);
> if (needs_fp_rounding (code, mode))
> - emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> + emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_TU, ops, len);
> else
> - emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> + emit_nonvlmax_tu_insn (icode, RVV_BINOP_TU, ops, len);
> }
> else
> /* FIXME: Enable this case when we support it in the middle-end. */
> @@ -3316,4 +3364,36 @@ expand_cond_len_ternop (unsigned icode, rtx *ops)
> gcc_unreachable ();
> }
>
> +/* Expand reduction operations. */
> +void
> +expand_reduction (rtx_code code, rtx *ops, rtx init)
> +{
> + machine_mode vmode = GET_MODE (ops[1]);
> + machine_mode m1_mode = get_m1_mode (vmode).require ();
> + machine_mode m1_mmode = get_mask_mode (m1_mode).require ();
> +
> + rtx m1_tmp = gen_reg_rtx (m1_mode);
> + rtx m1_mask = gen_scalar_move_mask (m1_mmode);
> + rtx m1_undef = RVV_VUNDEF (m1_mode);
> + rtx scalar_move_ops[] = {m1_tmp, m1_mask, m1_undef, init};
> + emit_scalar_move_insn (code_for_pred_broadcast (m1_mode), scalar_move_ops);
> +
> + rtx m1_tmp2 = gen_reg_rtx (m1_mode);
> + rtx reduc_ops[] = {m1_tmp2, ops[1], m1_tmp};
> +
> + if (FLOAT_MODE_P (vmode) && code == PLUS)
> + {
> + insn_code icode
> + = code_for_pred_reduc_plus (UNSPEC_UNORDERED, vmode, m1_mode);
> + emit_vlmax_fp_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
> + }
> + else
> + {
> + insn_code icode = code_for_pred_reduc (code, vmode, m1_mode);
> + emit_vlmax_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
> + }
> +
> + emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
> +}
> +
> } // namespace riscv_vector
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
> index 586dc8e5379..bb7ba129a5d 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -646,7 +646,8 @@ gen_vsetvl_pat (enum vsetvl_type insn_type, const vl_vtype_info &info, rtx vl)
> }
>
> static rtx
> -gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
> +gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info,
> + rtx vl = NULL_RTX)
> {
> rtx new_pat;
> vl_vtype_info new_info = info;
> @@ -654,15 +655,17 @@ gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
> && fault_first_load_p (info.get_insn ()->rtl ()))
> new_info.set_avl_info (
> avl_info (get_avl (info.get_insn ()->rtl ()), nullptr));
> - if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
> + if (vl)
> + new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, vl);
> + else
> {
> - rtx dest = get_vl (rinsn);
> - new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, dest);
> + if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
> + new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, get_vl (rinsn));
> + else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
> + new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
> + else
> + new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
> }
> - else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
> - new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
> - else
> - new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
> return new_pat;
> }
>
> @@ -805,6 +808,14 @@ get_vl_vtype_info (const insn_info *insn)
> return info;
> }
>
> +/* Change insn and assert that the change always happens. */
> +static void
> +validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
> +{
> + bool change_p = validate_change (object, loc, new_rtx, in_group);
> + gcc_assert (change_p);
> +}
> +
> static void
> change_insn (rtx_insn *rinsn, rtx new_pat)
> {
> @@ -818,7 +829,7 @@ change_insn (rtx_insn *rinsn, rtx new_pat)
> print_rtl_single (dump_file, PATTERN (rinsn));
> }
>
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, false);
>
> if (dump_file)
> {
> @@ -874,7 +885,7 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
> }
>
> insn_change_watermark watermark;
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, true);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, true);
>
> /* These routines report failures themselves. */
> if (!recog (attempt, change) || !change_is_worthwhile (change, false))
> @@ -931,7 +942,8 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
> }
>
> static void
> -change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
> +change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info,
> + rtx vl = NULL_RTX)
> {
> rtx_insn *rinsn;
> if (vector_config_insn_p (insn->rtl ()))
> @@ -945,7 +957,7 @@ change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
> rinsn = PREV_INSN (insn->rtl ());
> gcc_assert (vector_config_insn_p (rinsn));
> }
> - rtx new_pat = gen_vsetvl_pat (rinsn, info);
> + rtx new_pat = gen_vsetvl_pat (rinsn, info, vl);
> change_insn (rinsn, new_pat);
> }
>
> @@ -3377,7 +3389,20 @@ pass_vsetvl::backward_demand_fusion (void)
> new_info))
> continue;
>
> - change_vsetvl_insn (new_info.get_insn (), new_info);
> + rtx vl = NULL_RTX;
> + /* Backward VLMAX VL:
> + bb 3:
> + vsetivli zero, 1 ... -> vsetvli t1, zero
> + vmv.s.x
> + bb 5:
> + vsetvli t1, zero ... -> to be elided.
> + vlse16.v
> +
> + We should forward "t1". */
> + if (!block_info.reaching_out.has_avl_reg ()
> + && vlmax_avl_p (new_info.get_avl ()))
> + vl = get_vl (prop.get_insn ()->rtl ());
> + change_vsetvl_insn (new_info.get_insn (), new_info, vl);
> if (block_info.local_dem == block_info.reaching_out)
> block_info.local_dem = new_info;
> block_info.reaching_out = new_info;
> @@ -4524,13 +4549,15 @@ pass_vsetvl::df_post_optimization (void) const
> {
> rtx new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY,
> info, NULL_RTX);
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
> + false);
> }
> else if (!vlmax_avl_p (info.get_avl ()))
> {
> rtx new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info,
> NULL_RTX);
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
> + false);
> }
> }
> }
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> new file mode 100644
> index 00000000000..0d543af13ca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> @@ -0,0 +1,118 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define DEF_REDUC_PLUS(TYPE) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_plus_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 0; \
> + for (int i = 0; i < n; ++i) \
> + r += a[i]; \
> + return r; \
> +}
> +
> +#define TEST_PLUS(T) \
> + T (int8_t) \
> + T (int16_t) \
> + T (int32_t) \
> + T (int64_t) \
> + T (uint8_t) \
> + T (uint16_t) \
> + T (uint32_t) \
> + T (uint64_t) \
> + T (_Float16) \
> + T (float) \
> + T (double)
> +
> +TEST_PLUS (DEF_REDUC_PLUS)
> +
> +#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 13; \
> + for (int i = 0; i < n; ++i) \
> + r = a[i] CMP_OP r ? a[i] : r; \
> + return r; \
> +}
> +
> +#define TEST_MAXMIN(T) \
> + T (int8_t, max, >) \
> + T (int16_t, max, >) \
> + T (int32_t, max, >) \
> + T (int64_t, max, >) \
> + T (uint8_t, max, >) \
> + T (uint16_t, max, >) \
> + T (uint32_t, max, >) \
> + T (uint64_t, max, >) \
> + T (_Float16, max, >) \
> + T (float, max, >) \
> + T (double, max, >) \
> + \
> + T (int8_t, min, <) \
> + T (int16_t, min, <) \
> + T (int32_t, min, <) \
> + T (int64_t, min, <) \
> + T (uint8_t, min, <) \
> + T (uint16_t, min, <) \
> + T (uint32_t, min, <) \
> + T (uint64_t, min, <) \
> + T (_Float16, min, <) \
> + T (float, min, <) \
> + T (double, min, <)
> +
> +TEST_MAXMIN (DEF_REDUC_MAXMIN)
> +
> +#define DEF_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 13; \
> + for (int i = 0; i < n; ++i) \
> + r BIT_OP a[i]; \
> + return r; \
> +}
> +
> +#define TEST_BITWISE(T) \
> + T (int8_t, and, &=) \
> + T (int16_t, and, &=) \
> + T (int32_t, and, &=) \
> + T (int64_t, and, &=) \
> + T (uint8_t, and, &=) \
> + T (uint16_t, and, &=) \
> + T (uint32_t, and, &=) \
> + T (uint64_t, and, &=) \
> + \
> + T (int8_t, ior, |=) \
> + T (int16_t, ior, |=) \
> + T (int32_t, ior, |=) \
> + T (int64_t, ior, |=) \
> + T (uint8_t, ior, |=) \
> + T (uint16_t, ior, |=) \
> + T (uint32_t, ior, |=) \
> + T (uint64_t, ior, |=) \
> + \
> + T (int8_t, xor, ^=) \
> + T (int16_t, xor, ^=) \
> + T (int32_t, xor, ^=) \
> + T (int64_t, xor, ^=) \
> + T (uint8_t, xor, ^=) \
> + T (uint16_t, xor, ^=) \
> + T (uint32_t, xor, ^=) \
> + T (uint64_t, xor, ^=)
> +
> +TEST_BITWISE (DEF_REDUC_BITWISE)
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> new file mode 100644
> index 00000000000..136a8a378bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> @@ -0,0 +1,129 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define NUM_ELEMS(TYPE) (1024 / sizeof (TYPE))
> +
> +#define DEF_REDUC_PLUS(TYPE) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_plus_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = 0; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + r[i] += a[i][j]; \
> + } \
> +}
> +
> +#define TEST_PLUS(T) \
> + T (int8_t) \
> + T (int16_t) \
> + T (int32_t) \
> + T (int64_t) \
> + T (uint8_t) \
> + T (uint16_t) \
> + T (uint32_t) \
> + T (uint64_t) \
> + T (_Float16) \
> + T (float) \
> + T (double)
> +
> +TEST_PLUS (DEF_REDUC_PLUS)
> +
> +#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = a[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + r[i] = a[i][j] CMP_OP r[i] ? a[i][j] : r[i]; \
> + } \
> +}
> +
> +#define TEST_MAXMIN(T) \
> + T (int8_t, max, >) \
> + T (int16_t, max, >) \
> + T (int32_t, max, >) \
> + T (int64_t, max, >) \
> + T (uint8_t, max, >) \
> + T (uint16_t, max, >) \
> + T (uint32_t, max, >) \
> + T (uint64_t, max, >) \
> + T (_Float16, max, >) \
> + T (float, max, >) \
> + T (double, max, >) \
> + \
> + T (int8_t, min, <) \
> + T (int16_t, min, <) \
> + T (int32_t, min, <) \
> + T (int64_t, min, <) \
> + T (uint8_t, min, <) \
> + T (uint16_t, min, <) \
> + T (uint32_t, min, <) \
> + T (uint64_t, min, <) \
> + T (_Float16, min, <) \
> + T (float, min, <) \
> + T (double, min, <)
> +
> +TEST_MAXMIN (DEF_REDUC_MAXMIN)
> +
> +#define DEF_REDUC_BITWISE(TYPE,NAME,BIT_OP) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##TYPE (TYPE (*restrict a)[NUM_ELEMS(TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = a[i][0]; \
> + for (int j = 0; j < NUM_ELEMS(TYPE); j++) \
> + r[i] BIT_OP a[i][j]; \
> + } \
> +}
> +
> +#define TEST_BITWISE(T) \
> + T (int8_t, and, &=) \
> + T (int16_t, and, &=) \
> + T (int32_t, and, &=) \
> + T (int64_t, and, &=) \
> + T (uint8_t, and, &=) \
> + T (uint16_t, and, &=) \
> + T (uint32_t, and, &=) \
> + T (uint64_t, and, &=) \
> + \
> + T (int8_t, ior, |=) \
> + T (int16_t, ior, |=) \
> + T (int32_t, ior, |=) \
> + T (int64_t, ior, |=) \
> + T (uint8_t, ior, |=) \
> + T (uint16_t, ior, |=) \
> + T (uint32_t, ior, |=) \
> + T (uint64_t, ior, |=) \
> + \
> + T (int8_t, xor, ^=) \
> + T (int16_t, xor, ^=) \
> + T (int32_t, xor, ^=) \
> + T (int64_t, xor, ^=) \
> + T (uint8_t, xor, ^=) \
> + T (uint16_t, xor, ^=) \
> + T (uint32_t, xor, ^=) \
> + T (uint64_t, xor, ^=)
> +
> +TEST_BITWISE (DEF_REDUC_BITWISE)
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> new file mode 100644
> index 00000000000..c3638344f80
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res += x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n)
> +{
> + unsigned short res = ~0;
> + for (int i = 0; i < n; ++i)
> + res = res < x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res = res > x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n)
> +{
> + unsigned short res = ~0;
> + for (int i = 0; i < n; ++i)
> + res &= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res |= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res ^= x[i];
> + return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> new file mode 100644
> index 00000000000..f00a12826c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> @@ -0,0 +1,59 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res += x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res = res < x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res = res > x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res &= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res |= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res ^= x[i];
> + return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> new file mode 100644
> index 00000000000..b500f857598
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> @@ -0,0 +1,56 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-1.c"
> +
> +#define NUM_ELEMS(TYPE) (73 + sizeof (TYPE))
> +
> +#define INIT_VECTOR(TYPE) \
> + TYPE a[NUM_ELEMS (TYPE) + 1]; \
> + for (int i = 0; i < NUM_ELEMS (TYPE) + 1; i++) \
> + { \
> + a[i] = ((i * 2) * (i & 1 ? 1 : -1) | 3); \
> + asm volatile ("" ::: "memory"); \
> + }
> +
> +#define TEST_REDUC_PLUS(TYPE) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_plus_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 0; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 += a[i]; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 13; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 = a[i] CMP_OP r2 ? a[i] : r2; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 13; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 BIT_OP a[i]; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +int main ()
> +{
> + TEST_PLUS (TEST_REDUC_PLUS)
> + TEST_MAXMIN (TEST_REDUC_MAXMIN)
> + TEST_BITWISE (TEST_REDUC_BITWISE)
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> new file mode 100644
> index 00000000000..3c2f62557b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> @@ -0,0 +1,79 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
> +
> +#include "reduc-2.c"
> +
> +#define NROWS 53
> +
> +/* -ffast-math fuzz for PLUS. */
> +#define CMP__Float16(X, Y) ((X) >= (Y) * 0.875 && (X) <= (Y) * 1.125)
> +#define CMP_float(X, Y) ((X) == (Y))
> +#define CMP_double(X, Y) ((X) == (Y))
> +#define CMP_int8_t(X, Y) ((X) == (Y))
> +#define CMP_int16_t(X, Y) ((X) == (Y))
> +#define CMP_int32_t(X, Y) ((X) == (Y))
> +#define CMP_int64_t(X, Y) ((X) == (Y))
> +#define CMP_uint8_t(X, Y) ((X) == (Y))
> +#define CMP_uint16_t(X, Y) ((X) == (Y))
> +#define CMP_uint32_t(X, Y) ((X) == (Y))
> +#define CMP_uint64_t(X, Y) ((X) == (Y))
> +
> +#define INIT_MATRIX(TYPE) \
> + TYPE mat[NROWS][NUM_ELEMS (TYPE)]; \
> + TYPE r[NROWS]; \
> + for (int i = 0; i < NROWS; i++) \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + { \
> + mat[i][j] = i + (j * 2) * (j & 1 ? 1 : -1); \
> + asm volatile ("" ::: "memory"); \
> + }
> +
> +#define TEST_REDUC_PLUS(TYPE) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_plus_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = 0; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 += mat[i][j]; \
> + if (!CMP_##TYPE (r[i], r2)) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_##NAME##_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = mat[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 = mat[i][j] CMP_OP r2 ? mat[i][j] : r2; \
> + if (r[i] != r2) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_##NAME##_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = mat[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 BIT_OP mat[i][j]; \
> + if (r[i] != r2) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +int main ()
> +{
> + TEST_PLUS (TEST_REDUC_PLUS)
> + TEST_MAXMIN (TEST_REDUC_MAXMIN)
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> new file mode 100644
> index 00000000000..d1b22c0d69a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-3.c"
> +
> +#define N 0x1100
> +
> +int
> +main (void)
> +{
> + unsigned short x[N];
> + for (int i = 0; i < N; ++i)
> + x[i] = (i + 1) * (i + 2);
> +
> + if (add_loop (x, 0) != 0
> + || add_loop (x, 11) != 572
> + || add_loop (x, 0x100) != 22016
> + || add_loop (x, 0xfff) != 20480
> + || max_loop (x, 0) != 0
> + || max_loop (x, 11) != 132
> + || max_loop (x, 0x100) != 65280
> + || max_loop (x, 0xfff) != 65504
> + || or_loop (x, 0) != 0
> + || or_loop (x, 11) != 0xfe
> + || or_loop (x, 0x80) != 0x7ffe
> + || or_loop (x, 0xb4) != 0x7ffe
> + || or_loop (x, 0xb5) != 0xfffe
> + || eor_loop (x, 0) != 0
> + || eor_loop (x, 11) != 0xe8
> + || eor_loop (x, 0x100) != 0xcf00
> + || eor_loop (x, 0xfff) != 0xa000)
> + __builtin_abort ();
> +
> + for (int i = 0; i < N; ++i)
> + x[i] = ~x[i];
> +
> + if (min_loop (x, 0) != 65535
> + || min_loop (x, 11) != 65403
> + || min_loop (x, 0x100) != 255
> + || min_loop (x, 0xfff) != 31
> + || and_loop (x, 0) != 0xffff
> + || and_loop (x, 11) != 0xff01
> + || and_loop (x, 0x80) != 0x8001
> + || and_loop (x, 0xb4) != 0x8001
> + || and_loop (x, 0xb5) != 1)
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
> new file mode 100644
> index 00000000000..c17e125a763
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-4.c"
> +
> +#define N 0x1100
> +
> +int
> +main (void)
> +{
> + unsigned short x[N];
> + for (int i = 0; i < N; ++i)
> + x[i] = (i + 1) * (i + 2);
> +
> + if (add_loop (x, 0, 10) != 10
> + || add_loop (x, 11, 42) != 614
> + || add_loop (x, 0x100, 84) != 22100
> + || add_loop (x, 0xfff, 20) != 20500
> + || max_loop (x, 0, 10) != 10
> + || max_loop (x, 11, 131) != 132
> + || max_loop (x, 11, 133) != 133
> + || max_loop (x, 0x100, 65279) != 65280
> + || max_loop (x, 0x100, 65281) != 65281
> + || max_loop (x, 0xfff, 65503) != 65504
> + || max_loop (x, 0xfff, 65505) != 65505
> + || or_loop (x, 0, 0x71) != 0x71
> + || or_loop (x, 11, 0) != 0xfe
> + || or_loop (x, 11, 0xb3c) != 0xbfe
> + || or_loop (x, 0x80, 0) != 0x7ffe
> + || or_loop (x, 0x80, 1) != 0x7fff
> + || or_loop (x, 0xb4, 0) != 0x7ffe
> + || or_loop (x, 0xb4, 1) != 0x7fff
> + || or_loop (x, 0xb5, 0) != 0xfffe
> + || or_loop (x, 0xb5, 1) != 0xffff
> + || eor_loop (x, 0, 0x3e) != 0x3e
> + || eor_loop (x, 11, 0) != 0xe8
> + || eor_loop (x, 11, 0x1ff) != 0x117
> + || eor_loop (x, 0x100, 0) != 0xcf00
> + || eor_loop (x, 0x100, 0xeee) != 0xc1ee
> + || eor_loop (x, 0xfff, 0) != 0xa000
> + || eor_loop (x, 0xfff, 0x8888) != 0x2888)
> + __builtin_abort ();
> +
> + for (int i = 0; i < N; ++i)
> + x[i] = ~x[i];
> +
> + if (min_loop (x, 0, 10000) != 10000
> + || min_loop (x, 11, 65404) != 65403
> + || min_loop (x, 11, 65402) != 65402
> + || min_loop (x, 0x100, 256) != 255
> + || min_loop (x, 0x100, 254) != 254
> + || min_loop (x, 0xfff, 32) != 31
> + || min_loop (x, 0xfff, 30) != 30
> + || and_loop (x, 0, 0x1234) != 0x1234
> + || and_loop (x, 11, 0xffff) != 0xff01
> + || and_loop (x, 11, 0xcdef) != 0xcd01
> + || and_loop (x, 0x80, 0xffff) != 0x8001
> + || and_loop (x, 0x80, 0xfffe) != 0x8000
> + || and_loop (x, 0xb4, 0xffff) != 0x8001
> + || and_loop (x, 0xb4, 0xfffe) != 0x8000
> + || and_loop (x, 0xb5, 0xffff) != 1
> + || and_loop (x, 0xb5, 0xfffe) != 0)
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> index 19589fa9638..532c17c4065 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> @@ -71,6 +71,8 @@ foreach op $AUTOVEC_TEST_OPTS {
> "" "$op"
> dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/ternop/*.\[cS\]]] \
> "" "$op"
> + dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/reduc/*.\[cS\]]] \
> + "" "$op"
> }
>
> # widening operation only test on LMUL < 8
> --
> 2.36.1
>
Committed, thanks Kito.
Pan
-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Kito Cheng via Gcc-patches
Sent: Monday, July 17, 2023 5:33 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: gcc-patches@gcc.gnu.org; kito.cheng@gmail.com; palmer@dabbelt.com; palmer@rivosinc.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH V2] RISC-V: Support non-SLP unordered reduction
LGTM, thanks :)
On Mon, Jul 17, 2023 at 4:20 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch adds the reduc_*_scal patterns to support reduction auto-vectorization.
>
> Use COND_LEN_* + reduc_*_scal to support unordered non-SLP auto-vectorization.
>
> Consider the following case:
> int __attribute__((noipa))
> and_loop (int32_t * __restrict x,
> int32_t n, int res)
> {
> for (int i = 0; i < n; ++i)
> res &= x[i];
> return res;
> }
>
> ASM:
> and_loop:
> ble a1,zero,.L4
> vsetvli a3,zero,e32,m1,ta,ma
> vmv.v.i v1,-1
> .L3:
> vsetvli a5,a1,e32,m1,tu,ma ------------> MUST BE "TU".
> slli a4,a5,2
> sub a1,a1,a5
> vle32.v v2,0(a0)
> add a0,a0,a4
> vand.vv v1,v2,v1
> bne a1,zero,.L3
> vsetivli zero,1,e32,m1,ta,ma
> vmv.v.i v2,-1
> vsetvli a3,zero,e32,m1,ta,ma
> vredand.vs v1,v1,v2
> vmv.x.s a5,v1
> and a0,a2,a5
> ret
> .L4:
> mv a0,a2
> ret
>
> Fix a bug in the VSETVL pass that is triggered by the reduction test cases.
>
> SLP reduction and floating-point in-order reduction are not supported yet.
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md (reduc_plus_scal_<mode>): New pattern.
> (reduc_smax_scal_<mode>): Ditto.
> (reduc_umax_scal_<mode>): Ditto.
> (reduc_smin_scal_<mode>): Ditto.
> (reduc_umin_scal_<mode>): Ditto.
> (reduc_and_scal_<mode>): Ditto.
> (reduc_ior_scal_<mode>): Ditto.
> (reduc_xor_scal_<mode>): Ditto.
> * config/riscv/riscv-protos.h (enum insn_type): Add reduction.
> (expand_reduction): New function.
> * config/riscv/riscv-v.cc (emit_vlmax_reduction_insn): Ditto.
> (emit_vlmax_fp_reduction_insn): Ditto.
> (get_m1_mode): Ditto.
> (expand_cond_len_binop): Fix name.
> (expand_reduction): New function.
> * config/riscv/riscv-vsetvl.cc (gen_vsetvl_pat): Fix VSETVL BUG.
> (validate_change_or_fail): New function.
> (change_insn): Fix VSETVL BUG.
> (change_vsetvl_insn): Ditto.
> (pass_vsetvl::backward_demand_fusion): Ditto.
> (pass_vsetvl::df_post_optimization): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/rvv.exp: Add reduction tests.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-1.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-2.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-3.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc-4.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c: New test.
> * gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c: New test.
>
> ---
> gcc/config/riscv/autovec.md | 138 ++++++++++++++++++
> gcc/config/riscv/riscv-protos.h | 2 +
> gcc/config/riscv/riscv-v.cc | 84 ++++++++++-
> gcc/config/riscv/riscv-vsetvl.cc | 57 ++++++--
> .../riscv/rvv/autovec/reduc/reduc-1.c | 118 +++++++++++++++
> .../riscv/rvv/autovec/reduc/reduc-2.c | 129 ++++++++++++++++
> .../riscv/rvv/autovec/reduc/reduc-3.c | 65 +++++++++
> .../riscv/rvv/autovec/reduc/reduc-4.c | 59 ++++++++
> .../riscv/rvv/autovec/reduc/reduc_run-1.c | 56 +++++++
> .../riscv/rvv/autovec/reduc/reduc_run-2.c | 79 ++++++++++
> .../riscv/rvv/autovec/reduc/reduc_run-3.c | 49 +++++++
> .../riscv/rvv/autovec/reduc/reduc_run-4.c | 66 +++++++++
> gcc/testsuite/gcc.target/riscv/rvv/rvv.exp | 2 +
> 13 files changed, 887 insertions(+), 17 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 64a41bd7101..8cdec75bacf 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -1554,3 +1554,141 @@
> riscv_vector::expand_cond_len_ternop (icode, operands);
> DONE;
> })
> +
> +;; =========================================================================
> +;; == Reductions
> +;; =========================================================================
> +
> +;; -------------------------------------------------------------------------
> +;; ---- [INT] Tree reductions
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - vredsum.vs
> +;; - vredmaxu.vs
> +;; - vredmax.vs
> +;; - vredminu.vs
> +;; - vredmin.vs
> +;; - vredand.vs
> +;; - vredor.vs
> +;; - vredxor.vs
> +;; -------------------------------------------------------------------------
> +
> +(define_expand "reduc_plus_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx min = immed_wide_int_const (wi::min_value (prec, SIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (SMAX, operands, min);
> + DONE;
> +})
> +
> +(define_expand "reduc_umax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (UMAX, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx max = immed_wide_int_const (wi::max_value (prec, SIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (SMIN, operands, max);
> + DONE;
> +})
> +
> +(define_expand "reduc_umin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + int prec = GET_MODE_PRECISION (<VEL>mode);
> + rtx max = immed_wide_int_const (wi::max_value (prec, UNSIGNED), <VEL>mode);
> + riscv_vector::expand_reduction (UMIN, operands, max);
> + DONE;
> +})
> +
> +(define_expand "reduc_and_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (AND, operands, CONSTM1_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_ior_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (IOR, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_xor_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VI 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (XOR, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +;; -------------------------------------------------------------------------
> +;; ---- [FP] Tree reductions
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - vfredusum.vs
> +;; - vfredmax.vs
> +;; - vfredmin.vs
> +;; -------------------------------------------------------------------------
> +
> +(define_expand "reduc_plus_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
> + DONE;
> +})
> +
> +(define_expand "reduc_smax_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + REAL_VALUE_TYPE rv;
> + real_inf (&rv, true);
> + rtx f = const_double_from_real_value (rv, <VEL>mode);
> + riscv_vector::expand_reduction (SMAX, operands, f);
> + DONE;
> +})
> +
> +(define_expand "reduc_smin_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> + (match_operand:VF 1 "register_operand")]
> + "TARGET_VECTOR"
> +{
> + REAL_VALUE_TYPE rv;
> + real_inf (&rv, false);
> + rtx f = const_double_from_real_value (rv, <VEL>mode);
> + riscv_vector::expand_reduction (SMIN, operands, f);
> + DONE;
> +})
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index f91c2d51c3c..16fb8dabca0 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -198,6 +198,7 @@ enum insn_type
> RVV_COMPRESS_OP = 4,
> RVV_GATHER_M_OP = 5,
> RVV_SCATTER_M_OP = 4,
> + RVV_REDUCTION_OP = 3,
> };
> enum vlmul_type
> {
> @@ -281,6 +282,7 @@ bool has_vi_variant_p (rtx_code, rtx);
> void expand_vec_cmp (rtx, rtx_code, rtx, rtx);
> bool expand_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
> void expand_cond_len_binop (rtx_code, rtx *);
> +void expand_reduction (rtx_code, rtx *, rtx);
> #endif
> bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
> bool, void (*)(rtx *, rtx));
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c3fd4a1b03b..b4884a30872 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -1159,6 +1159,43 @@ emit_vlmax_compress_insn (unsigned icode, rtx *ops)
> e.emit_insn ((enum insn_code) icode, ops);
> }
>
> +/* Emit reduction instruction. */
> +static void
> +emit_vlmax_reduction_insn (unsigned icode, int op_num, rtx *ops)
> +{
> + machine_mode dest_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ true,
> + /* USE_REAL_MERGE_P */ false,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ true, dest_mode,
> + mask_mode);
> +
> + e.set_policy (TAIL_ANY);
> + e.emit_insn ((enum insn_code) icode, ops);
> +}
> +
> +/* Emit floating-point reduction instruction. */
> +static void
> +emit_vlmax_fp_reduction_insn (unsigned icode, int op_num, rtx *ops)
> +{
> + machine_mode dest_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ true,
> + /* USE_REAL_MERGE_P */ false,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ true, dest_mode,
> + mask_mode);
> +
> + e.set_policy (TAIL_ANY);
> + e.set_rounding_mode (FRM_DYN);
> + e.emit_insn ((enum insn_code) icode, ops);
> +}
> +
> /* Emit merge instruction. */
>
> static machine_mode
> @@ -1651,6 +1688,17 @@ get_mask_mode (machine_mode mode)
> return get_vector_mode (BImode, GET_MODE_NUNITS (mode));
> }
>
> +/* Return the appropriate M1 mode for MODE. */
> +
> +static opt_machine_mode
> +get_m1_mode (machine_mode mode)
> +{
> + scalar_mode smode = GET_MODE_INNER (mode);
> + unsigned int bytes = GET_MODE_SIZE (smode);
> + poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
> + return get_vector_mode (smode, m1_nunits);
> +}
> +
> /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
> This function is not only used by builtins, but also will be used by
> auto-vectorization in the future. */
> @@ -3121,9 +3169,9 @@ expand_cond_len_binop (rtx_code code, rtx *ops)
> rtx ops[] = {dest, mask, merge, src1, src2};
> insn_code icode = code_for_pred (code, mode);
> if (needs_fp_rounding (code, mode))
> - emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> + emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_TU, ops, len);
> else
> - emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> + emit_nonvlmax_tu_insn (icode, RVV_BINOP_TU, ops, len);
> }
> else
> /* FIXME: Enable this case when we support it in the middle-end. */
> @@ -3316,4 +3364,36 @@ expand_cond_len_ternop (unsigned icode, rtx *ops)
> gcc_unreachable ();
> }
>
> +/* Expand reduction operations. */
> +void
> +expand_reduction (rtx_code code, rtx *ops, rtx init)
> +{
> + machine_mode vmode = GET_MODE (ops[1]);
> + machine_mode m1_mode = get_m1_mode (vmode).require ();
> + machine_mode m1_mmode = get_mask_mode (m1_mode).require ();
> +
> + rtx m1_tmp = gen_reg_rtx (m1_mode);
> + rtx m1_mask = gen_scalar_move_mask (m1_mmode);
> + rtx m1_undef = RVV_VUNDEF (m1_mode);
> + rtx scalar_move_ops[] = {m1_tmp, m1_mask, m1_undef, init};
> + emit_scalar_move_insn (code_for_pred_broadcast (m1_mode), scalar_move_ops);
> +
> + rtx m1_tmp2 = gen_reg_rtx (m1_mode);
> + rtx reduc_ops[] = {m1_tmp2, ops[1], m1_tmp};
> +
> + if (FLOAT_MODE_P (vmode) && code == PLUS)
> + {
> + insn_code icode
> + = code_for_pred_reduc_plus (UNSPEC_UNORDERED, vmode, m1_mode);
> + emit_vlmax_fp_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
> + }
> + else
> + {
> + insn_code icode = code_for_pred_reduc (code, vmode, m1_mode);
> + emit_vlmax_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
> + }
> +
> + emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
> +}
> +
> } // namespace riscv_vector
> diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
> index 586dc8e5379..bb7ba129a5d 100644
> --- a/gcc/config/riscv/riscv-vsetvl.cc
> +++ b/gcc/config/riscv/riscv-vsetvl.cc
> @@ -646,7 +646,8 @@ gen_vsetvl_pat (enum vsetvl_type insn_type, const vl_vtype_info &info, rtx vl)
> }
>
> static rtx
> -gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
> +gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info,
> + rtx vl = NULL_RTX)
> {
> rtx new_pat;
> vl_vtype_info new_info = info;
> @@ -654,15 +655,17 @@ gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
> && fault_first_load_p (info.get_insn ()->rtl ()))
> new_info.set_avl_info (
> avl_info (get_avl (info.get_insn ()->rtl ()), nullptr));
> - if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
> + if (vl)
> + new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, vl);
> + else
> {
> - rtx dest = get_vl (rinsn);
> - new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, dest);
> + if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
> + new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, get_vl (rinsn));
> + else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
> + new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
> + else
> + new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
> }
> - else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
> - new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
> - else
> - new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
> return new_pat;
> }
>
> @@ -805,6 +808,14 @@ get_vl_vtype_info (const insn_info *insn)
> return info;
> }
>
> +/* Change the insn and assert that the change always succeeds. */
> +static void
> +validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
> +{
> + bool change_p = validate_change (object, loc, new_rtx, in_group);
> + gcc_assert (change_p);
> +}
> +
> static void
> change_insn (rtx_insn *rinsn, rtx new_pat)
> {
> @@ -818,7 +829,7 @@ change_insn (rtx_insn *rinsn, rtx new_pat)
> print_rtl_single (dump_file, PATTERN (rinsn));
> }
>
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, false);
>
> if (dump_file)
> {
> @@ -874,7 +885,7 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
> }
>
> insn_change_watermark watermark;
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, true);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, true);
>
> /* These routines report failures themselves. */
> if (!recog (attempt, change) || !change_is_worthwhile (change, false))
> @@ -931,7 +942,8 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
> }
>
> static void
> -change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
> +change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info,
> + rtx vl = NULL_RTX)
> {
> rtx_insn *rinsn;
> if (vector_config_insn_p (insn->rtl ()))
> @@ -945,7 +957,7 @@ change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
> rinsn = PREV_INSN (insn->rtl ());
> gcc_assert (vector_config_insn_p (rinsn));
> }
> - rtx new_pat = gen_vsetvl_pat (rinsn, info);
> + rtx new_pat = gen_vsetvl_pat (rinsn, info, vl);
> change_insn (rinsn, new_pat);
> }
>
> @@ -3377,7 +3389,20 @@ pass_vsetvl::backward_demand_fusion (void)
> new_info))
> continue;
>
> - change_vsetvl_insn (new_info.get_insn (), new_info);
> + rtx vl = NULL_RTX;
> + /* Backward VLMAX VL:
> + bb 3:
> + vsetivli zero, 1 ... -> vsetvli t1, zero
> + vmv.s.x
> + bb 5:
> + vsetvli t1, zero ... -> to be elided.
> + vlse16.v
> +
> + We should forward "t1". */
> + if (!block_info.reaching_out.has_avl_reg ()
> + && vlmax_avl_p (new_info.get_avl ()))
> + vl = get_vl (prop.get_insn ()->rtl ());
> + change_vsetvl_insn (new_info.get_insn (), new_info, vl);
> if (block_info.local_dem == block_info.reaching_out)
> block_info.local_dem = new_info;
> block_info.reaching_out = new_info;
> @@ -4524,13 +4549,15 @@ pass_vsetvl::df_post_optimization (void) const
> {
> rtx new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY,
> info, NULL_RTX);
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
> + false);
> }
> else if (!vlmax_avl_p (info.get_avl ()))
> {
> rtx new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info,
> NULL_RTX);
> - validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
> + validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
> + false);
> }
> }
> }
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> new file mode 100644
> index 00000000000..0d543af13ca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-1.c
> @@ -0,0 +1,118 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define DEF_REDUC_PLUS(TYPE) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_plus_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 0; \
> + for (int i = 0; i < n; ++i) \
> + r += a[i]; \
> + return r; \
> +}
> +
> +#define TEST_PLUS(T) \
> + T (int8_t) \
> + T (int16_t) \
> + T (int32_t) \
> + T (int64_t) \
> + T (uint8_t) \
> + T (uint16_t) \
> + T (uint32_t) \
> + T (uint64_t) \
> + T (_Float16) \
> + T (float) \
> + T (double)
> +
> +TEST_PLUS (DEF_REDUC_PLUS)
> +
> +#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 13; \
> + for (int i = 0; i < n; ++i) \
> + r = a[i] CMP_OP r ? a[i] : r; \
> + return r; \
> +}
> +
> +#define TEST_MAXMIN(T) \
> + T (int8_t, max, >) \
> + T (int16_t, max, >) \
> + T (int32_t, max, >) \
> + T (int64_t, max, >) \
> + T (uint8_t, max, >) \
> + T (uint16_t, max, >) \
> + T (uint32_t, max, >) \
> + T (uint64_t, max, >) \
> + T (_Float16, max, >) \
> + T (float, max, >) \
> + T (double, max, >) \
> + \
> + T (int8_t, min, <) \
> + T (int16_t, min, <) \
> + T (int32_t, min, <) \
> + T (int64_t, min, <) \
> + T (uint8_t, min, <) \
> + T (uint16_t, min, <) \
> + T (uint32_t, min, <) \
> + T (uint64_t, min, <) \
> + T (_Float16, min, <) \
> + T (float, min, <) \
> + T (double, min, <)
> +
> +TEST_MAXMIN (DEF_REDUC_MAXMIN)
> +
> +#define DEF_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> +TYPE __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE *a, int n) \
> +{ \
> + TYPE r = 13; \
> + for (int i = 0; i < n; ++i) \
> + r BIT_OP a[i]; \
> + return r; \
> +}
> +
> +#define TEST_BITWISE(T) \
> + T (int8_t, and, &=) \
> + T (int16_t, and, &=) \
> + T (int32_t, and, &=) \
> + T (int64_t, and, &=) \
> + T (uint8_t, and, &=) \
> + T (uint16_t, and, &=) \
> + T (uint32_t, and, &=) \
> + T (uint64_t, and, &=) \
> + \
> + T (int8_t, ior, |=) \
> + T (int16_t, ior, |=) \
> + T (int32_t, ior, |=) \
> + T (int64_t, ior, |=) \
> + T (uint8_t, ior, |=) \
> + T (uint16_t, ior, |=) \
> + T (uint32_t, ior, |=) \
> + T (uint64_t, ior, |=) \
> + \
> + T (int8_t, xor, ^=) \
> + T (int16_t, xor, ^=) \
> + T (int32_t, xor, ^=) \
> + T (int64_t, xor, ^=) \
> + T (uint8_t, xor, ^=) \
> + T (uint16_t, xor, ^=) \
> + T (uint32_t, xor, ^=) \
> + T (uint64_t, xor, ^=)
> +
> +TEST_BITWISE (DEF_REDUC_BITWISE)
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> new file mode 100644
> index 00000000000..136a8a378bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-2.c
> @@ -0,0 +1,129 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define NUM_ELEMS(TYPE) (1024 / sizeof (TYPE))
> +
> +#define DEF_REDUC_PLUS(TYPE) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_plus_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = 0; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + r[i] += a[i][j]; \
> + } \
> +}
> +
> +#define TEST_PLUS(T) \
> + T (int8_t) \
> + T (int16_t) \
> + T (int32_t) \
> + T (int64_t) \
> + T (uint8_t) \
> + T (uint16_t) \
> + T (uint32_t) \
> + T (uint64_t) \
> + T (_Float16) \
> + T (float) \
> + T (double)
> +
> +TEST_PLUS (DEF_REDUC_PLUS)
> +
> +#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = a[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + r[i] = a[i][j] CMP_OP r[i] ? a[i][j] : r[i]; \
> + } \
> +}
> +
> +#define TEST_MAXMIN(T) \
> + T (int8_t, max, >) \
> + T (int16_t, max, >) \
> + T (int32_t, max, >) \
> + T (int64_t, max, >) \
> + T (uint8_t, max, >) \
> + T (uint16_t, max, >) \
> + T (uint32_t, max, >) \
> + T (uint64_t, max, >) \
> + T (_Float16, max, >) \
> + T (float, max, >) \
> + T (double, max, >) \
> + \
> + T (int8_t, min, <) \
> + T (int16_t, min, <) \
> + T (int32_t, min, <) \
> + T (int64_t, min, <) \
> + T (uint8_t, min, <) \
> + T (uint16_t, min, <) \
> + T (uint32_t, min, <) \
> + T (uint64_t, min, <) \
> + T (_Float16, min, <) \
> + T (float, min, <) \
> + T (double, min, <)
> +
> +TEST_MAXMIN (DEF_REDUC_MAXMIN)
> +
> +#define DEF_REDUC_BITWISE(TYPE,NAME,BIT_OP) \
> +void __attribute__ ((noinline, noclone)) \
> +reduc_##NAME##_##TYPE (TYPE (*restrict a)[NUM_ELEMS(TYPE)], \
> + TYPE *restrict r, int n) \
> +{ \
> + for (int i = 0; i < n; i++) \
> + { \
> + r[i] = a[i][0]; \
> + for (int j = 0; j < NUM_ELEMS(TYPE); j++) \
> + r[i] BIT_OP a[i][j]; \
> + } \
> +}
> +
> +#define TEST_BITWISE(T) \
> + T (int8_t, and, &=) \
> + T (int16_t, and, &=) \
> + T (int32_t, and, &=) \
> + T (int64_t, and, &=) \
> + T (uint8_t, and, &=) \
> + T (uint16_t, and, &=) \
> + T (uint32_t, and, &=) \
> + T (uint64_t, and, &=) \
> + \
> + T (int8_t, ior, |=) \
> + T (int16_t, ior, |=) \
> + T (int32_t, ior, |=) \
> + T (int64_t, ior, |=) \
> + T (uint8_t, ior, |=) \
> + T (uint16_t, ior, |=) \
> + T (uint32_t, ior, |=) \
> + T (uint64_t, ior, |=) \
> + \
> + T (int8_t, xor, ^=) \
> + T (int16_t, xor, ^=) \
> + T (int32_t, xor, ^=) \
> + T (int64_t, xor, ^=) \
> + T (uint8_t, xor, ^=) \
> + T (uint16_t, xor, ^=) \
> + T (uint32_t, xor, ^=) \
> + T (uint64_t, xor, ^=)
> +
> +TEST_BITWISE (DEF_REDUC_BITWISE)
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> new file mode 100644
> index 00000000000..c3638344f80
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-3.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res += x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n)
> +{
> + unsigned short res = ~0;
> + for (int i = 0; i < n; ++i)
> + res = res < x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res = res > x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n)
> +{
> + unsigned short res = ~0;
> + for (int i = 0; i < n; ++i)
> + res &= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res |= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n)
> +{
> + unsigned short res = 0;
> + for (int i = 0; i < n; ++i)
> + res ^= x[i];
> + return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> new file mode 100644
> index 00000000000..f00a12826c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc-4.c
> @@ -0,0 +1,59 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include <stdint-gcc.h>
> +
> +unsigned short __attribute__((noipa))
> +add_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res += x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +min_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res = res < x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +max_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res = res > x[i] ? res : x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +and_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res &= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +or_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res |= x[i];
> + return res;
> +}
> +
> +unsigned short __attribute__((noipa))
> +eor_loop (unsigned short *x, int n, unsigned short res)
> +{
> + for (int i = 0; i < n; ++i)
> + res ^= x[i];
> + return res;
> +}
> +
> +/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> new file mode 100644
> index 00000000000..b500f857598
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-1.c
> @@ -0,0 +1,56 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-1.c"
> +
> +#define NUM_ELEMS(TYPE) (73 + sizeof (TYPE))
> +
> +#define INIT_VECTOR(TYPE) \
> + TYPE a[NUM_ELEMS (TYPE) + 1]; \
> + for (int i = 0; i < NUM_ELEMS (TYPE) + 1; i++) \
> + { \
> + a[i] = ((i * 2) * (i & 1 ? 1 : -1) | 3); \
> + asm volatile ("" ::: "memory"); \
> + }
> +
> +#define TEST_REDUC_PLUS(TYPE) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_plus_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 0; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 += a[i]; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 13; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 = a[i] CMP_OP r2 ? a[i] : r2; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> + { \
> + INIT_VECTOR (TYPE); \
> + TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
> + volatile TYPE r2 = 13; \
> + for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
> + r2 BIT_OP a[i]; \
> + if (r1 != r2) \
> + __builtin_abort (); \
> + }
> +
> +int main ()
> +{
> + TEST_PLUS (TEST_REDUC_PLUS)
> + TEST_MAXMIN (TEST_REDUC_MAXMIN)
> + TEST_BITWISE (TEST_REDUC_BITWISE)
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> new file mode 100644
> index 00000000000..3c2f62557b1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-2.c
> @@ -0,0 +1,79 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
> +
> +#include "reduc-2.c"
> +
> +#define NROWS 53
> +
> +/* Result comparison for PLUS; _Float16 allows a fuzz since -ffast-math may reassociate the sum. */
> +#define CMP__Float16(X, Y) ((X) >= (Y) * 0.875 && (X) <= (Y) * 1.125)
> +#define CMP_float(X, Y) ((X) == (Y))
> +#define CMP_double(X, Y) ((X) == (Y))
> +#define CMP_int8_t(X, Y) ((X) == (Y))
> +#define CMP_int16_t(X, Y) ((X) == (Y))
> +#define CMP_int32_t(X, Y) ((X) == (Y))
> +#define CMP_int64_t(X, Y) ((X) == (Y))
> +#define CMP_uint8_t(X, Y) ((X) == (Y))
> +#define CMP_uint16_t(X, Y) ((X) == (Y))
> +#define CMP_uint32_t(X, Y) ((X) == (Y))
> +#define CMP_uint64_t(X, Y) ((X) == (Y))
> +
> +#define INIT_MATRIX(TYPE) \
> + TYPE mat[NROWS][NUM_ELEMS (TYPE)]; \
> + TYPE r[NROWS]; \
> + for (int i = 0; i < NROWS; i++) \
> + for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
> + { \
> + mat[i][j] = i + (j * 2) * (j & 1 ? 1 : -1); \
> + asm volatile ("" ::: "memory"); \
> + }
> +
> +#define TEST_REDUC_PLUS(TYPE) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_plus_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = 0; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 += mat[i][j]; \
> + if (!CMP_##TYPE (r[i], r2)) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_##NAME##_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = mat[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 = mat[i][j] CMP_OP r2 ? mat[i][j] : r2; \
> + if (r[i] != r2) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) \
> + { \
> + INIT_MATRIX (TYPE); \
> + reduc_##NAME##_##TYPE (mat, r, NROWS); \
> + for (int i = 0; i < NROWS; i++) \
> + { \
> + volatile TYPE r2 = mat[i][0]; \
> + for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
> + r2 BIT_OP mat[i][j]; \
> + if (r[i] != r2) \
> + __builtin_abort (); \
> + } \
> + }
> +
> +int main ()
> +{
> + TEST_PLUS (TEST_REDUC_PLUS)
> + TEST_MAXMIN (TEST_REDUC_MAXMIN)
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> new file mode 100644
> index 00000000000..d1b22c0d69a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-3.c
> @@ -0,0 +1,49 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-3.c"
> +
> +#define N 0x1100
> +
> +int
> +main (void)
> +{
> + unsigned short x[N];
> + for (int i = 0; i < N; ++i)
> + x[i] = (i + 1) * (i + 2);
> +
> + if (add_loop (x, 0) != 0
> + || add_loop (x, 11) != 572
> + || add_loop (x, 0x100) != 22016
> + || add_loop (x, 0xfff) != 20480
> + || max_loop (x, 0) != 0
> + || max_loop (x, 11) != 132
> + || max_loop (x, 0x100) != 65280
> + || max_loop (x, 0xfff) != 65504
> + || or_loop (x, 0) != 0
> + || or_loop (x, 11) != 0xfe
> + || or_loop (x, 0x80) != 0x7ffe
> + || or_loop (x, 0xb4) != 0x7ffe
> + || or_loop (x, 0xb5) != 0xfffe
> + || eor_loop (x, 0) != 0
> + || eor_loop (x, 11) != 0xe8
> + || eor_loop (x, 0x100) != 0xcf00
> + || eor_loop (x, 0xfff) != 0xa000)
> + __builtin_abort ();
> +
> + for (int i = 0; i < N; ++i)
> + x[i] = ~x[i];
> +
> + if (min_loop (x, 0) != 65535
> + || min_loop (x, 11) != 65403
> + || min_loop (x, 0x100) != 255
> + || min_loop (x, 0xfff) != 31
> + || and_loop (x, 0) != 0xffff
> + || and_loop (x, 11) != 0xff01
> + || and_loop (x, 0x80) != 0x8001
> + || and_loop (x, 0xb4) != 0x8001
> + || and_loop (x, 0xb5) != 1)
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
> new file mode 100644
> index 00000000000..c17e125a763
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_run-4.c
> @@ -0,0 +1,66 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
> +
> +#include "reduc-4.c"
> +
> +#define N 0x1100
> +
> +int
> +main (void)
> +{
> + unsigned short x[N];
> + for (int i = 0; i < N; ++i)
> + x[i] = (i + 1) * (i + 2);
> +
> + if (add_loop (x, 0, 10) != 10
> + || add_loop (x, 11, 42) != 614
> + || add_loop (x, 0x100, 84) != 22100
> + || add_loop (x, 0xfff, 20) != 20500
> + || max_loop (x, 0, 10) != 10
> + || max_loop (x, 11, 131) != 132
> + || max_loop (x, 11, 133) != 133
> + || max_loop (x, 0x100, 65279) != 65280
> + || max_loop (x, 0x100, 65281) != 65281
> + || max_loop (x, 0xfff, 65503) != 65504
> + || max_loop (x, 0xfff, 65505) != 65505
> + || or_loop (x, 0, 0x71) != 0x71
> + || or_loop (x, 11, 0) != 0xfe
> + || or_loop (x, 11, 0xb3c) != 0xbfe
> + || or_loop (x, 0x80, 0) != 0x7ffe
> + || or_loop (x, 0x80, 1) != 0x7fff
> + || or_loop (x, 0xb4, 0) != 0x7ffe
> + || or_loop (x, 0xb4, 1) != 0x7fff
> + || or_loop (x, 0xb5, 0) != 0xfffe
> + || or_loop (x, 0xb5, 1) != 0xffff
> + || eor_loop (x, 0, 0x3e) != 0x3e
> + || eor_loop (x, 11, 0) != 0xe8
> + || eor_loop (x, 11, 0x1ff) != 0x117
> + || eor_loop (x, 0x100, 0) != 0xcf00
> + || eor_loop (x, 0x100, 0xeee) != 0xc1ee
> + || eor_loop (x, 0xfff, 0) != 0xa000
> + || eor_loop (x, 0xfff, 0x8888) != 0x2888)
> + __builtin_abort ();
> +
> + for (int i = 0; i < N; ++i)
> + x[i] = ~x[i];
> +
> + if (min_loop (x, 0, 10000) != 10000
> + || min_loop (x, 11, 65404) != 65403
> + || min_loop (x, 11, 65402) != 65402
> + || min_loop (x, 0x100, 256) != 255
> + || min_loop (x, 0x100, 254) != 254
> + || min_loop (x, 0xfff, 32) != 31
> + || min_loop (x, 0xfff, 30) != 30
> + || and_loop (x, 0, 0x1234) != 0x1234
> + || and_loop (x, 11, 0xffff) != 0xff01
> + || and_loop (x, 11, 0xcdef) != 0xcd01
> + || and_loop (x, 0x80, 0xffff) != 0x8001
> + || and_loop (x, 0x80, 0xfffe) != 0x8000
> + || and_loop (x, 0xb4, 0xffff) != 0x8001
> + || and_loop (x, 0xb4, 0xfffe) != 0x8000
> + || and_loop (x, 0xb5, 0xffff) != 1
> + || and_loop (x, 0xb5, 0xfffe) != 0)
> + __builtin_abort ();
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> index 19589fa9638..532c17c4065 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/rvv.exp
> @@ -71,6 +71,8 @@ foreach op $AUTOVEC_TEST_OPTS {
> "" "$op"
> dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/ternop/*.\[cS\]]] \
> "" "$op"
> + dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/reduc/*.\[cS\]]] \
> + "" "$op"
> }
>
> # widening operation only test on LMUL < 8
> --
> 2.36.1
>
@@ -1554,3 +1554,141 @@
riscv_vector::expand_cond_len_ternop (icode, operands);
DONE;
})
+
+;; =========================================================================
+;; == Reductions
+;; =========================================================================
+
+;; -------------------------------------------------------------------------
+;; ---- [INT] Tree reductions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vredsum.vs
+;; - vredmaxu.vs
+;; - vredmax.vs
+;; - vredminu.vs
+;; - vredmin.vs
+;; - vredand.vs
+;; - vredor.vs
+;; - vredxor.vs
+;; -------------------------------------------------------------------------
+
+(define_expand "reduc_plus_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
+ DONE;
+})
+
+(define_expand "reduc_smax_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ int prec = GET_MODE_PRECISION (<VEL>mode);
+ rtx min = immed_wide_int_const (wi::min_value (prec, SIGNED), <VEL>mode);
+ riscv_vector::expand_reduction (SMAX, operands, min);
+ DONE;
+})
+
+(define_expand "reduc_umax_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (UMAX, operands, CONST0_RTX (<VEL>mode));
+ DONE;
+})
+
+(define_expand "reduc_smin_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ int prec = GET_MODE_PRECISION (<VEL>mode);
+ rtx max = immed_wide_int_const (wi::max_value (prec, SIGNED), <VEL>mode);
+ riscv_vector::expand_reduction (SMIN, operands, max);
+ DONE;
+})
+
+(define_expand "reduc_umin_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ int prec = GET_MODE_PRECISION (<VEL>mode);
+ rtx max = immed_wide_int_const (wi::max_value (prec, UNSIGNED), <VEL>mode);
+ riscv_vector::expand_reduction (UMIN, operands, max);
+ DONE;
+})
+
+(define_expand "reduc_and_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (AND, operands, CONSTM1_RTX (<VEL>mode));
+ DONE;
+})
+
+(define_expand "reduc_ior_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (IOR, operands, CONST0_RTX (<VEL>mode));
+ DONE;
+})
+
+(define_expand "reduc_xor_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VI 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (XOR, operands, CONST0_RTX (<VEL>mode));
+ DONE;
+})
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Tree reductions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - vfredusum.vs
+;; - vfredmax.vs
+;; - vfredmin.vs
+;; -------------------------------------------------------------------------
+
+(define_expand "reduc_plus_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VF 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_reduction (PLUS, operands, CONST0_RTX (<VEL>mode));
+ DONE;
+})
+
+(define_expand "reduc_smax_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VF 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ REAL_VALUE_TYPE rv;
+ real_inf (&rv, true);
+ rtx f = const_double_from_real_value (rv, <VEL>mode);
+ riscv_vector::expand_reduction (SMAX, operands, f);
+ DONE;
+})
+
+(define_expand "reduc_smin_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+ (match_operand:VF 1 "register_operand")]
+ "TARGET_VECTOR"
+{
+ REAL_VALUE_TYPE rv;
+ real_inf (&rv, false);
+ rtx f = const_double_from_real_value (rv, <VEL>mode);
+ riscv_vector::expand_reduction (SMIN, operands, f);
+ DONE;
+})
@@ -198,6 +198,7 @@ enum insn_type
RVV_COMPRESS_OP = 4,
RVV_GATHER_M_OP = 5,
RVV_SCATTER_M_OP = 4,
+ RVV_REDUCTION_OP = 3,
};
enum vlmul_type
{
@@ -281,6 +282,7 @@ bool has_vi_variant_p (rtx_code, rtx);
void expand_vec_cmp (rtx, rtx_code, rtx, rtx);
bool expand_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
void expand_cond_len_binop (rtx_code, rtx *);
+void expand_reduction (rtx_code, rtx *, rtx);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
bool, void (*)(rtx *, rtx));
@@ -1159,6 +1159,43 @@ emit_vlmax_compress_insn (unsigned icode, rtx *ops)
e.emit_insn ((enum insn_code) icode, ops);
}
+/* Emit a VLMAX, fully unmasked reduction insn ICODE with OP_NUM operands OPS (tail policy TAIL_ANY). */
+static void
+emit_vlmax_reduction_insn (unsigned icode, int op_num, rtx *ops)
+{
+ machine_mode dest_mode = GET_MODE (ops[0]);
+ machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
+ /* HAS_DEST_P */ true,
+ /* FULLY_UNMASKED_P */ true,
+ /* USE_REAL_MERGE_P */ false,
+ /* HAS_AVL_P */ true,
+ /* VLMAX_P */ true, dest_mode,
+ mask_mode);
+
+ e.set_policy (TAIL_ANY);
+ e.emit_insn ((enum insn_code) icode, ops);
+}
+
+/* Emit a VLMAX, fully unmasked floating-point reduction insn ICODE with rounding mode FRM_DYN. */
+static void
+emit_vlmax_fp_reduction_insn (unsigned icode, int op_num, rtx *ops)
+{
+ machine_mode dest_mode = GET_MODE (ops[0]);
+ machine_mode mask_mode = get_mask_mode (GET_MODE (ops[1])).require ();
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (op_num,
+ /* HAS_DEST_P */ true,
+ /* FULLY_UNMASKED_P */ true,
+ /* USE_REAL_MERGE_P */ false,
+ /* HAS_AVL_P */ true,
+ /* VLMAX_P */ true, dest_mode,
+ mask_mode);
+
+ e.set_policy (TAIL_ANY);
+ e.set_rounding_mode (FRM_DYN);
+ e.emit_insn ((enum insn_code) icode, ops);
+}
+
/* Emit merge instruction. */
static machine_mode
@@ -1651,6 +1688,17 @@ get_mask_mode (machine_mode mode)
return get_vector_mode (BImode, GET_MODE_NUNITS (mode));
}
+/* Return the M1 mode for MODE: MODE's element mode with BYTES_PER_RISCV_VECTOR / element-size units. */
+
+static opt_machine_mode
+get_m1_mode (machine_mode mode)
+{
+ scalar_mode smode = GET_MODE_INNER (mode);
+ unsigned int bytes = GET_MODE_SIZE (smode);
+ poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
+ return get_vector_mode (smode, m1_nunits);
+}
+
/* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
This function is not only used by builtins, but also will be used by
auto-vectorization in the future. */
@@ -3121,9 +3169,9 @@ expand_cond_len_binop (rtx_code code, rtx *ops)
rtx ops[] = {dest, mask, merge, src1, src2};
insn_code icode = code_for_pred (code, mode);
if (needs_fp_rounding (code, mode))
- emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
+ emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_TU, ops, len);
else
- emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
+ emit_nonvlmax_tu_insn (icode, RVV_BINOP_TU, ops, len);
}
else
/* FIXME: Enable this case when we support it in the middle-end. */
@@ -3316,4 +3364,36 @@ expand_cond_len_ternop (unsigned icode, rtx *ops)
gcc_unreachable ();
}
+/* Expand reduction CODE: scalar-move INIT into an M1 vector, reduce OPS[1] with it, extract the first element to OPS[0]. */
+void
+expand_reduction (rtx_code code, rtx *ops, rtx init)
+{
+ machine_mode vmode = GET_MODE (ops[1]);
+ machine_mode m1_mode = get_m1_mode (vmode).require ();
+ machine_mode m1_mmode = get_mask_mode (m1_mode).require ();
+
+ rtx m1_tmp = gen_reg_rtx (m1_mode);
+ rtx m1_mask = gen_scalar_move_mask (m1_mmode);
+ rtx m1_undef = RVV_VUNDEF (m1_mode);
+ rtx scalar_move_ops[] = {m1_tmp, m1_mask, m1_undef, init};
+ emit_scalar_move_insn (code_for_pred_broadcast (m1_mode), scalar_move_ops);
+
+ rtx m1_tmp2 = gen_reg_rtx (m1_mode);
+ rtx reduc_ops[] = {m1_tmp2, ops[1], m1_tmp};
+
+ if (FLOAT_MODE_P (vmode) && code == PLUS)
+ {
+ insn_code icode
+ = code_for_pred_reduc_plus (UNSPEC_UNORDERED, vmode, m1_mode);
+ emit_vlmax_fp_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
+ }
+ else
+ {
+ insn_code icode = code_for_pred_reduc (code, vmode, m1_mode);
+ emit_vlmax_reduction_insn (icode, RVV_REDUCTION_OP, reduc_ops);
+ }
+
+ emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
+}
+
} // namespace riscv_vector
@@ -646,7 +646,8 @@ gen_vsetvl_pat (enum vsetvl_type insn_type, const vl_vtype_info &info, rtx vl)
}
static rtx
-gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
+gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info,
+ rtx vl = NULL_RTX)
{
rtx new_pat;
vl_vtype_info new_info = info;
@@ -654,15 +655,17 @@ gen_vsetvl_pat (rtx_insn *rinsn, const vector_insn_info &info)
&& fault_first_load_p (info.get_insn ()->rtl ()))
new_info.set_avl_info (
avl_info (get_avl (info.get_insn ()->rtl ()), nullptr));
- if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
+ if (vl)
+ new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, vl);
+ else
{
- rtx dest = get_vl (rinsn);
- new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, dest);
+ if (vsetvl_insn_p (rinsn) || vlmax_avl_p (info.get_avl ()))
+ new_pat = gen_vsetvl_pat (VSETVL_NORMAL, new_info, get_vl (rinsn));
+ else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
+ new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
+ else
+ new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
}
- else if (INSN_CODE (rinsn) == CODE_FOR_vsetvl_vtype_change_only)
- new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY, new_info, NULL_RTX);
- else
- new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, new_info, NULL_RTX);
return new_pat;
}
@@ -805,6 +808,14 @@ get_vl_vtype_info (const insn_info *insn)
return info;
}
+/* Call validate_change with the given arguments and assert that the change was accepted. */
+static void
+validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
+{
+ bool change_p = validate_change (object, loc, new_rtx, in_group);
+ gcc_assert (change_p);
+}
+
static void
change_insn (rtx_insn *rinsn, rtx new_pat)
{
@@ -818,7 +829,7 @@ change_insn (rtx_insn *rinsn, rtx new_pat)
print_rtl_single (dump_file, PATTERN (rinsn));
}
- validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
+ validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, false);
if (dump_file)
{
@@ -874,7 +885,7 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
}
insn_change_watermark watermark;
- validate_change (rinsn, &PATTERN (rinsn), new_pat, true);
+ validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat, true);
/* These routines report failures themselves. */
if (!recog (attempt, change) || !change_is_worthwhile (change, false))
@@ -931,7 +942,8 @@ change_insn (function_info *ssa, insn_change change, insn_info *insn,
}
static void
-change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
+change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info,
+ rtx vl = NULL_RTX)
{
rtx_insn *rinsn;
if (vector_config_insn_p (insn->rtl ()))
@@ -945,7 +957,7 @@ change_vsetvl_insn (const insn_info *insn, const vector_insn_info &info)
rinsn = PREV_INSN (insn->rtl ());
gcc_assert (vector_config_insn_p (rinsn));
}
- rtx new_pat = gen_vsetvl_pat (rinsn, info);
+ rtx new_pat = gen_vsetvl_pat (rinsn, info, vl);
change_insn (rinsn, new_pat);
}
@@ -3377,7 +3389,20 @@ pass_vsetvl::backward_demand_fusion (void)
new_info))
continue;
- change_vsetvl_insn (new_info.get_insn (), new_info);
+ rtx vl = NULL_RTX;
+ /* Backward VLMAX VL:
+ bb 3:
+ vsetivli zero, 1 ... -> vsetvli t1, zero
+ vmv.s.x
+ bb 5:
+ vsetvli t1, zero ... -> to be elided.
+ vlse16.v
+
+ We should propagate "t1" backward to the fused vsetvl. */
+ if (!block_info.reaching_out.has_avl_reg ()
+ && vlmax_avl_p (new_info.get_avl ()))
+ vl = get_vl (prop.get_insn ()->rtl ());
+ change_vsetvl_insn (new_info.get_insn (), new_info, vl);
if (block_info.local_dem == block_info.reaching_out)
block_info.local_dem = new_info;
block_info.reaching_out = new_info;
@@ -4524,13 +4549,15 @@ pass_vsetvl::df_post_optimization (void) const
{
rtx new_pat = gen_vsetvl_pat (VSETVL_VTYPE_CHANGE_ONLY,
info, NULL_RTX);
- validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
+ validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
+ false);
}
else if (!vlmax_avl_p (info.get_avl ()))
{
rtx new_pat = gen_vsetvl_pat (VSETVL_DISCARD_RESULT, info,
NULL_RTX);
- validate_change (rinsn, &PATTERN (rinsn), new_pat, false);
+ validate_change_or_fail (rinsn, &PATTERN (rinsn), new_pat,
+ false);
}
}
}
new file mode 100644
@@ -0,0 +1,118 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+
+#define DEF_REDUC_PLUS(TYPE) /* Define reduc_plus_<TYPE> (a, n): sum of a[0..n-1], accumulated in TYPE starting from 0. */ \
+TYPE __attribute__ ((noinline, noclone)) /* Keep every instantiation out of line and uncloned. */ \
+reduc_plus_##TYPE (TYPE *a, int n) \
+{ \
+ TYPE r = 0; \
+ for (int i = 0; i < n; ++i) \
+ r += a[i]; /* The reduction counted by the vredsum/vfredusum dg-final scans of this file. */ \
+ return r; \
+}
+
+#define TEST_PLUS(T) /* Apply macro T once per element type: eight integer types, then _Float16, float and double. */ \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t) \
+ T (uint8_t) \
+ T (uint16_t) \
+ T (uint32_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_PLUS (DEF_REDUC_PLUS) /* Emit reduc_plus_<TYPE> for all eleven types. */
+
+#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) /* Define reduc_<NAME>_<TYPE> (a, n): fold a[0..n-1] into r with CMP_OP, starting from r = 13. */ \
+TYPE __attribute__ ((noinline, noclone)) \
+reduc_##NAME##_##TYPE (TYPE *a, int n) \
+{ \
+ TYPE r = 13; /* The seed takes part in the result, e.g. a max never drops below 13. */ \
+ for (int i = 0; i < n; ++i) \
+ r = a[i] CMP_OP r ? a[i] : r; /* Take a[i] when "a[i] CMP_OP r" holds, otherwise keep r. */ \
+ return r; \
+}
+
+#define TEST_MAXMIN(T) /* Apply T to (type, max, >) and then (type, min, <) for each of the eleven element types. */ \
+ T (int8_t, max, >) \
+ T (int16_t, max, >) \
+ T (int32_t, max, >) \
+ T (int64_t, max, >) \
+ T (uint8_t, max, >) \
+ T (uint16_t, max, >) \
+ T (uint32_t, max, >) \
+ T (uint64_t, max, >) \
+ T (_Float16, max, >) \
+ T (float, max, >) \
+ T (double, max, >) \
+ \
+ T (int8_t, min, <) \
+ T (int16_t, min, <) \
+ T (int32_t, min, <) \
+ T (int64_t, min, <) \
+ T (uint8_t, min, <) \
+ T (uint16_t, min, <) \
+ T (uint32_t, min, <) \
+ T (uint64_t, min, <) \
+ T (_Float16, min, <) \
+ T (float, min, <) \
+ T (double, min, <)
+
+TEST_MAXMIN (DEF_REDUC_MAXMIN) /* Emit reduc_max_<TYPE> and reduc_min_<TYPE> for all eleven types. */
+
+#define DEF_REDUC_BITWISE(TYPE, NAME, BIT_OP) /* Define reduc_<NAME>_<TYPE> (a, n): apply "r BIT_OP a[i]" over a[0..n-1], starting from r = 13. */ \
+TYPE __attribute__ ((noinline, noclone)) \
+reduc_##NAME##_##TYPE (TYPE *a, int n) \
+{ \
+ TYPE r = 13; /* The seed takes part in the result; it is not the identity of any of the three operators. */ \
+ for (int i = 0; i < n; ++i) \
+ r BIT_OP a[i]; /* BIT_OP is a compound assignment: &=, |= or ^=. */ \
+ return r; \
+}
+
+#define TEST_BITWISE(T) /* Apply T to and/ior/xor (with &=, |=, ^=) for the eight integer types only. */ \
+ T (int8_t, and, &=) \
+ T (int16_t, and, &=) \
+ T (int32_t, and, &=) \
+ T (int64_t, and, &=) \
+ T (uint8_t, and, &=) \
+ T (uint16_t, and, &=) \
+ T (uint32_t, and, &=) \
+ T (uint64_t, and, &=) \
+ \
+ T (int8_t, ior, |=) \
+ T (int16_t, ior, |=) \
+ T (int32_t, ior, |=) \
+ T (int64_t, ior, |=) \
+ T (uint8_t, ior, |=) \
+ T (uint16_t, ior, |=) \
+ T (uint32_t, ior, |=) \
+ T (uint64_t, ior, |=) \
+ \
+ T (int8_t, xor, ^=) \
+ T (int16_t, xor, ^=) \
+ T (int32_t, xor, ^=) \
+ T (int64_t, xor, ^=) \
+ T (uint8_t, xor, ^=) \
+ T (uint16_t, xor, ^=) \
+ T (uint32_t, xor, ^=) \
+ T (uint64_t, xor, ^=)
+
+TEST_BITWISE (DEF_REDUC_BITWISE) /* Emit reduc_and_/reduc_ior_/reduc_xor_<TYPE> for the eight integer types. */
+
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
new file mode 100644
@@ -0,0 +1,129 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+
+#define NUM_ELEMS(TYPE) (1024 / sizeof (TYPE)) /* Elements per row, chosen so that every row occupies 1024 bytes. */
+
+#define DEF_REDUC_PLUS(TYPE) /* Define reduc_plus_<TYPE> (a, r, n): for each of the n rows, r[i] = sum of row a[i]. */ \
+void __attribute__ ((noinline, noclone)) \
+reduc_plus_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
+ TYPE *restrict r, int n) \
+{ \
+ for (int i = 0; i < n; i++) \
+ { \
+ r[i] = 0; \
+ for (int j = 0; j < NUM_ELEMS (TYPE); j++) /* Fixed trip count: one full row. */ \
+ r[i] += a[i][j]; /* Accumulates directly in memory; a and r are restrict-qualified. */ \
+ } \
+}
+
+#define TEST_PLUS(T) /* Apply macro T once per element type: eight integer types, then _Float16, float and double. */ \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t) \
+ T (uint8_t) \
+ T (uint16_t) \
+ T (uint32_t) \
+ T (uint64_t) \
+ T (_Float16) \
+ T (float) \
+ T (double)
+
+TEST_PLUS (DEF_REDUC_PLUS) /* Emit the row-sum function reduc_plus_<TYPE> for all eleven types. */
+
+#define DEF_REDUC_MAXMIN(TYPE, NAME, CMP_OP) /* Define reduc_<NAME>_<TYPE> (a, r, n): r[i] = row a[i] folded with CMP_OP, for each of the n rows. */ \
+void __attribute__ ((noinline, noclone)) \
+reduc_##NAME##_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)], \
+ TYPE *restrict r, int n) \
+{ \
+ for (int i = 0; i < n; i++) \
+ { \
+ r[i] = a[i][0]; /* Seed with the row's first element. */ \
+ for (int j = 0; j < NUM_ELEMS (TYPE); j++) /* Starts at 0, so the seed element is compared against itself once. */ \
+ r[i] = a[i][j] CMP_OP r[i] ? a[i][j] : r[i]; \
+ } \
+}
+
+#define TEST_MAXMIN(T) /* Apply T to (type, max, >) and then (type, min, <) for each of the eleven element types. */ \
+ T (int8_t, max, >) \
+ T (int16_t, max, >) \
+ T (int32_t, max, >) \
+ T (int64_t, max, >) \
+ T (uint8_t, max, >) \
+ T (uint16_t, max, >) \
+ T (uint32_t, max, >) \
+ T (uint64_t, max, >) \
+ T (_Float16, max, >) \
+ T (float, max, >) \
+ T (double, max, >) \
+ \
+ T (int8_t, min, <) \
+ T (int16_t, min, <) \
+ T (int32_t, min, <) \
+ T (int64_t, min, <) \
+ T (uint8_t, min, <) \
+ T (uint16_t, min, <) \
+ T (uint32_t, min, <) \
+ T (uint64_t, min, <) \
+ T (_Float16, min, <) \
+ T (float, min, <) \
+ T (double, min, <)
+
+TEST_MAXMIN (DEF_REDUC_MAXMIN) /* Emit the row-wise reduc_max_<TYPE> and reduc_min_<TYPE> for all eleven types. */
+
+#define DEF_REDUC_BITWISE(TYPE,NAME,BIT_OP) /* Define reduc_<NAME>_<TYPE> (a, r, n): r[i] = row a[i] folded with BIT_OP, for each of the n rows. */ \
+void __attribute__ ((noinline, noclone)) \
+reduc_##NAME##_##TYPE (TYPE (*restrict a)[NUM_ELEMS(TYPE)], /* "_" between NAME and TYPE, as in DEF_REDUC_MAXMIN. */ \
+ TYPE *restrict r, int n) \
+{ \
+ for (int i = 0; i < n; i++) \
+ { \
+ r[i] = a[i][0]; /* Seed with the row's first element. */ \
+ for (int j = 0; j < NUM_ELEMS(TYPE); j++) /* Starts at 0, so a[i][0] is applied twice; for ^= the two applications cancel. */ \
+ r[i] BIT_OP a[i][j]; /* BIT_OP is a compound assignment: &=, |= or ^=. */ \
+ } \
+}
+
+#define TEST_BITWISE(T) /* Apply T to and/ior/xor (with &=, |=, ^=) for the eight integer types only. */ \
+ T (int8_t, and, &=) \
+ T (int16_t, and, &=) \
+ T (int32_t, and, &=) \
+ T (int64_t, and, &=) \
+ T (uint8_t, and, &=) \
+ T (uint16_t, and, &=) \
+ T (uint32_t, and, &=) \
+ T (uint64_t, and, &=) \
+ \
+ T (int8_t, ior, |=) \
+ T (int16_t, ior, |=) \
+ T (int32_t, ior, |=) \
+ T (int64_t, ior, |=) \
+ T (uint8_t, ior, |=) \
+ T (uint16_t, ior, |=) \
+ T (uint32_t, ior, |=) \
+ T (uint64_t, ior, |=) \
+ \
+ T (int8_t, xor, ^=) \
+ T (int16_t, xor, ^=) \
+ T (int32_t, xor, ^=) \
+ T (int64_t, xor, ^=) \
+ T (uint8_t, xor, ^=) \
+ T (uint16_t, xor, ^=) \
+ T (uint32_t, xor, ^=) \
+ T (uint64_t, xor, ^=)
+
+TEST_BITWISE (DEF_REDUC_BITWISE) /* Emit the row-wise and/ior/xor reductions for the eight integer types. */
+
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 8 } } */
+/* { dg-final { scan-assembler-times {vfredusum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vfredmax\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
+/* { dg-final { scan-assembler-times {vfredmin\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 3 } } */
new file mode 100644
@@ -0,0 +1,65 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+
+unsigned short __attribute__((noipa)) /* Sum of x[0..n-1], converted back to unsigned short on every step; 0 when n <= 0. */
+add_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0; /* Identity value for addition. */
+ for (int i = 0; i < n; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Minimum of x[0..n-1]; the all-ones seed is returned when n <= 0. */
+min_loop (unsigned short *x, int n)
+{
+ unsigned short res = ~0; /* ~0 converts to the largest unsigned short, the identity for min. */
+ for (int i = 0; i < n; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Maximum of x[0..n-1]; 0 when n <= 0. */
+max_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0; /* Smallest unsigned value, the identity for max. */
+ for (int i = 0; i < n; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise AND of x[0..n-1]; the all-ones seed is returned when n <= 0. */
+and_loop (unsigned short *x, int n)
+{
+ unsigned short res = ~0; /* All bits set, the identity for AND. */
+ for (int i = 0; i < n; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise inclusive OR of x[0..n-1]; 0 when n <= 0. */
+or_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0; /* Identity value for OR. */
+ for (int i = 0; i < n; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise exclusive OR of x[0..n-1]; 0 when n <= 0. */
+eor_loop (unsigned short *x, int n)
+{
+ unsigned short res = 0; /* Identity value for XOR. */
+ for (int i = 0; i < n; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
new file mode 100644
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+
+unsigned short __attribute__((noipa)) /* res plus the sum of x[0..n-1], converted back to unsigned short on every step. */
+add_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res += x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Minimum of res and x[0..n-1]. */
+min_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res = res < x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Maximum of res and x[0..n-1]. */
+max_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res = res > x[i] ? res : x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise AND of res and x[0..n-1]. */
+and_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res &= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise inclusive OR of res and x[0..n-1]. */
+or_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res |= x[i];
+ return res;
+}
+
+unsigned short __attribute__((noipa)) /* Bitwise exclusive OR of res and x[0..n-1]. */
+eor_loop (unsigned short *x, int n, unsigned short res) /* res: caller-supplied initial value, returned unchanged when n <= 0. */
+{
+ for (int i = 0; i < n; ++i)
+ res ^= x[i];
+ return res;
+}
+
+/* { dg-final { scan-assembler-times {vredsum\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredmaxu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredminu\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredand\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vredxor\.vs\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 1 } } */
new file mode 100644
@@ -0,0 +1,56 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include "reduc-1.c"
+
+#define NUM_ELEMS(TYPE) (73 + sizeof (TYPE)) /* Element count handed to each reduction; it varies with the element size. */
+
+#define INIT_VECTOR(TYPE) /* Declare TYPE a[NUM_ELEMS (TYPE) + 1] in the enclosing scope and fill every element. */ \
+ TYPE a[NUM_ELEMS (TYPE) + 1]; \
+ for (int i = 0; i < NUM_ELEMS (TYPE) + 1; i++) \
+ { \
+ a[i] = ((i * 2) * (i & 1 ? 1 : -1) | 3); /* 2*i for odd i, -2*i for even i, then the low two bits forced on; converted to TYPE on store. */ \
+ asm volatile ("" ::: "memory"); /* Empty asm with a memory clobber after every store; presumably keeps this init loop from being vectorized -- confirm. */ \
+ }
+
+#define TEST_REDUC_PLUS(TYPE) /* Compare reduc_plus_<TYPE> with a reference sum over the same NUM_ELEMS (TYPE) elements; abort on mismatch. */ \
+ { \
+ INIT_VECTOR (TYPE); \
+ TYPE r1 = reduc_plus_##TYPE (a, NUM_ELEMS (TYPE)); /* Function under test, defined in the included reduc-1.c. */ \
+ volatile TYPE r2 = 0; /* volatile reference accumulator; presumably keeps the reference loop scalar -- confirm. */ \
+ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
+ r2 += a[i]; \
+ if (r1 != r2) /* Exact comparison, for the floating-point types as well. */ \
+ __builtin_abort (); \
+ }
+
+#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) /* Compare reduc_<NAME>_<TYPE> with a reference fold using CMP_OP; abort on mismatch. */ \
+ { \
+ INIT_VECTOR (TYPE); \
+ TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
+ volatile TYPE r2 = 13; /* Same seed as the function under test in reduc-1.c. */ \
+ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
+ r2 = a[i] CMP_OP r2 ? a[i] : r2; \
+ if (r1 != r2) \
+ __builtin_abort (); \
+ }
+
+#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) /* Compare reduc_<NAME>_<TYPE> with a reference fold using BIT_OP; abort on mismatch. */ \
+ { \
+ INIT_VECTOR (TYPE); \
+ TYPE r1 = reduc_##NAME##_##TYPE (a, NUM_ELEMS (TYPE)); \
+ volatile TYPE r2 = 13; /* Same seed as the function under test in reduc-1.c. */ \
+ for (int i = 0; i < NUM_ELEMS (TYPE); ++i) \
+ r2 BIT_OP a[i]; \
+ if (r1 != r2) \
+ __builtin_abort (); \
+ }
+
+int main () /* Run every PLUS, MAXMIN and BITWISE check; a mismatch aborts inside the TEST_REDUC_* block that found it. */
+{
+ TEST_PLUS (TEST_REDUC_PLUS) /* The TEST_* type lists come from the included reduc-1.c. */
+ TEST_MAXMIN (TEST_REDUC_MAXMIN)
+ TEST_BITWISE (TEST_REDUC_BITWISE)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,79 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "reduc-2.c"
+
+#define NROWS 53 /* Number of matrix rows reduced by each check. */
+
+/* -ffast-math fuzz for PLUS. */
+#define CMP__Float16(X, Y) ((X) >= (Y) * 0.875 && (X) <= (Y) * 1.125) /* Accept X in [0.875 * Y, 1.125 * Y]; the band is empty for negative Y. */
+#define CMP_float(X, Y) ((X) == (Y)) /* Every other type is compared exactly. */
+#define CMP_double(X, Y) ((X) == (Y))
+#define CMP_int8_t(X, Y) ((X) == (Y))
+#define CMP_int16_t(X, Y) ((X) == (Y))
+#define CMP_int32_t(X, Y) ((X) == (Y))
+#define CMP_int64_t(X, Y) ((X) == (Y))
+#define CMP_uint8_t(X, Y) ((X) == (Y))
+#define CMP_uint16_t(X, Y) ((X) == (Y))
+#define CMP_uint32_t(X, Y) ((X) == (Y))
+#define CMP_uint64_t(X, Y) ((X) == (Y))
+
+#define INIT_MATRIX(TYPE) /* Declare mat[NROWS][NUM_ELEMS (TYPE)] and the result array r[NROWS] in the enclosing scope; only mat is filled. */ \
+ TYPE mat[NROWS][NUM_ELEMS (TYPE)]; /* NUM_ELEMS comes from the included reduc-2.c. */ \
+ TYPE r[NROWS]; \
+ for (int i = 0; i < NROWS; i++) \
+ for (int j = 0; j < NUM_ELEMS (TYPE); j++) \
+ { \
+ mat[i][j] = i + (j * 2) * (j & 1 ? 1 : -1); /* Row index plus 2*j for odd j or -2*j for even j; converted to TYPE on store. */ \
+ asm volatile ("" ::: "memory"); /* Empty asm with a memory clobber after every store; presumably keeps this init loop from being vectorized -- confirm. */ \
+ }
+
+#define TEST_REDUC_PLUS(TYPE) /* Run reduc_plus_<TYPE> over all NROWS rows and check each r[i] against a reference row sum via CMP_<TYPE>. */ \
+ { \
+ INIT_MATRIX (TYPE); \
+ reduc_plus_##TYPE (mat, r, NROWS); /* Function under test, defined in the included reduc-2.c; writes r[0..NROWS-1]. */ \
+ for (int i = 0; i < NROWS; i++) \
+ { \
+ volatile TYPE r2 = 0; /* volatile reference accumulator; presumably keeps the reference loop scalar -- confirm. */ \
+ for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
+ r2 += mat[i][j]; \
+ if (!CMP_##TYPE (r[i], r2)) /* Tolerant for _Float16, exact for every other type. */ \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_REDUC_MAXMIN(TYPE, NAME, CMP_OP) /* Run reduc_<NAME>_<TYPE> over all NROWS rows and check each r[i] against a reference fold; abort on mismatch. */ \
+ { \
+ INIT_MATRIX (TYPE); \
+ reduc_##NAME##_##TYPE (mat, r, NROWS); \
+ for (int i = 0; i < NROWS; i++) \
+ { \
+ volatile TYPE r2 = mat[i][0]; /* Same seed as the function under test: the row's first element. */ \
+ for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
+ r2 = mat[i][j] CMP_OP r2 ? mat[i][j] : r2; \
+ if (r[i] != r2) /* Exact comparison for every type. */ \
+ __builtin_abort (); \
+ } \
+ }
+
+#define TEST_REDUC_BITWISE(TYPE, NAME, BIT_OP) /* Check reduc_<NAME>_<TYPE> row by row against a reference fold with BIT_OP. NOTE: main in this file does not instantiate this macro. */ \
+ { \
+ INIT_MATRIX (TYPE); \
+ reduc_##NAME##_##TYPE (mat, r, NROWS); /* Expects the callee name to contain "_" between NAME and TYPE. */ \
+ for (int i = 0; i < NROWS; i++) \
+ { \
+ volatile TYPE r2 = mat[i][0]; /* Seeded with the row's first element, which the loop below then applies again. */ \
+ for (int j = 0; j < NUM_ELEMS (TYPE); ++j) \
+ r2 BIT_OP mat[i][j]; \
+ if (r[i] != r2) \
+ __builtin_abort (); \
+ } \
+ }
+
+int main () /* Run the PLUS and MAXMIN row-reduction checks; a mismatch aborts inside the TEST_REDUC_* block that found it. */
+{
+ TEST_PLUS (TEST_REDUC_PLUS) /* The TEST_* type lists come from the included reduc-2.c. */
+ TEST_MAXMIN (TEST_REDUC_MAXMIN) /* NOTE: TEST_BITWISE (TEST_REDUC_BITWISE) is not run here. */
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,49 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include "reduc-3.c"
+
+#define N 0x1100 /* Size of the input array; larger than every length passed below. */
+
+int /* Check the reduc-3.c loops against precomputed results for several lengths, including 0. */
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2); /* Product is converted to unsigned short on store. */
+
+ if (add_loop (x, 0) != 0 /* Length 0 must return each loop's built-in initial value. */
+ || add_loop (x, 11) != 572
+ || add_loop (x, 0x100) != 22016
+ || add_loop (x, 0xfff) != 20480
+ || max_loop (x, 0) != 0
+ || max_loop (x, 11) != 132
+ || max_loop (x, 0x100) != 65280
+ || max_loop (x, 0xfff) != 65504
+ || or_loop (x, 0) != 0
+ || or_loop (x, 11) != 0xfe
+ || or_loop (x, 0x80) != 0x7ffe
+ || or_loop (x, 0xb4) != 0x7ffe
+ || or_loop (x, 0xb5) != 0xfffe
+ || eor_loop (x, 0) != 0
+ || eor_loop (x, 11) != 0xe8
+ || eor_loop (x, 0x100) != 0xcf00
+ || eor_loop (x, 0xfff) != 0xa000)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i]; /* Bitwise-complement the inputs before exercising min_loop and and_loop. */
+
+ if (min_loop (x, 0) != 65535
+ || min_loop (x, 11) != 65403
+ || min_loop (x, 0x100) != 255
+ || min_loop (x, 0xfff) != 31
+ || and_loop (x, 0) != 0xffff
+ || and_loop (x, 11) != 0xff01
+ || and_loop (x, 0x80) != 0x8001
+ || and_loop (x, 0xb4) != 0x8001
+ || and_loop (x, 0xb5) != 1)
+ __builtin_abort ();
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,66 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable -ffast-math -fno-vect-cost-model" } */
+
+#include "reduc-4.c"
+
+#define N 0x1100 /* Size of the input array; larger than every length passed below. */
+
+int /* Check the reduc-4.c loops, which take a caller-supplied initial value, against precomputed results. */
+main (void)
+{
+ unsigned short x[N];
+ for (int i = 0; i < N; ++i)
+ x[i] = (i + 1) * (i + 2); /* Product is converted to unsigned short on store. */
+
+ if (add_loop (x, 0, 10) != 10 /* Length 0 must return the supplied initial value unchanged. */
+ || add_loop (x, 11, 42) != 614
+ || add_loop (x, 0x100, 84) != 22100
+ || add_loop (x, 0xfff, 20) != 20500
+ || max_loop (x, 0, 10) != 10
+ || max_loop (x, 11, 131) != 132 /* max cases come in pairs: a seed just below, then just above, the array maximum. */
+ || max_loop (x, 11, 133) != 133
+ || max_loop (x, 0x100, 65279) != 65280
+ || max_loop (x, 0x100, 65281) != 65281
+ || max_loop (x, 0xfff, 65503) != 65504
+ || max_loop (x, 0xfff, 65505) != 65505
+ || or_loop (x, 0, 0x71) != 0x71
+ || or_loop (x, 11, 0) != 0xfe
+ || or_loop (x, 11, 0xb3c) != 0xbfe
+ || or_loop (x, 0x80, 0) != 0x7ffe
+ || or_loop (x, 0x80, 1) != 0x7fff
+ || or_loop (x, 0xb4, 0) != 0x7ffe
+ || or_loop (x, 0xb4, 1) != 0x7fff
+ || or_loop (x, 0xb5, 0) != 0xfffe
+ || or_loop (x, 0xb5, 1) != 0xffff
+ || eor_loop (x, 0, 0x3e) != 0x3e
+ || eor_loop (x, 11, 0) != 0xe8
+ || eor_loop (x, 11, 0x1ff) != 0x117
+ || eor_loop (x, 0x100, 0) != 0xcf00
+ || eor_loop (x, 0x100, 0xeee) != 0xc1ee
+ || eor_loop (x, 0xfff, 0) != 0xa000
+ || eor_loop (x, 0xfff, 0x8888) != 0x2888)
+ __builtin_abort ();
+
+ for (int i = 0; i < N; ++i)
+ x[i] = ~x[i]; /* Bitwise-complement the inputs before exercising min_loop and and_loop. */
+
+ if (min_loop (x, 0, 10000) != 10000
+ || min_loop (x, 11, 65404) != 65403 /* min cases come in pairs: a seed just above, then just below, the array minimum. */
+ || min_loop (x, 11, 65402) != 65402
+ || min_loop (x, 0x100, 256) != 255
+ || min_loop (x, 0x100, 254) != 254
+ || min_loop (x, 0xfff, 32) != 31
+ || min_loop (x, 0xfff, 30) != 30
+ || and_loop (x, 0, 0x1234) != 0x1234
+ || and_loop (x, 11, 0xffff) != 0xff01
+ || and_loop (x, 11, 0xcdef) != 0xcd01
+ || and_loop (x, 0x80, 0xffff) != 0x8001
+ || and_loop (x, 0x80, 0xfffe) != 0x8000
+ || and_loop (x, 0xb4, 0xffff) != 0x8001
+ || and_loop (x, 0xb4, 0xfffe) != 0x8000
+ || and_loop (x, 0xb5, 0xffff) != 1
+ || and_loop (x, 0xb5, 0xfffe) != 0)
+ __builtin_abort ();
+
+ return 0;
+}
@@ -71,6 +71,8 @@ foreach op $AUTOVEC_TEST_OPTS {
"" "$op"
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/ternop/*.\[cS\]]] \
"" "$op"
+ dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/autovec/reduc/*.\[cS\]]] \
+ "" "$op"
}
# widening operation only test on LMUL < 8