ifcvt/vect: Emit COND_ADD for conditional scalar reduction.

Message ID 0193b63e-98dc-42bc-cd33-485361ea50bf@gmail.com
State Unresolved
Headers
Series ifcvt/vect: Emit COND_ADD for conditional scalar reduction. |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Robin Dapp Sept. 20, 2023, 1:51 p.m. UTC
  Hi,

as described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD during ifcvt and
adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

Related question/change: We only allow PLUS_EXPR in fold_left_reduction_fn
but have code to handle MINUS_EXPR in vectorize_fold_left_reduction.  I
suppose that's intentional but it "just works" on riscv and the testsuite
doesn't change when allowing MINUS_EXPR so I went ahead and did that.

Bootstrapped and regtested on x86 and aarch64.

Regards
 Robin

gcc/ChangeLog:

	PR middle-end/111401
	* internal-fn.cc (cond_fn_p): New function.
	* internal-fn.h (cond_fn_p): Define.
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(vect_is_simple_reduction): Don't count else operand in
	COND_ADD.
	(vectorize_fold_left_reduction): Add COND_ADD handling.
	(vectorizable_reduction): Don't count else operand in COND_ADD.
	(vect_transform_reduction): Add COND_ADD handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
---
 gcc/internal-fn.cc                            |  38 +++++
 gcc/internal-fn.h                             |   1 +
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         |  61 ++++++++
 gcc/tree-if-conv.cc                           |  63 ++++++--
 gcc/tree-vect-loop.cc                         | 130 ++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 7 files changed, 394 insertions(+), 42 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
  

Comments

Tamar Christina Sept. 27, 2023, 12:44 a.m. UTC | #1
Hi,

I can't approve but hope you don't mind the review,

> +/* Return true if this CODE describes a conditional (masked) internal_fn.  */
> +
> +bool
> +cond_fn_p (code_helper code)
> +{
> +  if (!code.is_fn_code ())
> +    return false;
> +
> +  if (!internal_fn_p ((combined_fn) code))
> +    return false;
> +
> +  internal_fn fn = as_internal_fn ((combined_fn) code);
> +  switch (fn)
> +    {
> +    #undef DEF_INTERNAL_COND_FN
> +    #define DEF_INTERNAL_COND_FN(NAME, F, O, T)			  \
> +    case IFN_COND_##NAME:					  \
> +    case IFN_COND_LEN_##NAME:					  \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_COND_FN
> +
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +    #define DEF_INTERNAL_SIGNED_COND_FN(NAME, F, S, SO, UO, T)	  \
> +    case IFN_COND_##NAME:					  \
> +    case IFN_COND_LEN_##NAME:					  \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +
> +    default:
> +      return false;
> +    }
> +
> +  return false;
> +}
> +
> +

Could you not use conditional_internal_fn_code for this? Just check result is not ERROR_MARK?

>  /* Return true if this CODE describes an internal_fn that returns a vector with
>     elements twice as wide as the element size of the input vectors.  */
> 
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..f1cc9db29c0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
>  extern int first_commutative_argument (internal_fn);
>  extern bool associative_binary_fn_p (internal_fn);
>  extern bool widening_fn_p (code_helper);
> +extern bool cond_fn_p (code_helper code);
> 
>  extern bool set_edom_supported_p (void);
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..57c600838ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=c99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone)) reduc_plus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone)) reduc_minus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..1d559ce5391
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param
> +riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +__attribute__ ((optimize ("1")))
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0;
> +  double a1[SZ], a2[SZ];
> +  int c1[SZ], c2[SZ];
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 1;
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2;
> +  double ref1 = init1, ref2 = init2;
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 2 "vect" } }
> +*/
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc index
> 799f071965e..425976b0861 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc,
> gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1),
> reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1),
> reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
> 
>    if (dump_file && (dump_flags & TDF_DETAILS)) @@ -1864,19 +1866,52
> @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator
> *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
> 
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it
> + directly.  */  internal_fn ifn;  ifn = get_conditional_internal_fn
> + (reduction_op);
> 
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
> 
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi,
> gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs); diff --git a/gcc/tree-vect-
> loop.cc b/gcc/tree-vect-loop.cc index 23c6e8259e7..94d3cead1e6 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop,
> vec_info_shared *shared)  static bool  fold_left_reduction_fn (code_helper
> code, internal_fn *reduc_fn)  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper
> code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
> 
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
> 
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info
> loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
> 
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
> 
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -
> 6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,40 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
> 
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (code.is_tree_code ())
> +    code = tree_code (code);
> +  else

This if is not needed, is it? It's going to convert the tree_code back to the code_helper.

> +    {
> +      gcc_assert (cond_fn_p (code));
> +      is_cond_op = true;
> +      code = conditional_internal_fn_code (internal_fn (code));
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
> 
>    if (slp_node)
>      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>  			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> 
> -  tree op0 = ops[1 - reduc_index];
> +  /* The operands either come from a binary operation or an IFN_COND
> operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */  gcc_assert (num_ops == 2 ||
> + num_ops == 4);  tree op0, opmask;  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
> 
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2); @@ -6903,9 +6938,17 @@
> vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
> 
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = is_gimple_call (sdef)
> +		       ? gimple_call_lhs (sdef)
> +		       : gimple_assign_lhs (scalar_dest_def_info->stmt);

This can be tree scalar_dest = gimple_get_lhs (sdef);

>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
> 
> @@ -6939,17 +6982,20 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num,
> vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];

Isn't vec_opmask NULL for SLP? You probably need to read it from vec_defs for the COND_EXPR?

>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
> 
>        /* Handle MINUS by adding the negative.  */
> -      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
> +      if (reduc_fn != IFN_LAST && tree_code (code) == MINUS_EXPR)

Change isn't needed, equality on code_helper is overloaded based on the type,
so it'll automatically do a tree_code comparison here.

>  	{
>  	  tree negated = make_ssa_name (vectype_out);
>  	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
> @@ -6957,7 +7003,8 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>  	  def0 = negated;
>  	}
> 
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
> 
> @@ -6988,8 +7035,8 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */ @@ -7440,6
> +7487,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
> 
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7692,13 @@ vectorizable_reduction (loop_vec_info
> loop_vinfo,
>            when generating the code inside the loop.  */
> 
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following
> +checks.  */
> +  if (cond_fn_p (orig_code))
> +      orig_code = conditional_internal_fn_code
> +	(as_internal_fn(combined_fn (orig_code)));
> +

Since orig_code must be an ifn at this point you can just cast, so

  if (cond_fn_p (orig_code))
      orig_code = conditional_internal_fn_code (internal_fn(orig_code));

>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE
> (reduc_info); @@ -7678,7 +7737,7 @@ vectorizable_reduction
> (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8272,7 @@ vect_transform_reduction (loop_vec_info
> loop_vinfo,
> 
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> vectype_in); @@ -8231,17 +8291,21 @@ vect_transform_reduction
> (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
> 
> +  /* A COND_OP reduction must have the same definition and else value.
> + */  if (cond_fn_p (code))
> +    gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +

This assert is only valid for binary cond expressions.  Perhaps instead of a generic cond_fn_p
you should explicitly list the COND values that are expected to be used in a reduction.

I believe Richi usually likes this because it catches unexpected values.

Thanks,
Tamar

>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE
> (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p (code));
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
> 
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE
> (reduc_info); @@ -8254,14 +8318,20 @@ vect_transform_reduction
> (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> 
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8371,7 @@ vect_transform_reduction (loop_vec_info
> loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
> 
>  	  if (masked_loop_p && mask_by_cond_expr) @@ -8314,10
> +8384,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info,
> gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p (code))
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (cond_fn_p (code))
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref
> (vec_info *,
>  						  tree);
> 
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool =
> +true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info
> loop_vinfo);  bool vect_rgroup_iv_might_wrap_p (loop_vec_info,
> rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
> --
> 2.41.0
  
Richard Biener Sept. 27, 2023, 11:42 a.m. UTC | #2
On Wed, 20 Sep 2023, Robin Dapp wrote:

> Hi,
> 
> as described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
> 
> Related question/change: We only allow PLUS_EXPR in fold_left_reduction_fn
> but have code to handle MINUS_EXPR in vectorize_fold_left_reduction.  I
> suppose that's intentional but it "just works" on riscv and the testsuite
> doesn't change when allowing MINUS_EXPR so I went ahead and did that.
> 
> Bootstrapped and regtested on x86 and aarch64.

I think overall the patch is fine - please address Tamar's comments
though, those look valid.

Thanks,
Richard.

> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* internal-fn.cc (cond_fn_p): New function.
> 	* internal-fn.h (cond_fn_p): Define.
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  gcc/internal-fn.cc                            |  38 +++++
>  gcc/internal-fn.h                             |   1 +
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         |  61 ++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 130 ++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 394 insertions(+), 42 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0fd34359247..77939890f5a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4241,6 +4241,44 @@ first_commutative_argument (internal_fn fn)
>      }
>  }
>  
> +/* Return true if this CODE describes a conditional (masked) internal_fn.  */
> +
> +bool
> +cond_fn_p (code_helper code)
> +{
> +  if (!code.is_fn_code ())
> +    return false;
> +
> +  if (!internal_fn_p ((combined_fn) code))
> +    return false;
> +
> +  internal_fn fn = as_internal_fn ((combined_fn) code);
> +  switch (fn)
> +    {
> +    #undef DEF_INTERNAL_COND_FN
> +    #define DEF_INTERNAL_COND_FN(NAME, F, O, T)			  \
> +    case IFN_COND_##NAME:					  \
> +    case IFN_COND_LEN_##NAME:					  \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_COND_FN
> +
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +    #define DEF_INTERNAL_SIGNED_COND_FN(NAME, F, S, SO, UO, T)	  \
> +    case IFN_COND_##NAME:					  \
> +    case IFN_COND_LEN_##NAME:					  \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +
> +    default:
> +      return false;
> +    }
> +
> +  return false;
> +}
> +
> +
>  /* Return true if this CODE describes an internal_fn that returns a vector with
>     elements twice as wide as the element size of the input vectors.  */
>  
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..f1cc9db29c0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
>  extern int first_commutative_argument (internal_fn);
>  extern bool associative_binary_fn_p (internal_fn);
>  extern bool widening_fn_p (code_helper);
> +extern bool cond_fn_p (code_helper code);
>  
>  extern bool set_edom_supported_p (void);
>  
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..57c600838ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=c99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..1d559ce5391
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +__attribute__ ((optimize ("1")))
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0;
> +  double a1[SZ], a2[SZ];
> +  int c1[SZ], c2[SZ];
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 1;
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2;
> +  double ref1 = init1, ref2 = init2;
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 799f071965e..425976b0861 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
>  
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..94d3cead1e6 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,40 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (code.is_tree_code ())
> +    code = tree_code (code);
> +  else
> +    {
> +      gcc_assert (cond_fn_p (code));
> +      is_cond_op = true;
> +      code = conditional_internal_fn_code (internal_fn (code));
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
>      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>  			  TYPE_VECTOR_SUBPARTS (vectype_in)));
>  
> -  tree op0 = ops[1 - reduc_index];
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6938,17 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = is_gimple_call (sdef)
> +		       ? gimple_call_lhs (sdef)
> +		       : gimple_assign_lhs (scalar_dest_def_info->stmt);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -6939,17 +6982,20 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> -      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
> +      if (reduc_fn != IFN_LAST && tree_code (code) == MINUS_EXPR)
>  	{
>  	  tree negated = make_ssa_name (vectype_out);
>  	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
> @@ -6957,7 +7003,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -6988,8 +7035,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7487,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7692,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (cond_fn_p (orig_code))
> +      orig_code = conditional_internal_fn_code
> +	(as_internal_fn(combined_fn (orig_code)));
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7737,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8272,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8291,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A COND_OP reduction must have the same definition and else value. */
> +  if (cond_fn_p (code))
> +    gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p (code));
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8318,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8371,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8384,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p (code))
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (cond_fn_p (code))
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
>
  
Robin Dapp Oct. 4, 2023, 7:54 a.m. UTC | #3
Hi Tamar,

> I can't approve but hope you don't mind the review,

Not at all, greatly appreciated.

I incorporated all your remarks apart from this:

> Isn't vec_opmask NULL for SLP? You probably need to read it from
> vec_defs for the COND_EXPR

Above that I gcc_assert (!slp_node) for the IFN_COND case.  The assert
doesn't seem to be hit during testsuite runs.  I also didn't manage to
create an example that would trigger it.  When "conditionalizing" an SLP
fold-left reduction, vectorization already seems to fail for a different
reason.  Granted, I didn't look very closely at that reason :)

Bootstrap and testsuite are currently running with the attached v2
on x86, aarch64 and powerpc.

Besides, when thinking about which COND_OPs we expect, I tried to loosen
the restrictions in if-conv by allowing MAX_EXPR and MIN_EXPR.  The
emitted code on riscv looks correct, but I hit a bootstrap ICE on x86,
so I omitted it for now.

Regards
 Robin


Subject: [PATCH v2] ifcvt/vect: Emit COND_ADD for conditional scalar
 reduction.

As described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD during ifcvt and
adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

gcc/ChangeLog:

	PR middle-end/111401
	* internal-fn.cc (cond_fn_p): New function.
	* internal-fn.h (cond_fn_p): Define.
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(vect_is_simple_reduction): Don't count else operand in
	COND_ADD.
	(vectorize_fold_left_reduction): Add COND_ADD handling.
	(vectorizable_reduction): Don't count else operand in COND_ADD.
	(vect_transform_reduction): Add COND_ADD handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
---
 gcc/internal-fn.cc                            |  17 +++
 gcc/internal-fn.h                             |   1 +
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++++++
 gcc/tree-if-conv.cc                           |  63 ++++++--
 gcc/tree-vect-loop.cc                         | 129 ++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 7 files changed, 450 insertions(+), 42 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 61d5a9e4772..9b38dc0cef4 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4245,6 +4245,23 @@ first_commutative_argument (internal_fn fn)
     }
 }
 
+/* Return true if this CODE describes a conditional (masked) internal_fn.  */
+
+bool
+cond_fn_p (code_helper code)
+{
+  if (!code.is_fn_code ())
+    return false;
+
+  if (!internal_fn_p ((combined_fn) code))
+    return false;
+
+  internal_fn fn = as_internal_fn ((combined_fn) code);
+
+  return conditional_internal_fn_code (fn) != ERROR_MARK;
+}
+
+
 /* Return true if this CODE describes an internal_fn that returns a vector with
    elements twice as wide as the element size of the input vectors.  */
 
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 99de13a0199..f1cc9db29c0 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
 extern int first_commutative_argument (internal_fn);
 extern bool associative_binary_fn_p (internal_fn);
 extern bool widening_fn_p (code_helper);
+extern bool cond_fn_p (code_helper code);
 
 extern bool set_edom_supported_p (void);
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..7b46e7d8a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone))
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      cond1[i] = 0;
+      cond2[i] = i & 4 ? 1 : 0;
+      asm volatile ("" ::: "memory");
+    }
+
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..8f1cb0d68de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init += a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init -= a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo4 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init *= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo5 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init &= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo6 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init |= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo7 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init ^= a[i];
+  return init;
+}
+
+#define SZ 125
+
+int
+main ()
+{
+  double res1 = 0, res2 = 0, res3 = 0;
+  double a1[SZ], a2[SZ], a3[SZ];
+  int c1[SZ], c2[SZ], c3[SZ];
+
+  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
+  int res4 = 0, res5 = 0, res6 = 0;
+  int c4[SZ], c5[SZ], c6[SZ];
+
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);
+      a2[i] = i * 3 + (i & 4) - (i & 7);
+      a3[i] = i * 0.05 + (i & 4) - (i & 7);
+      a4[i] = i * 3 + (i & 4) - (i & 7);
+      a5[i] = i * 3 + (i & 4) - (i & 7);
+      a6[i] = i * 3 + (i & 4) - (i & 7);
+      c1[i] = i & 1;
+      c2[i] = i & 2;
+      c3[i] = i & 3;
+      c4[i] = i & 4;
+      c5[i] = i & 5;
+      c6[i] = i & 6;
+      __asm__ volatile ("" : : : "memory");
+    }
+
+  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
+  double ref1 = init1, ref2 = init2, ref3 = init3;
+
+  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
+  int ref4 = init4, ref5 = init5, ref6 = init6;
+
+#pragma GCC novector
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+      if (c3[i])
+        ref3 *= a3[i];
+      if (c4[i])
+        ref4 &= a4[i];
+      if (c5[i])
+        ref5 |= a5[i];
+      if (c6[i])
+        ref6 ^= a6[i];
+    }
+
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  res3 = foo4 (a3, init3, c3, SZ);
+  res4 = foo5 (a4, init4, c4, SZ);
+  res5 = foo6 (a5, init5, c5, SZ);
+  res6 = foo7 (a6, init6, c6, SZ);
+
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+  if (res3 != ref3)
+    __builtin_abort ();
+  if (res4 != ref4)
+    __builtin_abort ();
+  if (res5 != ref5)
+    __builtin_abort ();
+  if (res6 != ref6)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index a8c915913ae..b334173794d 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
+  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
+     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
+     a vectorizable call as we can create a COND version of it directly.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
 
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  bool try_cond_op = true;
+  gimple *opstmt;
+  if (TREE_CODE (op1) == SSA_NAME
+      && (opstmt = SSA_NAME_DEF_STMT (op1))
+      && is_gimple_call (opstmt))
+    {
+      combined_fn cfn = gimple_call_combined_fn (opstmt);
+      internal_fn ifnop;
+      reduction_fn_for_scalar_code (cfn, &ifnop);
+      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
+					      (gimple_call_lhs (opstmt))))
+	try_cond_op = false;
+    }
+
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && try_cond_op && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 23c6e8259e7..d370793cfcb 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
+   In that case no signed zero is returned.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one. */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
-  
+
   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.truncate (0);
@@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6877,17 +6889,38 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (!code.is_tree_code ())
+    {
+      gcc_assert (cond_fn_p (code));
+      is_cond_op = true;
+      code = conditional_internal_fn_code (internal_fn (code));
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
 
-  tree op0 = ops[1 - reduc_index];
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6903,9 +6936,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = gimple_get_lhs (sdef);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -6939,13 +6978,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
@@ -6957,7 +6999,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -6988,8 +7031,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7440,6 +7483,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
          leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
@@ -7640,6 +7688,12 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If conversion might have created a conditional operation like
+     IFN_COND_ADD already.  Use the internal code for the following checks.  */
+  if (cond_fn_p (orig_code))
+    orig_code = conditional_internal_fn_code (internal_fn (orig_code));
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7678,7 +7732,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8213,6 +8267,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8231,17 +8286,25 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A binary COND_OP reduction must have the same definition and else
+     value. */
+  if (cond_fn_p (code))
+    {
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB);
+      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+    }
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p (code));
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8254,14 +8317,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8301,7 +8370,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8314,10 +8383,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p (code))
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (cond_fn_p (code))
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f1d0cd79961..e22067400af 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */
  
Robin Dapp Oct. 4, 2023, 1:15 p.m. UTC | #4
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB);

I forgot to add the other IFN_CONDs here before sending.  So with

-      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB);
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
+                   || code == IFN_COND_MUL || code == IFN_COND_AND
+                   || code == IFN_COND_IOR || code == IFN_COND_XOR);

on top, bootstrap and testsuites on x86, aarch64 and power10 are
unchanged.

Regards
 Robin
  
Tamar Christina Oct. 4, 2023, 3:12 p.m. UTC | #5
Hi Robin,

> -----Original Message-----
> From: Robin Dapp <rdapp.gcc@gmail.com>
> Sent: Wednesday, October 4, 2023 8:54 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches <gcc-
> patches@gcc.gnu.org>; Richard Biener <rguenther@suse.de>
> Cc: rdapp.gcc@gmail.com
> Subject: Re: [PATCH] ifcvt/vect: Emit COND_ADD for conditional scalar
> reduction.
> 
> Hi Tamar,
> 
> > I can't approve but hope you don't mind the review,
> 
> Not at all, greatly appreciated.
> 
> I incorporated all your remarks apart from this:
> 
> > Isn't vec_opmask NULL for SLP? You probably need to read it from
> > vec_defs for the COND_EXPR
> 
> Above that I gcc_assert (!slp_node) for the IFN_COND case.  It doesn't seem to
> be hit during testsuite runs.  I also didn't manage to create an example that
> would trigger it.  When "conditionalizing" an SLP fold-left reduction we don't
> seem to vectorize for a different reason.
> Granted, I didn't look very closely at that reason :)

Yeah it looks like it's failing because it can't handle the PHI node reduction for
the condition.

So that's fine.  I think we should then exit from vectorize_fold_left_reduction
in that case, so that we avoid a segfault once we start forcing everything
through SLP only (which will happen soon) and add single-lane SLP support.

So in the

  if (slp_node)
    {

Add something like:

if (is_cond_op)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "left fold reduction on SLP not supported.\n");
      return false;
    }

> 
> Bootstrap and testsuite are currently running with the attached v2 on x86,
> aarch64 and powerpc.
> 
> Besides, when thinking about which COND_OPs we expect I tried to loosen the
> restrictions in if-conv by allowing MAX_EXPR and MIN_EXPR.  The emitted
> code on riscv looks correct but I hit a bootstrap ICE on x86 so omitted it for
> now.
> 
> Regards
>  Robin
> 
> 
> Subject: [PATCH v2] ifcvt/vect: Emit COND_ADD for conditional scalar
> reduction.
> 
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both into a
> masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS is
> true.
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* internal-fn.cc (cond_fn_p): New function.
> 	* internal-fn.h (cond_fn_p): Define.
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  gcc/internal-fn.cc                            |  17 +++
>  gcc/internal-fn.h                             |   1 +
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 129 ++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 450 insertions(+), 42 deletions(-)  create mode 100644
> gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index
> 61d5a9e4772..9b38dc0cef4 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4245,6 +4245,23 @@ first_commutative_argument (internal_fn fn)
>      }
>  }
> 
> +/* Return true if this CODE describes a conditional (masked) internal_fn.  */
> +
> +bool
> +cond_fn_p (code_helper code)
> +{
> +  if (!code.is_fn_code ())
> +    return false;
> +
> +  if (!internal_fn_p ((combined_fn) code))
> +    return false;
> +
> +  internal_fn fn = as_internal_fn ((combined_fn) code);
> +
> +  return conditional_internal_fn_code (fn) != ERROR_MARK;
> +}
> +
> +
>  /* Return true if this CODE describes an internal_fn that returns a vector with
>     elements twice as wide as the element size of the input vectors.  */
> 

The only comment I have is whether you actually need this helper function.
It looks like all of its uses are in places where you have already called, or
will call, conditional_internal_fn_code directly.

e.g. in vect_transform_reduction you can replace it by 

bool cond_fn_p = cond_fn != ERROR_MARK;

and in 

  if (cond_fn_p (orig_code))
      orig_code = conditional_internal_fn_code (internal_fn(orig_code));

just 

internal_fn new_fn = conditional_internal_fn_code (internal_fn(orig_code));
if (new_fn != ERROR_MARK)
  orig_code = new_fn;

which would save the repeated testing of the condition.

Patch looks good to me with those two changes, but can't approve 😊

Thanks for working on this!
Tamar

> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h index
> 99de13a0199..f1cc9db29c0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
>  extern int first_commutative_argument (internal_fn);
>  extern bool associative_binary_fn_p (internal_fn);
>  extern bool widening_fn_p (code_helper);
> +extern bool cond_fn_p (code_helper code);
> 
>  extern bool set_edom_supported_p (void);
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-
> zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone)) reduc_plus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone)) reduc_minus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..8f1cb0d68de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..b334173794d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
> 
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
> 
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> 
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
> 
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi,
> gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs); diff --git a/gcc/tree-vect-
> loop.cc b/gcc/tree-vect-loop.cc index 23c6e8259e7..d370793cfcb 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop,
> vec_info_shared *shared)  static bool  fold_left_reduction_fn (code_helper
> code, internal_fn *reduc_fn)  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper
> code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
> 
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
> 
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info
> loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
> 
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
> 
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype, @@ -
> 6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,38 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
> 
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      gcc_assert (cond_fn_p (code));
> +      is_cond_op = true;
> +      code = conditional_internal_fn_code (internal_fn (code));
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
> 
>    if (slp_node)
>      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>  			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> 
> -  tree op0 = ops[1 - reduc_index];
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
> 
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2); @@ -6903,9 +6936,15 @@
> vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
> 
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
> 
> @@ -6939,13 +6978,16 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num,
> vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
> 
>        /* Handle MINUS by adding the negative.  */ @@ -6957,7 +6999,8 @@
> vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
> 
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
> 
> @@ -6988,8 +7031,8 @@ vectorize_fold_left_reduction (loop_vec_info
> loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */ @@ -7440,6
> +7483,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
> 
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7688,12 @@ vectorizable_reduction (loop_vec_info
> loop_vinfo,
>            when generating the code inside the loop.  */
> 
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following
> + checks.  */  if (cond_fn_p (orig_code))
> +    orig_code = conditional_internal_fn_code (internal_fn (orig_code));
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE
> (reduc_info); @@ -7678,7 +7732,7 @@ vectorizable_reduction
> (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8267,7 @@ vect_transform_reduction (loop_vec_info
> loop_vinfo,
> 
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn,
> vectype_in); @@ -8231,17 +8286,25 @@ vect_transform_reduction
> (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
> 
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  if (cond_fn_p (code))
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE
> (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p (code));
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
> 
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE
> (reduc_info); @@ -8254,14 +8317,20 @@ vect_transform_reduction
> (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> 
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8370,7 @@ vect_transform_reduction (loop_vec_info
> loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
> 
>  	  if (masked_loop_p && mask_by_cond_expr) @@ -8314,10
> +8383,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info,
> gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p (code))
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (cond_fn_p (code))
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref
> (vec_info *,
>  						  tree);
> 
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool =
> +true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info
> loop_vinfo);  bool vect_rgroup_iv_might_wrap_p (loop_vec_info,
> rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
> --
> 2.41.0
>
  
Robin Dapp Oct. 5, 2023, 8:54 a.m. UTC | #6
Hi Tamar,

> So in the
> 
>   if (slp_node)
>     {
> 
> Add something like:
> 
> if (is_cond_op)
>     {
>       if (dump_enabled_p ())
> 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> 			 "left fold reduction on SLP not supported.\n");
>       return false;
>     }

Yes, seems reasonable, added.

> The only comment I have is whether you actually need this helper function?
> It looks like all the uses of it are in cases you have, or will call conditional_internal_fn_code
> directly.
> 
> e.g. in vect_transform_reduction you can replace it by 
> 
> bool cond_fn_p = cond_fn != ERROR_MARK;
> 
> and in 
> 
>   if (cond_fn_p (orig_code))
>       orig_code = conditional_internal_fn_code (internal_fn(orig_code));
> 
> just 
> 
> internal_fn new_fn = conditional_internal_fn_code (internal_fn(orig_code));
> if (new_fn != ERROR_MARK)
>   orig_code = new_fn;
> 
> which would save the repeated testing of the condition.

I see what you mean.  One complication is that we want to disambiguate
(among others):

 (1) code = IFN_COND_ADD, cond_fn = IFN_LAST.   (new case)
 (2) code = IFN_MAX, cond_fn = IFN_COND_MAX.
 (3) code = IFN_SOMETHING, cond_fn = IFN_LAST.

So just checking cond_fn is not enough (even if we made
get_conditional_internal_fn (IFN_COND_ADD) return IFN_COND_ADD).
We need to know if the initial code already was an IFN_COND.

It's a bit of a mess but I didn't dare untangle it.  Well, actually, I
tried but made it worse ;)  The cond_fn_p check seemed the least
intrusive to me.  Maybe you have another idea?

Regards
 Robin
  
Robin Dapp Oct. 5, 2023, 9:02 a.m. UTC | #7
Ah, sorry, read your remark incorrectly.  Will try again.

Regards
 Robin
  
Robin Dapp Oct. 5, 2023, 2:05 p.m. UTC | #8
Hi Tamar,

> The only comment I have is whether you actually need this helper
> function? It looks like all the uses of it are in cases you have, or
> will call conditional_internal_fn_code directly.
I removed cond_fn_p entirely in the attached v3.

Bootstrapped and regtested on x86_64, aarch64 and power10.

Regards
 Robin

Subject: [PATCH v3] ifcvt/vect: Emit COND_ADD for conditional scalar
 reduction.

As described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD during ifcvt and
adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

gcc/ChangeLog:

	PR middle-end/111401
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(vect_is_simple_reduction): Don't count else operand in
	COND_ADD.
	(vect_create_epilog_for_reduction): Fix whitespace.
	(vectorize_fold_left_reduction): Add COND_ADD handling.
	(vectorizable_reduction): Don't count else operand in COND_ADD.
	(vect_transform_reduction): Add COND_ADD handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
---
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
 gcc/tree-if-conv.cc                           |  63 ++++++--
 gcc/tree-vect-loop.cc                         | 150 ++++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 5 files changed, 451 insertions(+), 44 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..7b46e7d8a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone))
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      cond1[i] = 0;
+      cond2[i] = i & 4 ? 1 : 0;
+      asm volatile ("" ::: "memory");
+    }
+
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..8f1cb0d68de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init += a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init -= a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo4 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init *= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo5 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init &= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo6 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init |= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo7 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init ^= a[i];
+  return init;
+}
+
+#define SZ 125
+
+int
+main ()
+{
+  double res1 = 0, res2 = 0, res3 = 0;
+  double a1[SZ], a2[SZ], a3[SZ];
+  int c1[SZ], c2[SZ], c3[SZ];
+
+  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
+  int res4 = 0, res5 = 0, res6 = 0;
+  int c4[SZ], c5[SZ], c6[SZ];
+
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);
+      a2[i] = i * 3 + (i & 4) - (i & 7);
+      a3[i] = i * 0.05 + (i & 4) - (i & 7);
+      a4[i] = i * 3 + (i & 4) - (i & 7);
+      a5[i] = i * 3 + (i & 4) - (i & 7);
+      a6[i] = i * 3 + (i & 4) - (i & 7);
+      c1[i] = i & 1;
+      c2[i] = i & 2;
+      c3[i] = i & 3;
+      c4[i] = i & 4;
+      c5[i] = i & 5;
+      c6[i] = i & 6;
+      __asm__ volatile ("" : : : "memory");
+    }
+
+  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
+  double ref1 = init1, ref2 = init2, ref3 = init3;
+
+  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
+  int ref4 = init4, ref5 = init5, ref6 = init6;
+
+#pragma GCC novector
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+      if (c3[i])
+        ref3 *= a3[i];
+      if (c4[i])
+        ref4 &= a4[i];
+      if (c5[i])
+        ref5 |= a5[i];
+      if (c6[i])
+        ref6 ^= a6[i];
+    }
+
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  res3 = foo4 (a3, init3, c3, SZ);
+  res4 = foo5 (a4, init4, c4, SZ);
+  res5 = foo6 (a5, init5, c5, SZ);
+  res6 = foo7 (a6, init6, c6, SZ);
+
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+  if (res3 != ref3)
+    __builtin_abort ();
+  if (res4 != ref4)
+    __builtin_abort ();
+  if (res5 != ref5)
+    __builtin_abort ();
+  if (res6 != ref6)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index a8c915913ae..b334173794d 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
+  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
+     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
+     a vectorizable call as we can create a COND version of it directly.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
 
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  bool try_cond_op = true;
+  gimple *opstmt;
+  if (TREE_CODE (op1) == SSA_NAME
+      && (opstmt = SSA_NAME_DEF_STMT (op1))
+      && is_gimple_call (opstmt))
+    {
+      combined_fn cfn = gimple_call_combined_fn (opstmt);
+      internal_fn ifnop;
+      reduction_fn_for_scalar_code (cfn, &ifnop);
+      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
+					      (gimple_call_lhs (opstmt))))
+	try_cond_op = false;
+    }
+
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && try_cond_op && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 23c6e8259e7..3c80029eec5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
+   In that case no signed zero is returned.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one. */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
-  
+
   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.truncate (0);
@@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6877,17 +6889,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (!code.is_tree_code ())
+    {
+      code = conditional_internal_fn_code (internal_fn (code));
+      gcc_assert (code != ERROR_MARK);
+      is_cond_op = true;
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-			  TYPE_VECTOR_SUBPARTS (vectype_in)));
+    {
+      if (is_cond_op)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "fold-left reduction on SLP not supported.\n");
+	  return false;
+	}
 
-  tree op0 = ops[1 - reduc_index];
+      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+			    TYPE_VECTOR_SUBPARTS (vectype_in)));
+    }
+
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6903,9 +6946,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = gimple_get_lhs (sdef);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -6939,13 +6988,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
@@ -6957,7 +7009,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -6988,8 +7041,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7440,6 +7493,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
          leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
@@ -7640,6 +7698,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If conversion might have created a conditional operation like
+     IFN_COND_ADD already.  Use the internal code for the following checks.  */
+  if (orig_code.is_internal_fn ())
+    {
+      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
+      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
+    }
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7678,7 +7745,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8213,6 +8280,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8231,17 +8299,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A binary COND_OP reduction must have the same definition and else
+     value. */
+  bool cond_fn_p = code.is_internal_fn ()
+    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
+  if (cond_fn_p)
+    {
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
+		  || code == IFN_COND_MUL || code == IFN_COND_AND
+		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
+      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+    }
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8254,14 +8334,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8301,7 +8387,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8314,10 +8400,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p)
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (code.is_internal_fn () && cond_fn_p)
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f1d0cd79961..e22067400af 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */
  
Tamar Christina Oct. 5, 2023, 2:15 p.m. UTC | #9
Hi Robin,

> -----Original Message-----
> From: Robin Dapp <rdapp.gcc@gmail.com>
> Sent: Thursday, October 5, 2023 3:06 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches <gcc-
> patches@gcc.gnu.org>; Richard Biener <rguenther@suse.de>
> Cc: rdapp.gcc@gmail.com
> Subject: Re: [PATCH] ifcvt/vect: Emit COND_ADD for conditional scalar
> reduction.
> 
> Hi Tamar,
> 
> > The only comment I have is whether you actually need this helper
> > function? It looks like all the uses of it are in cases you have, or
> > will call conditional_internal_fn_code directly.
> removed the cond_fn_p entirely in the attached v3.
> 
> Bootstrapped and regtested on x86_64, aarch64 and power10.
> 

Changes look good to me thanks! I'll leave it up to Richi for final approval.

Regards,
Tamar

> Regards
>  Robin
> 
> Subject: [PATCH v3] ifcvt/vect: Emit COND_ADD for conditional scalar
> reduction.
> 
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both into a
> masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS is
> true.
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 150 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  5 files changed, 451 insertions(+), 44 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone)) reduc_plus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone)) reduc_minus_double (double
> +*restrict a, double init, int *cond, int n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int
> +n) {
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..8f1cb0d68de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n) {
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..b334173794d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
> 
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
> 
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> 
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
> 
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..3c80029eec5 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
> 
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
> 
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
> 
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
> 
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
> 
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
> 
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
> 
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
> 
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6946,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
> 
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
> 
> @@ -6939,13 +6988,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
> 
>        /* Handle MINUS by adding the negative.  */
> @@ -6957,7 +7009,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
> 
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
> 
> @@ -6988,8 +7041,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7493,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
> 
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7698,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
> 
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7745,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8280,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
> 
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8299,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
> 
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> 
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
> 
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8334,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
> 
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8387,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
> 
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8400,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
> 
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
> --
> 2.41.0
  
Richard Biener Oct. 6, 2023, 9:10 a.m. UTC | #10
On Thu, 5 Oct 2023, Robin Dapp wrote:

> Hi Tamar,
> 
> > The only comment I have is whether you actually need this helper
> > function? It looks like all the uses of it are in cases you have, or
> > will call conditional_internal_fn_code directly.
> removed the cond_fn_p entirely in the attached v3.
> 
> Bootstrapped and regtested on x86_64, aarch64 and power10.

Looks good - I only have one question, see below ...

> Regards
>  Robin
> 
> Subject: [PATCH v3] ifcvt/vect: Emit COND_ADD for conditional scalar
>  reduction.
> 
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 150 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  5 files changed, 451 insertions(+), 44 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..8f1cb0d68de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..b334173794d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
>  
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..3c80029eec5 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)

so this handles MINUS_EXPR, but ...

>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);

... here we probably get PLUS_EXPR for MINUS_EXPR above but IIRC
for MINUS_EXPR the !as_initial case should return positive zero.

Can you double-check?

Otherwise OK.

Thanks,
Richard.

>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
>  
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6946,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -6939,13 +6988,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -6957,7 +7009,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -6988,8 +7041,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7493,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7698,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7745,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8280,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8299,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8334,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8387,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8400,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
>
  
Robin Dapp Oct. 6, 2023, 12:28 p.m. UTC | #11
> ... here we probably get PLUS_EXPR for MINUS_EXPR above but IIRC
> for MINUS_EXPR the !as_initial case should return positive zero.
> 
> Can you double-check?

You're referring to the canonicalization from a - CST to a + -CST so
that the neutral op would need to change with it?  Argh, good point.

From what I can tell the only difference for MINUS_EXPR is that we
negate the reduction operand and then just continue as if it were
a PLUS_EXPR (which is the right thing to do also for +-0.0?).
At least I didn't observe a canonicalization and we don't call
neutral_op_for_reduction in between.

What we do have, though, is for the fully-masked case (you added
that recently):

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      vector_identity = build_zero_cst (vectype_out);
      if (!HONOR_SIGNED_ZEROS (vectype_out))
	;
      else
	{
	  gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
	  vector_identity = const_unop (NEGATE_EXPR, vectype_out,
					vector_identity);
	}
    }

So for

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);

We might need a similar assert

	  gcc_assert (HONOR_SIGNED_ZEROS (vectype_out)
                      && !HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));?

Apart from that the only call with !as_initial is in
vect_create_epilog_for_reduction.  I just instrumented it with an
assert (false) but i386.exp doesn't trigger it at all. 

Regards
 Robin
  
Robin Dapp Oct. 6, 2023, 12:30 p.m. UTC | #12
> We might need a similar assert
> 
> 	  gcc_assert (HONOR_SIGNED_ZEROS (vectype_out)
>                       && !HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));?

erm, obviously not that exact assert but more something like

if (HONOR_SIGNED_ZEROS && !HONOR_SIGN_DEPENDENT_ROUNDING...)
  {
    if (dump)
      ...
    return false;
  }

or so.

Regards
 Robin
  
Richard Biener Oct. 6, 2023, 1:43 p.m. UTC | #13
On Fri, 6 Oct 2023, Robin Dapp wrote:

> > We might need a similar assert
> > 
> > 	  gcc_assert (HONOR_SIGNED_ZEROS (vectype_out)
> >                       && !HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));?
> 
> erm, obviously not that exact assert but more something like
> 
> if (HONOR_SIGNED_ZEROS && !HONOR_SIGN_DEPENDENT_ROUNDING...)
>   {
>     if (dump)
>       ...
>     return false;
>   }
> 
> or so.

Yeah, of course the whole point of a fold-left reduction is to
_not_ give up without -ffast-math which is why I added the above.
I obviously didn't fully verify what happens for an original
MINUS_EXPR.  I think it's required to give up for -frounding-math,
but I think I might have put the code to do that in a generic
enough place.

For x86 you need --param vect-partial-vector-usage=2 and an
AVX512 enabled arch like -march=skylake-avx512 or -march=znver4.

I think transforming - x to + (-x) works for signed zeros.

So if you think you got everything correct the patch is OK as-is,
I just wasn't sure - maybe the neutral_element change deserves
a comment as to how MINUS_EXPR is handled.

Richard.
  
Robin Dapp Oct. 6, 2023, 8:54 p.m. UTC | #14
> So if you think you got everything correct the patch is OK as-is,
> I just wasn't sure - maybe the neutral_element change deserves
> a comment as to how MINUS_EXPR is handled.

Heh, I never think I got everything correct ;)

Added this now:

 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
+  /* We support MINUS_EXPR by negating the operand.  This also preserves an
+     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
+     (-0.0) = -0.0.  */

What I still found is that aarch64 ICEs at the assertion you added
with -frounding-math.  Therefore I changed it to:

-         gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
+         if (HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "cannot vectorize fold-left reduction because"
+                                " signed zeros cannot be preserved.\n");
+             return false;
+           }

No code changes apart from that.  Will leave it until Monday and push then
barring any objections.

Thanks for the pointers.

Regards
 Robin
  
Richard Sandiford Oct. 9, 2023, 5:50 a.m. UTC | #15
Robin Dapp <rdapp.gcc@gmail.com> writes:
> Hi Tamar,
>
>> The only comment I have is whether you actually need this helper
>> function? It looks like all the uses of it are in cases you have, or
>> will call conditional_internal_fn_code directly.
> removed the cond_fn_p entirely in the attached v3.
>
> Bootstrapped and regtested on x86_64, aarch64 and power10.
>
> Regards
>  Robin
>
> Subject: [PATCH v3] ifcvt/vect: Emit COND_ADD for conditional scalar
>  reduction.
>
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
>
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
>
> gcc/ChangeLog:
>
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.

The patch LGTM too FWIW, except...

> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 150 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  5 files changed, 451 insertions(+), 44 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..8f1cb0d68de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ], a7[SZ], a8[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344, init7 = -2, init8 = 854893;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..b334173794d 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */

It'd be good to expand on this comment a bit.  What kind of COND are you
anticipating?  A COND with the neutral op as the else value, so that the
PLUS_EXPR (or whatever) can remain unconditional?  If so, it would be
good to sketch briefly how that happens, and why it's better than using
the conditional PLUS_EXPR.

If that's the reason, perhaps we want a single-use check as well.
It's possible that OP1 is used elsewhere in the loop body, in a
context that would prefer a different else value.

Thanks,
Richard

> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
>  
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +					      (gimple_call_lhs (opstmt))))
> +	try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..3c80029eec5 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6053,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
>  
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6946,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -6939,13 +6988,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -6957,7 +7009,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -6988,8 +7041,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7493,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7698,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7745,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8213,6 +8280,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8299,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8334,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8387,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8400,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
  
Richard Biener Oct. 9, 2023, 8:25 a.m. UTC | #16
On Fri, 6 Oct 2023, Robin Dapp wrote:

> > So if you think you got everything correct the patch is OK as-is,
> > I just wasn't sure - maybe the neutral_element change deserves
> > a comment as to how MINUS_EXPR is handled.
> 
> Heh, I never think I got everything correct ;)
> 
> Added this now:
> 
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> +     (-0.0) = -0.0.  */
> 
> What I still found is that aarch64 ICEs at the assertion you added
> with -frounding-math.  Therefore I changed it to:
> 
> -         gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
> +         if (HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "cannot vectorize fold-left reduction because"
> +                                " signed zeros cannot be preserved.\n");
> +             return false;
> +           }
> 
> No code changes apart from that.  Will leave it until Monday and push then
> barring any objections.

Hmm, the function is called at transform time so this shouldn't help
avoiding the ICE.  I expected we refuse to vectorize _any_ reduction
when sign dependent rounding is in effect?  OTOH maybe sign-dependent
rounding is OK but only when we use an unconditional fold-left
(so a loop mask from fully masking is OK but not an original COND_ADD?).

Still the check should be done in vectorizable_reduction, not only
during transform (there the assert is proper, if we can distinguish
the loop mask vs. the COND_ADD here, otherwise just remove it).

Richard.


> Thanks for the pointers.
> 
> Regards
>  Robin
> 
>
  
Robin Dapp Oct. 9, 2023, 12:02 p.m. UTC | #17
> It'd be good to expand on this comment a bit.  What kind of COND are you
> anticipating?  A COND with the neutral op as the else value, so that the
> PLUS_EXPR (or whatever) can remain unconditional?  If so, it would be
> good to sketch briefly how that happens, and why it's better than using
> the conditional PLUS_EXPR.
> 
> If that's the reason, perhaps we want a single-use check as well.
> It's possible that OP1 is used elsewhere in the loop body, in a
> context that would prefer a different else value.

Would something like the following on top work?

-  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
-     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
+  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
+     The COND_OP will have a neutral_op else value.
+
+     This allows re-using the mask directly in a masked reduction instead
+     of creating a vector merge (or similar) and then an unmasked reduction.
+
+     Don't do this if the reduction def operand itself is
      a vectorizable call as we can create a COND version of it directly.  */

   if (ifn != IFN_LAST
       && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
-      && try_cond_op && !swap)
+      && use_cond_op && !swap && has_single_use (op1))

Regards
 Robin
  
Robin Dapp Oct. 9, 2023, 12:54 p.m. UTC | #18
> Hmm, the function is called at transform time so this shouldn't help
> avoiding the ICE.  I expected we refuse to vectorize _any_ reduction
> when sign dependent rounding is in effect?  OTOH maybe sign-dependent
> rounding is OK but only when we use an unconditional fold-left
> (so a loop mask from fully masking is OK but not an original COND_ADD?).

So we currently only disable the use of partial vectors

      else if (reduction_type == FOLD_LEFT_REDUCTION
	       && reduc_fn == IFN_LAST
	       && FLOAT_TYPE_P (vectype_in)
	       && HONOR_SIGNED_ZEROS (vectype_in)
	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "can't operate on partial vectors because"
			     " signed zeros cannot be preserved.\n");
	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;

which is inside a LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P block.

For the fully masked case we continue (and then fail the assertion
on aarch64 at transform time).

I didn't get why that case is ok, though?  We still merge the initial
definition with the identity/neutral op (i.e. possibly -0.0) based on
the loop mask.  Is that different to partial masking?

Regards
 Robin
  
Richard Biener Oct. 9, 2023, 1:05 p.m. UTC | #19
On Mon, 9 Oct 2023, Robin Dapp wrote:

> > Hmm, the function is called at transform time so this shouldn't help
> > avoiding the ICE.  I expected we refuse to vectorize _any_ reduction
> > when sign dependent rounding is in effect?  OTOH maybe sign-dependent
> > rounding is OK but only when we use an unconditional fold-left
> > (so a loop mask from fully masking is OK but not an original COND_ADD?).
> 
> So we currently only disable the use of partial vectors
> 
>       else if (reduction_type == FOLD_LEFT_REDUCTION
> 	       && reduc_fn == IFN_LAST

aarch64 probably chokes because reduc_fn is not IFN_LAST.

> 	       && FLOAT_TYPE_P (vectype_in)
> 	       && HONOR_SIGNED_ZEROS (vectype_in)

so with your change we'd support signed zeros correctly.

> 	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
> 	{
> 	  if (dump_enabled_p ())
> 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> 			     "can't operate on partial vectors because"
> 			     " signed zeros cannot be preserved.\n");
> 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> 
> which is inside a LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P block.
> 
> For the fully masked case we continue (and then fail the assertion
> on aarch64 at transform time).
> 
> I didn't get why that case is ok, though?  We still merge the initial
> definition with the identity/neutral op (i.e. possibly -0.0) based on
> the loop mask.  Is that different to partial masking?

I think the main point with my earlier change is that without
native support for a fold-left reduction (like on x86) we get

 ops = mask ? ops : neutral;
 acc += ops[0];
 acc += ops[1];
 ...

so we wouldn't use a COND_ADD but add neutral elements for masked
elements.  That's OK for signed zeros after your change (great)
but not OK for sign dependent rounding (because we can't decide on
the sign of the neutral zero then).

For the case of using an internal function, thus direct target support,
it should be OK to have sign-dependent rounding if we can use
the masked-fold-left reduction op.  As we do

      /* On the first iteration the input is simply the scalar phi
         result, and for subsequent iterations it is the output of
         the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
        {
          if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
            new_stmt = gimple_build_call_internal (mask_reduc_fn, 5,
                                                   reduc_var, def0, mask,
                                                   len, bias);
          else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
            new_stmt = gimple_build_call_internal (mask_reduc_fn, 3,
                                                   reduc_var, def0, mask);
          else
            new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
                                                   def0);

the last case should be able to assert that 
!HONOR_SIGN_DEPENDENT_ROUNDING (also the reduc_fn == IFN_LAST case).

The quoted condition above should change to drop the HONOR_SIGNED_ZEROS
condition and the reduc_fn == IFN_LAST should change, maybe to
internal_fn_mask_index (reduc_fn) == -1?

Richard.
  
Richard Sandiford Oct. 9, 2023, 2:57 p.m. UTC | #20
Robin Dapp <rdapp.gcc@gmail.com> writes:
>> It'd be good to expand on this comment a bit.  What kind of COND are you
>> anticipating?  A COND with the neutral op as the else value, so that the
>> PLUS_EXPR (or whatever) can remain unconditional?  If so, it would be
>> good to sketch briefly how that happens, and why it's better than using
>> the conditional PLUS_EXPR.
>> 
>> If that's the reason, perhaps we want a single-use check as well.
>> It's possible that OP1 is used elsewhere in the loop body, in a
>> context that would prefer a different else value.
>
> Would something like the following on top work?
>
> -  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> -     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> +     The COND_OP will have a neutral_op else value.
> +
> +     This allows re-using the mask directly in a masked reduction instead
> +     of creating a vector merge (or similar) and then an unmasked reduction.
> +
> +     Don't do this if the reduction def operand itself is
>       a vectorizable call as we can create a COND version of it directly.  */

It wasn't very clear, sorry, but it was the last sentence I was asking
for clarification on, not the other bits.  Why do we want to avoid
generating a COND_ADD when the operand is a vectorisable call?

Thanks,
Richard

>
>    if (ifn != IFN_LAST
>        && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> -      && try_cond_op && !swap)
> +      && use_cond_op && !swap && has_single_use (op1))
>
> Regards
>  Robin
  
Robin Dapp Oct. 11, 2023, 7:15 p.m. UTC | #21
> It wasn't very clear, sorry, but it was the last sentence I was asking
> for clarification on, not the other bits.  Why do we want to avoid
> generating a COND_ADD when the operand is a vectorisable call?

Ah, I see, apologies.  Upon thinking about it a bit more (thanks)
I figured this hunk is not necessary.  I added it early in the process
in order to keep the current behavior for situations like the following:

 before:
 _1 = .FMA (...)
 _2 = COND (cond, .FMA, 0.0)
 _3 = COND_ADD (true, result, _2, result)

 This we would simplify to:
 _2 = COND_FMA (cond, ...)
 _3 = COND_ADD (true, result, _2, result)

 with the patch we have:
 _1 = .FMA (...)
 _2 = .COND_ADD (cond, arg1, _1, arg1)

Due to differences in expansion we'd end up with a masked
vfmacc ("a = a + b * c") before and now emit an unmasked
vfmadd ("a = a * b + c") and a masked result add.  This shouldn't
be worse from a vector spec point of view, so I just changed the
test expectation for now.

The attached v4 also includes Richi's suggestion for the HONOR...
stuff.

Bootstrap and regtest unchanged on aarch64, x86 and power10.

Regards
 Robin


From 1752507ce22c22b50b96f889dc0a9c2fc8e50859 Mon Sep 17 00:00:00 2001
From: Robin Dapp <rdapp@ventanamicro.com>
Date: Wed, 13 Sep 2023 22:19:35 +0200
Subject: [PATCH v4] ifcvt/vect: Emit COND_ADD for conditional scalar
 reduction.

As described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD during ifcvt and
adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

gcc/ChangeLog:

	PR middle-end/111401
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(vect_is_simple_reduction): Don't count else operand in
	COND_ADD.
	(vect_create_epilog_for_reduction): Fix whitespace.
	(vectorize_fold_left_reduction): Add COND_ADD handling.
	(vectorizable_reduction): Don't count else operand in COND_ADD.
	(vect_transform_reduction): Add COND_ADD handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
---
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
 .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
 .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
 gcc/tree-if-conv.cc                           |  49 ++++--
 gcc/tree-vect-loop.cc                         | 156 ++++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 7 files changed, 446 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..7b46e7d8a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone))
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      cond1[i] = 0;
+      cond2[i] = i & 4 ? 1 : 0;
+      asm volatile ("" ::: "memory");
+    }
+
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..83dbd61b3f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init += a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init -= a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo4 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init *= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo5 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init &= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo6 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init |= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo7 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init ^= a[i];
+  return init;
+}
+
+#define SZ 125
+
+int
+main ()
+{
+  double res1 = 0, res2 = 0, res3 = 0;
+  double a1[SZ], a2[SZ], a3[SZ];
+  int c1[SZ], c2[SZ], c3[SZ];
+
+  int a4[SZ], a5[SZ], a6[SZ];
+  int res4 = 0, res5 = 0, res6 = 0;
+  int c4[SZ], c5[SZ], c6[SZ];
+
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);
+      a2[i] = i * 3 + (i & 4) - (i & 7);
+      a3[i] = i * 0.05 + (i & 4) - (i & 7);
+      a4[i] = i * 3 + (i & 4) - (i & 7);
+      a5[i] = i * 3 + (i & 4) - (i & 7);
+      a6[i] = i * 3 + (i & 4) - (i & 7);
+      c1[i] = i & 1;
+      c2[i] = i & 2;
+      c3[i] = i & 3;
+      c4[i] = i & 4;
+      c5[i] = i & 5;
+      c6[i] = i & 6;
+      __asm__ volatile ("" : : : "memory");
+    }
+
+  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
+  double ref1 = init1, ref2 = init2, ref3 = init3;
+
+  int init4 = 87, init5 = 11, init6 = -123894344;
+  int ref4 = init4, ref5 = init5, ref6 = init6;
+
+#pragma GCC novector
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+      if (c3[i])
+        ref3 *= a3[i];
+      if (c4[i])
+        ref4 &= a4[i];
+      if (c5[i])
+        ref5 |= a5[i];
+      if (c6[i])
+        ref6 ^= a6[i];
+    }
+
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  res3 = foo4 (a3, init3, c3, SZ);
+  res4 = foo5 (a4, init4, c4, SZ);
+  res5 = foo6 (a5, init5, c5, SZ);
+  res6 = foo7 (a6, init6, c6, SZ);
+
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+  if (res3 != ref3)
+    __builtin_abort ();
+  if (res4 != ref4)
+    __builtin_abort ();
+  if (res5 != ref5)
+    __builtin_abort ();
+  if (res6 != ref6)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
index cc07a047cd5..7be22d60bf2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
index 6d00c404d2a..83beabeff97 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
+/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index a8c915913ae..462b5aab716 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1864,19 +1866,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
-
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
+     The COND_OP will have a neutral_op else value.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2241,7 +2260,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 23c6e8259e7..ee425461aa3 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3672,7 +3672,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  /* We support MINUS_EXPR by negating the operand.  This also preserves an
+     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
+     (-0.0) = -0.0.  */
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3751,23 +3754,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is true the result is meant to be used as an initial value;
+   in that case a negative zero (-0.0) is never returned.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4106,8 +4115,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one.  */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6041,7 +6056,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
-  
+
   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.truncate (0);
@@ -6378,7 +6393,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6860,8 +6875,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6877,17 +6892,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (!code.is_tree_code ())
+    {
+      code = conditional_internal_fn_code (internal_fn (code));
+      gcc_assert (code != ERROR_MARK);
+      is_cond_op = true;
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-			  TYPE_VECTOR_SUBPARTS (vectype_in)));
+    {
+      if (is_cond_op)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "fold-left reduction on SLP not supported.\n");
+	  return false;
+	}
 
-  tree op0 = ops[1 - reduc_index];
+      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+			    TYPE_VECTOR_SUBPARTS (vectype_in)));
+    }
+
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6903,9 +6949,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = gimple_get_lhs (sdef);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -6939,13 +6991,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
@@ -6957,7 +7012,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -6988,8 +7044,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7440,6 +7496,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
          leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
@@ -7640,6 +7701,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If-conversion might have created a conditional operation like
+     IFN_COND_ADD already.  Use its tree code for the following checks.  */
+  if (orig_code.is_internal_fn ())
+    {
+      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
+      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
+    }
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7678,7 +7748,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8062,9 +8132,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	}
       else if (reduction_type == FOLD_LEFT_REDUCTION
-	       && reduc_fn == IFN_LAST
+	       && internal_fn_mask_index (reduc_fn) == -1
 	       && FLOAT_TYPE_P (vectype_in)
-	       && HONOR_SIGNED_ZEROS (vectype_in)
 	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
 	{
 	  if (dump_enabled_p ())
@@ -8213,6 +8282,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8231,17 +8301,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A binary COND_OP reduction must have the same definition and else
+     value.  */
+  bool cond_fn_p = code.is_internal_fn ()
+    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
+  if (cond_fn_p)
+    {
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
+		  || code == IFN_COND_MUL || code == IFN_COND_AND
+		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
+      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+    }
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8254,14 +8336,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8301,7 +8389,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8314,10 +8402,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p)
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (code.is_internal_fn () && cond_fn_p)
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f1d0cd79961..e22067400af 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */
  
Richard Sandiford Oct. 12, 2023, 10:47 a.m. UTC | #22
Robin Dapp <rdapp.gcc@gmail.com> writes:
>> It wasn't very clear, sorry, but it was the last sentence I was asking
>> for clarification on, not the other bits.  Why do we want to avoid
>> generating a COND_ADD when the operand is a vectorisable call?
>
> Ah, I see, apologies.  Upon thinking about it a bit more (thanks)
> I figured this hunk is not necessary.  I added it early in the process
> in order to keep the current behavior for situations like the following:
>
>  before:
>  _1 = .FMA (...)
>  _2 = COND (cond, .FMA, 0.0)
>  _3 = COND_ADD (true, result, _2, result)
>
>  This we would simplify to:
>  _2 = COND_FMA (cond, ...)
>  _3 = COND_ADD (true, result, _2, result)
>
>  with the patch we have:
>  _1 = .FMA (...)
>  _2 = .COND_ADD (cond, arg1, _1, arg1)
>
> Due to differences in expansion we'd end up with a masked
> vfmacc ("a += b * c") before and now emit an unmasked
> vfmadd ("a = a * b + c") and a masked result add.  This shouldn't
> be worse from a vector spec point of view, so I just changed the
> test expectation for now.

Thanks, sounds good.

> The attached v4 also includes Richi's suggestion for the HONOR...
> stuff.
>
> Bootstrap and regtest unchanged on aarch64, x86 and power10.

I'm reluctant to comment on the signed zeros/MINUS_EXPR parts,
but FWIW, the rest looks good to me.

Thanks,
Richard

>
> Regards
>  Robin
>
>
> From 1752507ce22c22b50b96f889dc0a9c2fc8e50859 Mon Sep 17 00:00:00 2001
> From: Robin Dapp <rdapp@ventanamicro.com>
> Date: Wed, 13 Sep 2023 22:19:35 +0200
> Subject: [PATCH v4] ifcvt/vect: Emit COND_ADD for conditional scalar
>  reduction.
>
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
>
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
>
> gcc/ChangeLog:
>
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
>  .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
>  .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
>  gcc/tree-if-conv.cc                           |  49 ++++--
>  gcc/tree-vect-loop.cc                         | 156 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 446 insertions(+), 49 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..83dbd61b3f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> index cc07a047cd5..7be22d60bf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> index 6d00c404d2a..83beabeff97 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..462b5aab716 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> -
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> +     The COND_OP will have a neutral_op else value.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2241,7 +2260,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..ee425461aa3 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> +     (-0.0) = -0.0.  */
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3754,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4115,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6056,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6393,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6875,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6892,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
>  
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6949,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -6939,13 +6991,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -6957,7 +7012,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -6988,8 +7044,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7496,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7701,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If-conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7748,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8062,9 +8132,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  	}
>        else if (reduction_type == FOLD_LEFT_REDUCTION
> -	       && reduc_fn == IFN_LAST
> +	       && internal_fn_mask_index (reduc_fn) == -1
>  	       && FLOAT_TYPE_P (vectype_in)
> -	       && HONOR_SIGNED_ZEROS (vectype_in)
>  	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
>  	{
>  	  if (dump_enabled_p ())
> @@ -8213,6 +8282,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8301,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8336,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8389,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8402,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
  
Richard Biener Oct. 12, 2023, 11:11 a.m. UTC | #23
On Wed, 11 Oct 2023, Robin Dapp wrote:

> > It wasn't very clear, sorry, but it was the last sentence I was asking
> > for clarification on, not the other bits.  Why do we want to avoid
> > generating a COND_ADD when the operand is a vectorisable call?
> 
> Ah, I see, apologies.  Upon thinking about it a bit more (thanks)
> I figured this hunk is not necessary.  I added it early in the process
> in order to keep the current behavior for situations like the following:
> 
>  before:
>  _1 = .FMA (...)
>  _2 = COND (cond, .FMA, 0.0)
>  _3 = COND_ADD (true, result, _2, result)
> 
>  This we would simplify to:
>  _2 = COND_FMA (cond, ...)
>  _3 = COND_ADD (true, result, _2, result)
> 
>  with the patch we have:
>  _1 = .FMA (...)
>  _2 = .COND_ADD (cond, arg1, _1, arg1)
> 
> Due to differences in expansion we'd end up with a masked
> vfmacc ("a += b * c") before and now emit an unmasked
> vfmadd ("a = a * b + c") and a masked result add.  This shouldn't
> be worse from a vector spec point of view, so I just changed the
> test expectation for now.
> 
> The attached v4 also includes Richi's suggestion for the HONOR...
> stuff.
> 
> Bootstrap and regtest unchanged on aarch64, x86 and power10.

OK

Thanks,
Richard.

> Regards
>  Robin
> 
> 
> From 1752507ce22c22b50b96f889dc0a9c2fc8e50859 Mon Sep 17 00:00:00 2001
> From: Robin Dapp <rdapp@ventanamicro.com>
> Date: Wed, 13 Sep 2023 22:19:35 +0200
> Subject: [PATCH v4] ifcvt/vect: Emit COND_ADD for conditional scalar
>  reduction.
> 
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(vect_is_simple_reduction): Don't count else operand in
> 	COND_ADD.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_ADD handling.
> 	(vectorizable_reduction): Don't count else operand in COND_ADD.
> 	(vect_transform_reduction): Add COND_ADD handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 ++++++++++++++++
>  .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
>  .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
>  gcc/tree-if-conv.cc                           |  49 ++++--
>  gcc/tree-vect-loop.cc                         | 156 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 446 insertions(+), 49 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..83dbd61b3f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> index cc07a047cd5..7be22d60bf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> index 6d00c404d2a..83beabeff97 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index a8c915913ae..462b5aab716 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> -
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> +     The COND_OP will have a neutral_op else value.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2241,7 +2260,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..ee425461aa3 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> +     (-0.0) = -0.0.  */
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3754,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4106,8 +4115,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6041,7 +6056,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6378,7 +6393,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6875,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6877,17 +6892,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
>  
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6949,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -6939,13 +6991,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -6957,7 +7012,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -6988,8 +7044,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7440,6 +7496,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7701,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7748,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8062,9 +8132,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  	}
>        else if (reduction_type == FOLD_LEFT_REDUCTION
> -	       && reduc_fn == IFN_LAST
> +	       && internal_fn_mask_index (reduc_fn) == -1
>  	       && FLOAT_TYPE_P (vectype_in)
> -	       && HONOR_SIGNED_ZEROS (vectype_in)
>  	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
>  	{
>  	  if (dump_enabled_p ())
> @@ -8213,6 +8282,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8301,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8336,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8389,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8402,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
>
  
Robin Dapp Oct. 19, 2023, 8:07 p.m. UTC | #24
Ugh, I didn't push yet because with a rebased trunk I am
seeing different behavior for some riscv testcases.

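A reduction is not recognized because there is yet another
"double use" occurrence in check_reduction_path: in a COND_OP
(mask, op1, op2, op1) the reduction operand op1 appears twice,
once as definition and once as else value.  I guess it's
reasonable to loosen the restriction for conditional operations
here as well.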

The only change relative to v4 is therefore:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ebab1953b9c..64654a55e4c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -4085,7 +4094,15 @@ pop:
                || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
            cnt++;
-      if (cnt != 1)
+
+      bool cond_fn_p = op.code.is_internal_fn ()
+       && (conditional_internal_fn_code (internal_fn (*code))
+           != ERROR_MARK);
+
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+        op1 twice (once as definition, once as else) in the same operation.
+        Allow this.  */
+      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))

Bootstrapped and regtested again on x86, aarch64 and power10.
Testsuite on riscv unchanged.

Regards
 Robin

Subject: [PATCH v5] ifcvt/vect: Emit COND_OP for conditional scalar reduction.

As described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD/COND_OP during
ifcvt and adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

gcc/ChangeLog:

	PR middle-end/111401
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(check_reduction_path): Don't count else operand in COND_OP.
	(vect_is_simple_reduction): Ditto.
	(vect_create_epilog_for_reduction): Fix whitespace.
	(vectorize_fold_left_reduction): Add COND_OP handling.
	(vectorizable_reduction): Don't count else operand in COND_OP.
	(vect_transform_reduction): Add COND_OP handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
---
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 +++++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++++
 .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
 .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
 gcc/tree-if-conv.cc                           |  49 +++--
 gcc/tree-vect-loop.cc                         | 168 ++++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 7 files changed, 456 insertions(+), 51 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..7b46e7d8a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone))
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      cond1[i] = 0;
+      cond2[i] = i & 4 ? 1 : 0;
+      asm volatile ("" ::: "memory");
+    }
+
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..83dbd61b3f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init += a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init -= a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo4 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init *= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo5 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init &= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo6 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init |= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo7 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init ^= a[i];
+  return init;
+}
+
+#define SZ 125
+
+int
+main ()
+{
+  double res1 = 0, res2 = 0, res3 = 0;
+  double a1[SZ], a2[SZ], a3[SZ];
+  int c1[SZ], c2[SZ], c3[SZ];
+
+  int a4[SZ], a5[SZ], a6[SZ];
+  int res4 = 0, res5 = 0, res6 = 0;
+  int c4[SZ], c5[SZ], c6[SZ];
+
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);
+      a2[i] = i * 3 + (i & 4) - (i & 7);
+      a3[i] = i * 0.05 + (i & 4) - (i & 7);
+      a4[i] = i * 3 + (i & 4) - (i & 7);
+      a5[i] = i * 3 + (i & 4) - (i & 7);
+      a6[i] = i * 3 + (i & 4) - (i & 7);
+      c1[i] = i & 1;
+      c2[i] = i & 2;
+      c3[i] = i & 3;
+      c4[i] = i & 4;
+      c5[i] = i & 5;
+      c6[i] = i & 6;
+      __asm__ volatile ("" : : : "memory");
+    }
+
+  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
+  double ref1 = init1, ref2 = init2, ref3 = init3;
+
+  int init4 = 87, init5 = 11, init6 = -123894344;
+  int ref4 = init4, ref5 = init5, ref6 = init6;
+
+#pragma GCC novector
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+      if (c3[i])
+        ref3 *= a3[i];
+      if (c4[i])
+        ref4 &= a4[i];
+      if (c5[i])
+        ref5 |= a5[i];
+      if (c6[i])
+        ref6 ^= a6[i];
+    }
+
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  res3 = foo4 (a3, init3, c3, SZ);
+  res4 = foo5 (a4, init4, c4, SZ);
+  res5 = foo6 (a5, init5, c5, SZ);
+  res6 = foo7 (a6, init6, c6, SZ);
+
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+  if (res3 != ref3)
+    __builtin_abort ();
+  if (res4 != ref4)
+    __builtin_abort ();
+  if (res5 != ref5)
+    __builtin_abort ();
+  if (res6 != ref6)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
index cc07a047cd5..7be22d60bf2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
index 6d00c404d2a..83beabeff97 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
+/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index c381d14b801..9571351805c 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
-
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
+     The COND_OP uses OP0 as its else value, i.e. OP0 is kept if !COND.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ebab1953b9c..1c455701c73 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  /* We support MINUS_EXPR by negating the operand.  This also preserves an
+     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
+     (-0.0) = -0.0.  */
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
+   In that case no signed zero is returned.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4085,7 +4094,15 @@ pop:
 		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
 	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 	    cnt++;
-      if (cnt != 1)
+
+      bool cond_fn_p = op.code.is_internal_fn ()
+	&& (conditional_internal_fn_code (internal_fn (*code))
+	    != ERROR_MARK);
+
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Allow this.  */
+      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))
 	{
 	  fail = true;
 	  break;
@@ -4187,8 +4204,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one. */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6122,7 +6145,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
-  
+
   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.truncate (0);
@@ -6459,7 +6482,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6941,8 +6964,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6958,17 +6981,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (!code.is_tree_code ())
+    {
+      code = conditional_internal_fn_code (internal_fn (code));
+      gcc_assert (code != ERROR_MARK);
+      is_cond_op = true;
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-			  TYPE_VECTOR_SUBPARTS (vectype_in)));
+    {
+      if (is_cond_op)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "fold-left reduction on SLP not supported.\n");
+	  return false;
+	}
 
-  tree op0 = ops[1 - reduc_index];
+      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+			    TYPE_VECTOR_SUBPARTS (vectype_in)));
+    }
+
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6984,9 +7038,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = gimple_get_lhs (sdef);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -7020,13 +7080,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
@@ -7038,7 +7101,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -7069,8 +7133,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7521,8 +7585,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
-         leading to reduc_def.  */
+	 leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
 	return false;
 
@@ -7721,6 +7790,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If-conversion might have created a conditional operation like
+     IFN_COND_ADD already.  Use the internal code for the following checks.  */
+  if (orig_code.is_internal_fn ())
+    {
+      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
+      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
+    }
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7759,7 +7837,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8143,9 +8221,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	}
       else if (reduction_type == FOLD_LEFT_REDUCTION
-	       && reduc_fn == IFN_LAST
+	       && internal_fn_mask_index (reduc_fn) == -1
 	       && FLOAT_TYPE_P (vectype_in)
-	       && HONOR_SIGNED_ZEROS (vectype_in)
 	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
 	{
 	  if (dump_enabled_p ())
@@ -8294,6 +8371,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8312,17 +8390,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A binary COND_OP reduction must have the same definition and else
+     value. */
+  bool cond_fn_p = code.is_internal_fn ()
+    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
+  if (cond_fn_p)
+    {
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
+		  || code == IFN_COND_MUL || code == IFN_COND_AND
+		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
+      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+    }
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8335,14 +8425,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8382,7 +8478,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8395,10 +8491,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p)
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (code.is_internal_fn () && cond_fn_p)
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..254d172231d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */
  
Richard Biener Oct. 23, 2023, 10:53 a.m. UTC | #25
On Thu, 19 Oct 2023, Robin Dapp wrote:

> Ugh, I didn't push yet because with a rebased trunk I am
> seeing different behavior for some riscv testcases.
> 
> A reduction is not recognized because there is yet another
> "double use" occurrence in check_reduction_path.  I guess it's
> reasonable to loosen the restriction for conditional operations
> here as well.
> 
> The only change to v4 therefore is:
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ebab1953b9c..64654a55e4c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -4085,7 +4094,15 @@ pop:
>                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
>           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>             cnt++;
> -      if (cnt != 1)
> +
> +      bool cond_fn_p = op.code.is_internal_fn ()
> +       && (conditional_internal_fn_code (internal_fn (*code))
> +           != ERROR_MARK);
> +
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +        op1 twice (once as definition, once as else) in the same operation.
> +        Allow this.  */
> +      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))
> 
> Bootstrapped and regtested again on x86, aarch64 and power10.
> Testsuite on riscv unchanged.

Hmm, why opi == 1 only?  I think

# _1 = PHI <.., _4>
 _3 = .COND_ADD (_1, _2, _1);
 _4 = .COND_ADD (_3, _5, _3);

would be fine as well.  I think we want to simply ignore the 'else' value
of conditional internal functions.  I suppose we have unary, binary
and ternary conditional functions - I miss an internal_fn_else_index,
but I suppose it's always the last one?

I think a single use on .COND functions is also OK, even when on the
'else' value only?  But maybe that's not too important here.

Maybe

      gimple *op_use_stmt;
      unsigned cnt = 0;
      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
        if (.. op_use_stmt is conditional internal function ..)
          {
            for (unsigned j = 0; j < gimple_call_num_args (call) - 1; ++j)
              if (gimple_call_arg (call, j) == op.ops[opi])
                cnt++;
          }
        else if (!is_gimple_debug (op_use_stmt)
            && (*code != ERROR_MARK
                || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
            cnt++;

?

> Regards
>  Robin
> 
> Subject: [PATCH v5] ifcvt/vect: Emit COND_OP for conditional scalar reduction.
> 
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD/COND_OP during
> ifcvt and adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
> 
> gcc/ChangeLog:
> 
> 	PR middle-end/111401
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(check_reduction_path): Don't count else operand in COND_OP.
> 	(vect_is_simple_reduction): Ditto.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_OP handling.
> 	(vectorizable_reduction): Don't count else operand in COND_OP.
> 	(vect_transform_reduction): Add COND_OP handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
> ---
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 +++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++++
>  .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
>  .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
>  gcc/tree-if-conv.cc                           |  49 +++--
>  gcc/tree-vect-loop.cc                         | 168 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 456 insertions(+), 51 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..83dbd61b3f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> index cc07a047cd5..7be22d60bf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> index 6d00c404d2a..83beabeff97 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index c381d14b801..9571351805c 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> -
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> +     The COND_OP will have a neutral_op else value.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ebab1953b9c..1c455701c73 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> +     (-0.0) = -0.0.  */
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4085,7 +4094,15 @@ pop:
>  		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
>  	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>  	    cnt++;
> -      if (cnt != 1)
> +
> +      bool cond_fn_p = op.code.is_internal_fn ()
> +	&& (conditional_internal_fn_code (internal_fn (*code))
> +	    != ERROR_MARK);
> +
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Allow this.  */
> +      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))
>  	{
>  	  fail = true;
>  	  break;
> @@ -4187,8 +4204,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6122,7 +6145,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6459,7 +6482,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6941,8 +6964,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6958,17 +6981,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
>  
> -  tree op0 = ops[1 - reduc_index];
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
> +
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6984,9 +7038,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -7020,13 +7080,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -7038,7 +7101,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -7069,8 +7133,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7521,8 +7585,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
> -         leading to reduc_def.  */
> +	 leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
>  	return false;
>  
> @@ -7721,6 +7790,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If-conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7759,7 +7837,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8143,9 +8221,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  	}
>        else if (reduction_type == FOLD_LEFT_REDUCTION
> -	       && reduc_fn == IFN_LAST
> +	       && internal_fn_mask_index (reduc_fn) == -1
>  	       && FLOAT_TYPE_P (vectype_in)
> -	       && HONOR_SIGNED_ZEROS (vectype_in)
>  	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
>  	{
>  	  if (dump_enabled_p ())
> @@ -8294,6 +8371,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8312,17 +8390,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8335,14 +8425,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8382,7 +8478,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8395,10 +8491,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index a4043e4a656..254d172231d 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
>
  
Richard Sandiford Oct. 24, 2023, 11:11 a.m. UTC | #26
Richard Biener <rguenther@suse.de> writes:
> On Thu, 19 Oct 2023, Robin Dapp wrote:
>
>> Ugh, I didn't push yet because with a rebased trunk I am
>> seeing different behavior for some riscv testcases.
>> 
>> A reduction is not recognized because there is yet another
>> "double use" occurrence in check_reduction_path.  I guess it's
>> reasonable to loosen the restriction for conditional operations
>> here as well.
>> 
>> The only change to v4 therefore is:
>> 
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index ebab1953b9c..64654a55e4c 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -4085,7 +4094,15 @@ pop:
>>                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
>>           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>>             cnt++;
>> -      if (cnt != 1)
>> +
>> +      bool cond_fn_p = op.code.is_internal_fn ()
>> +       && (conditional_internal_fn_code (internal_fn (*code))
>> +           != ERROR_MARK);
>> +
>> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
>> +        op1 twice (once as definition, once as else) in the same operation.
>> +        Allow this.  */
>> +      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))
>> 
>> Bootstrapped and regtested again on x86, aarch64 and power10.
>> Testsuite on riscv unchanged.
>
> Hmm, why opi == 1 only?  I think
>
> # _1 = PHI <.., _4>
>  _3 = .COND_ADD (_1, _2, _1);
>  _4 = .COND_ADD (_3, _5, _3);
>
> would be fine as well.  I think we want to simply ignore the 'else' value
> of conditional internal functions.  I suppose we have unary, binary
> and ternary conditional functions - I miss an internal_fn_else_index,
> but I suppose it's always the last one?

Yeah, it was always the last one before the introduction of .COND_LEN.
I agree internal_fn_else_index would be useful now.

Thanks,
Richard

>
> I think a single use on .COND functions is also OK, even when on the
> 'else' value only?  But maybe that's not too important here.
>
> Maybe
>
>       gimple *op_use_stmt;
>       unsigned cnt = 0;
>       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
>         if (.. op_use_stmt is conditional internal function ..)
>           {
>             for (unsigned j = 0; j < gimple_call_num_args (call) - 1; ++j)
>               if (gimple_call_arg (call, j) == op.ops[opi])
>                 cnt++;
>           }
>         else if (!is_gimple_debug (op_use_stmt)
>             && (*code != ERROR_MARK
>                 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
>           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>             cnt++;
>
> ?
>
>> Regards
>>  Robin
>> 
>> Subject: [PATCH v5] ifcvt/vect: Emit COND_OP for conditional scalar reduction.
>> 
>> As described in PR111401 we currently emit a COND and a PLUS expression
>> for conditional reductions.  This makes it difficult to combine both
>> into a masked reduction statement later.
>> This patch improves that by directly emitting a COND_ADD/COND_OP during
>> ifcvt and adjusting some vectorizer code to handle it.
>> 
>> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
>> is true.
>> 
>> gcc/ChangeLog:
>> 
>> 	PR middle-end/111401
>> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
>> 	if supported.
>> 	(predicate_scalar_phi): Add whitespace.
>> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
>> 	(neutral_op_for_reduction): Return -0 for PLUS.
>> 	(check_reduction_path): Don't count else operand in COND_OP.
>> 	(vect_is_simple_reduction): Ditto.
>> 	(vect_create_epilog_for_reduction): Fix whitespace.
>> 	(vectorize_fold_left_reduction): Add COND_OP handling.
>> 	(vectorizable_reduction): Don't count else operand in COND_OP.
>> 	(vect_transform_reduction): Add COND_OP handling.
>> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
>> 	parameter.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
>> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
>> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
>> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
>> ---
>>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 +++++++++++++++
>>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++++
>>  .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
>>  .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
>>  gcc/tree-if-conv.cc                           |  49 +++--
>>  gcc/tree-vect-loop.cc                         | 168 ++++++++++++++----
>>  gcc/tree-vectorizer.h                         |   2 +-
>>  7 files changed, 456 insertions(+), 51 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>> 
>> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>> new file mode 100644
>> index 00000000000..7b46e7d8a2a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>> @@ -0,0 +1,141 @@
>> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
>> +/* { dg-do run } */
>> +/* { dg-require-effective-target vect_double } */
>> +/* { dg-add-options ieee } */
>> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
>> +
>> +#include "tree-vect.h"
>> +
>> +#include <math.h>
>> +
>> +#define N (VECTOR_BITS * 17)
>> +
>> +double __attribute__ ((noinline, noclone))
>> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
>> +{
>> +  double res = init;
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      res += a[i];
>> +  return res;
>> +}
>> +
>> +double __attribute__ ((noinline, noclone, optimize ("0")))
>> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
>> +{
>> +  double res = init;
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      res += a[i];
>> +  return res;
>> +}
>> +
>> +double __attribute__ ((noinline, noclone))
>> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
>> +{
>> +  double res = init;
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      res -= a[i];
>> +  return res;
>> +}
>> +
>> +double __attribute__ ((noinline, noclone, optimize ("0")))
>> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
>> +{
>> +  double res = init;
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      res -= a[i];
>> +  return res;
>> +}
>> +
>> +int __attribute__ ((optimize (1)))
>> +main ()
>> +{
>> +  int n = 19;
>> +  double a[N];
>> +  int cond1[N], cond2[N];
>> +
>> +  for (int i = 0; i < N; i++)
>> +    {
>> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
>> +      cond1[i] = 0;
>> +      cond2[i] = i & 4 ? 1 : 0;
>> +      asm volatile ("" ::: "memory");
>> +    }
>> +
>> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
>> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
>> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
>> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
>> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
>> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
>> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
>> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
>> +
>> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
>> +    __builtin_abort ();
>> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
>> +    __builtin_abort ();
>> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
>> +    __builtin_abort ();
>> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
>> +    __builtin_abort ();
>> +
>> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
>> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
>> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
>> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
>> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
>> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
>> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
>> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
>> +
>> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
>> +    __builtin_abort ();
>> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
>> +    __builtin_abort ();
>> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
>> +    __builtin_abort ();
>> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
>> +    __builtin_abort ();
>> +
>> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
>> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
>> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
>> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
>> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
>> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
>> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
>> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
>> +
>> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
>> +    __builtin_abort ();
>> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
>> +    __builtin_abort ();
>> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
>> +    __builtin_abort ();
>> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
>> +    __builtin_abort ();
>> +
>> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
>> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
>> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
>> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
>> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
>> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
>> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
>> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
>> +
>> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
>> +    __builtin_abort ();
>> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
>> +    __builtin_abort ();
>> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
>> +    __builtin_abort ();
>> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
>> +    __builtin_abort ();
>> +
>> +  return 0;
>> +}
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>> new file mode 100644
>> index 00000000000..83dbd61b3f3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>> @@ -0,0 +1,139 @@
>> +/* { dg-do run { target { riscv_v } } } */
>> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
>> +
>> +double
>> +__attribute__ ((noipa))
>> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init += a[i];
>> +  return init;
>> +}
>> +
>> +double
>> +__attribute__ ((noipa))
>> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init -= a[i];
>> +  return init;
>> +}
>> +
>> +double
>> +__attribute__ ((noipa))
>> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init *= a[i];
>> +  return init;
>> +}
>> +
>> +int
>> +__attribute__ ((noipa))
>> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init &= a[i];
>> +  return init;
>> +}
>> +
>> +int
>> +__attribute__ ((noipa))
>> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init |= a[i];
>> +  return init;
>> +}
>> +
>> +int
>> +__attribute__ ((noipa))
>> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
>> +{
>> +  for (int i = 0; i < n; i++)
>> +    if (cond[i])
>> +      init ^= a[i];
>> +  return init;
>> +}
>> +
>> +#define SZ 125
>> +
>> +int
>> +main ()
>> +{
>> +  double res1 = 0, res2 = 0, res3 = 0;
>> +  double a1[SZ], a2[SZ], a3[SZ];
>> +  int c1[SZ], c2[SZ], c3[SZ];
>> +
>> +  int a4[SZ], a5[SZ], a6[SZ];
>> +  int res4 = 0, res5 = 0, res6 = 0;
>> +  int c4[SZ], c5[SZ], c6[SZ];
>> +
>> +  for (int i = 0; i < SZ; i++)
>> +    {
>> +      a1[i] = i * 3 + (i & 4) - (i & 7);
>> +      a2[i] = i * 3 + (i & 4) - (i & 7);
>> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
>> +      a4[i] = i * 3 + (i & 4) - (i & 7);
>> +      a5[i] = i * 3 + (i & 4) - (i & 7);
>> +      a6[i] = i * 3 + (i & 4) - (i & 7);
>> +      c1[i] = i & 1;
>> +      c2[i] = i & 2;
>> +      c3[i] = i & 3;
>> +      c4[i] = i & 4;
>> +      c5[i] = i & 5;
>> +      c6[i] = i & 6;
>> +      __asm__ volatile ("" : : : "memory");
>> +    }
>> +
>> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
>> +  double ref1 = init1, ref2 = init2, ref3 = init3;
>> +
>> +  int init4 = 87, init5 = 11, init6 = -123894344;
>> +  int ref4 = init4, ref5 = init5, ref6 = init6;
>> +
>> +#pragma GCC novector
>> +  for (int i = 0; i < SZ; i++)
>> +    {
>> +      if (c1[i])
>> +        ref1 += a1[i];
>> +      if (c2[i])
>> +        ref2 -= a2[i];
>> +      if (c3[i])
>> +        ref3 *= a3[i];
>> +      if (c4[i])
>> +        ref4 &= a4[i];
>> +      if (c5[i])
>> +        ref5 |= a5[i];
>> +      if (c6[i])
>> +        ref6 ^= a6[i];
>> +    }
>> +
>> +  res1 = foo2 (a1, init1, c1, SZ);
>> +  res2 = foo3 (a2, init2, c2, SZ);
>> +  res3 = foo4 (a3, init3, c3, SZ);
>> +  res4 = foo5 (a4, init4, c4, SZ);
>> +  res5 = foo6 (a5, init5, c5, SZ);
>> +  res6 = foo7 (a6, init6, c6, SZ);
>> +
>> +  if (res1 != ref1)
>> +    __builtin_abort ();
>> +  if (res2 != ref2)
>> +    __builtin_abort ();
>> +  if (res3 != ref3)
>> +    __builtin_abort ();
>> +  if (res4 != ref4)
>> +    __builtin_abort ();
>> +  if (res5 != ref5)
>> +    __builtin_abort ();
>> +  if (res6 != ref6)
>> +    __builtin_abort ();
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
>> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
>> index cc07a047cd5..7be22d60bf2 100644
>> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
>> @@ -3,4 +3,6 @@
>>  
>>  #include "reduc_call-1.c"
>>  
>> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
>> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
>> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
>> +/* { dg-final { scan-assembler-not {vmerge} } } */
>> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
>> index 6d00c404d2a..83beabeff97 100644
>> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
>> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
>> @@ -3,4 +3,6 @@
>>  
>>  #include "reduc_call-1.c"
>>  
>> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
>> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
>> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
>> +/* { dg-final { scan-assembler-not {vmerge} } } */
>> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
>> index c381d14b801..9571351805c 100644
>> --- a/gcc/tree-if-conv.cc
>> +++ b/gcc/tree-if-conv.cc
>> @@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>>    gimple *new_assign;
>>    tree rhs;
>>    tree rhs1 = gimple_assign_rhs1 (reduc);
>> +  tree lhs = gimple_assign_lhs (reduc);
>>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>>    tree c;
>>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
>> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
>> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
>> +					       NULL, false);
>>    gimple_seq stmts = NULL;
>>  
>>    if (dump_file && (dump_flags & TDF_DETAILS))
>> @@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>>      }
>>  
>> -  /* Build cond expression using COND and constant operand
>> -     of reduction rhs.  */
>> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
>> -			    unshare_expr (cond),
>> -			    swap ? op_nochange : op1,
>> -			    swap ? op1 : op_nochange);
>> -
>> -  /* Create assignment stmt and insert it at GSI.  */
>> -  new_assign = gimple_build_assign (tmp, c);
>> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
>> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
>> -  rhs = gimple_build (&stmts, reduction_op,
>> -		      TREE_TYPE (rhs1), op0, tmp);
>> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
>> +     The COND_OP will have a neutral_op else value.  */
>> +  internal_fn ifn;
>> +  ifn = get_conditional_internal_fn (reduction_op);
>> +  if (ifn != IFN_LAST
>> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
>> +      && !swap)
>> +    {
>> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
>> +						     unshare_expr (cond),
>> +						     op0, op1, op0);
>> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
>> +      gimple_call_set_lhs (cond_call, tmp);
>> +      rhs = tmp;
>> +    }
>> +  else
>> +    {
>> +      /* Build cond expression using COND and constant operand
>> +	 of reduction rhs.  */
>> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
>> +				unshare_expr (cond),
>> +				swap ? op_nochange : op1,
>> +				swap ? op1 : op_nochange);
>> +      /* Create assignment stmt and insert it at GSI.  */
>> +      new_assign = gimple_build_assign (tmp, c);
>> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
>> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
>> +      rhs = gimple_build (&stmts, reduction_op,
>> +			  TREE_TYPE (rhs1), op0, tmp);
>> +    }
>>  
>>    if (has_nop)
>>      {
>> @@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>>  	{
>>  	  /* Convert reduction stmt into vectorizable form.  */
>>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
>> -					       swap,has_nop, nop_reduc);
>> +					       swap, has_nop, nop_reduc);
>>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>>  	}
>>        new_stmt = gimple_build_assign (res, rhs);
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index ebab1953b9c..1c455701c73 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>>  static bool
>>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>>  {
>> -  if (code == PLUS_EXPR)
>> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
>> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
>> +     (-0.0) = -0.0.  */
>> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>>      {
>>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>>        return true;
>> @@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>>     by the introduction of additional X elements, return that X, otherwise
>>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>>     of the scalar elements.  If the reduction has just a single initial value
>> -   then INITIAL_VALUE is that value, otherwise it is null.  */
>> +   then INITIAL_VALUE is that value, otherwise it is null.
>> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
>> +   In that case no signed zero is returned.  */
>>  
>>  tree
>>  neutral_op_for_reduction (tree scalar_type, code_helper code,
>> -			  tree initial_value)
>> +			  tree initial_value, bool as_initial)
>>  {
>>    if (code.is_tree_code ())
>>      switch (tree_code (code))
>>        {
>> -      case WIDEN_SUM_EXPR:
>>        case DOT_PROD_EXPR:
>>        case SAD_EXPR:
>> -      case PLUS_EXPR:
>>        case MINUS_EXPR:
>>        case BIT_IOR_EXPR:
>>        case BIT_XOR_EXPR:
>>  	return build_zero_cst (scalar_type);
>> +      case WIDEN_SUM_EXPR:
>> +      case PLUS_EXPR:
>> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
>> +	  return build_real (scalar_type, dconstm0);
>> +	else
>> +	  return build_zero_cst (scalar_type);
>>  
>>        case MULT_EXPR:
>>  	return build_one_cst (scalar_type);
>> @@ -4085,7 +4094,15 @@ pop:
>>  		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
>>  	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>>  	    cnt++;
>> -      if (cnt != 1)
>> +
>> +      bool cond_fn_p = op.code.is_internal_fn ()
>> +	&& (conditional_internal_fn_code (internal_fn (*code))
>> +	    != ERROR_MARK);
>> +
>> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
>> +	 op1 twice (once as definition, once as else) in the same operation.
>> +	 Allow this.  */
>> +      if ((!cond_fn_p && cnt != 1) || (opi == 1 && cond_fn_p && cnt != 2))
>>  	{
>>  	  fail = true;
>>  	  break;
>> @@ -4187,8 +4204,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>>            return NULL;
>>          }
>>  
>> -      nphi_def_loop_uses++;
>> -      phi_use_stmt = use_stmt;
>> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
>> +	 op1 twice (once as definition, once as else) in the same operation.
>> +	 Only count it as one. */
>> +      if (use_stmt != phi_use_stmt)
>> +	{
>> +	  nphi_def_loop_uses++;
>> +	  phi_use_stmt = use_stmt;
>> +	}
>>      }
>>  
>>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
>> @@ -6122,7 +6145,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>>      }
>> -  
>> +
>>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>>    scalar_type = TREE_TYPE (scalar_dest);
>>    scalar_results.truncate (0);
>> @@ -6459,7 +6482,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>>  	    initial_value = reduc_info->reduc_initial_values[0];
>>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
>> -						 initial_value);
>> +						 initial_value, false);
>>  	}
>>        if (neutral_op)
>>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
>> @@ -6941,8 +6964,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>  			       gimple_stmt_iterator *gsi,
>>  			       gimple **vec_stmt, slp_tree slp_node,
>>  			       gimple *reduc_def_stmt,
>> -			       tree_code code, internal_fn reduc_fn,
>> -			       tree ops[3], tree vectype_in,
>> +			       code_helper code, internal_fn reduc_fn,
>> +			       tree *ops, int num_ops, tree vectype_in,
>>  			       int reduc_index, vec_loop_masks *masks,
>>  			       vec_loop_lens *lens)
>>  {
>> @@ -6958,17 +6981,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>  
>>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>>    gcc_assert (ncopies == 1);
>> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
>> +
>> +  bool is_cond_op = false;
>> +  if (!code.is_tree_code ())
>> +    {
>> +      code = conditional_internal_fn_code (internal_fn (code));
>> +      gcc_assert (code != ERROR_MARK);
>> +      is_cond_op = true;
>> +    }
>> +
>> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>>  
>>    if (slp_node)
>> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
>> +    {
>> +      if (is_cond_op)
>> +	{
>> +	  if (dump_enabled_p ())
>> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +			     "fold-left reduction on SLP not supported.\n");
>> +	  return false;
>> +	}
>>  
>> -  tree op0 = ops[1 - reduc_index];
>> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
>> +    }
>> +
>> +  /* The operands either come from a binary operation or an IFN_COND operation.
>> +     The former is a gimple assign with binary rhs and the latter is a
>> +     gimple call with four arguments.  */
>> +  gcc_assert (num_ops == 2 || num_ops == 4);
>> +  tree op0, opmask;
>> +  if (!is_cond_op)
>> +    op0 = ops[1 - reduc_index];
>> +  else
>> +    {
>> +      op0 = ops[2];
>> +      opmask = ops[0];
>> +      gcc_assert (!slp_node);
>> +    }
>>  
>>    int group_size = 1;
>>    stmt_vec_info scalar_dest_def_info;
>> -  auto_vec<tree> vec_oprnds0;
>> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>>    if (slp_node)
>>      {
>>        auto_vec<vec<tree> > vec_defs (2);
>> @@ -6984,9 +7038,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>>  				     op0, &vec_oprnds0);
>>        scalar_dest_def_info = stmt_info;
>> +
>> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
>> +      if (is_cond_op)
>> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>> +					 opmask, &vec_opmask);
>>      }
>>  
>> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
>> +  gimple *sdef = scalar_dest_def_info->stmt;
>> +  tree scalar_dest = gimple_get_lhs (sdef);
>>    tree scalar_type = TREE_TYPE (scalar_dest);
>>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>>  
>> @@ -7020,13 +7080,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>        tree bias = NULL_TREE;
>>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
>> +      else if (is_cond_op)
>> +	mask = vec_opmask[0];
>>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>>  	{
>>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>>  				   i, 1);
>>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>>  	  bias = build_int_cst (intQI_type_node, biasval);
>> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
>> +	  if (!is_cond_op)
>> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>>  	}
>>  
>>        /* Handle MINUS by adding the negative.  */
>> @@ -7038,7 +7101,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>  	  def0 = negated;
>>  	}
>>  
>> -      if (mask && mask_reduc_fn == IFN_LAST)
>> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
>> +	  && mask && mask_reduc_fn == IFN_LAST)
>>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>>  				    vector_identity);
>>  
>> @@ -7069,8 +7133,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>>  	}
>>        else
>>  	{
>> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
>> -					     reduc_var, def0);
>> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
>> +					     tree_code (code), reduc_var, def0);
>>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>>  	  /* Remove the statement, so that we can use the same code paths
>>  	     as for statements that we've just created.  */
>> @@ -7521,8 +7585,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>>  	continue;
>>  
>> +      /* For an IFN_COND_OP we might hit the reduction definition operand
>> +	 twice (once as definition, once as else).  */
>> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
>> +	continue;
>> +
>>        /* There should be only one cycle def in the stmt, the one
>> -         leading to reduc_def.  */
>> +	 leading to reduc_def.  */
>>        if (VECTORIZABLE_CYCLE_DEF (dt))
>>  	return false;
>>  
>> @@ -7721,6 +7790,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>            when generating the code inside the loop.  */
>>  
>>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
>> +
>> +  /* If-conversion might have created a conditional operation like
>> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
>> +  if (orig_code.is_internal_fn ())
>> +    {
>> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
>> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
>> +    }
>> +
>>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>>  
>>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>> @@ -7759,7 +7837,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>  	{
>>  	  if (dump_enabled_p ())
>>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -			    "reduction: not commutative/associative");
>> +			    "reduction: not commutative/associative\n");
>>  	  return false;
>>  	}
>>      }
>> @@ -8143,9 +8221,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>>  	}
>>        else if (reduction_type == FOLD_LEFT_REDUCTION
>> -	       && reduc_fn == IFN_LAST
>> +	       && internal_fn_mask_index (reduc_fn) == -1
>>  	       && FLOAT_TYPE_P (vectype_in)
>> -	       && HONOR_SIGNED_ZEROS (vectype_in)
>>  	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
>>  	{
>>  	  if (dump_enabled_p ())
>> @@ -8294,6 +8371,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>  
>>    code_helper code = canonicalize_code (op.code, op.type);
>>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
>> +
>>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
>> @@ -8312,17 +8390,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>    if (code == COND_EXPR)
>>      gcc_assert (ncopies == 1);
>>  
>> +  /* A binary COND_OP reduction must have the same definition and else
>> +     value. */
>> +  bool cond_fn_p = code.is_internal_fn ()
>> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
>> +  if (cond_fn_p)
>> +    {
>> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
>> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
>> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
>> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
>> +    }
>> +
>>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>>  
>>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>>    if (reduction_type == FOLD_LEFT_REDUCTION)
>>      {
>>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
>> -      gcc_assert (code.is_tree_code ());
>> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>>        return vectorize_fold_left_reduction
>>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
>> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
>> -	   lens);
>> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
>> +	   reduc_index, masks, lens);
>>      }
>>  
>>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
>> @@ -8335,14 +8425,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>>  
>> +  /* Get NCOPIES vector definitions for all operands except the reduction
>> +     definition.  */
>>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>>  		     single_defuse_cycle && reduc_index == 0
>>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>>  		     single_defuse_cycle && reduc_index == 1
>>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
>> -		     op.num_ops == 3
>> -		     && !(single_defuse_cycle && reduc_index == 2)
>> +		     op.num_ops == 4
>> +		     || (op.num_ops == 3
>> +			 && !(single_defuse_cycle && reduc_index == 2))
>>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
>> +
>> +  /* For single def-use cycles get one copy of the vectorized reduction
>> +     definition.  */
>>    if (single_defuse_cycle)
>>      {
>>        gcc_assert (!slp_node);
>> @@ -8382,7 +8478,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>  	}
>>        else
>>  	{
>> -	  if (op.num_ops == 3)
>> +	  if (op.num_ops >= 3)
>>  	    vop[2] = vec_oprnds2[i];
>>  
>>  	  if (masked_loop_p && mask_by_cond_expr)
>> @@ -8395,10 +8491,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>>  	  if (emulated_mixed_dot_prod)
>>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>>  						    vec_dest, vop);
>> -	  else if (code.is_internal_fn ())
>> +
>> +	  else if (code.is_internal_fn () && !cond_fn_p)
>>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>>  						   op.num_ops,
>>  						   vop[0], vop[1], vop[2]);
>> +	  else if (code.is_internal_fn () && cond_fn_p)
>> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
>> +						   op.num_ops,
>> +						   vop[0], vop[1], vop[2],
>> +						   vop[1]);
>>  	  else
>>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>>  					    vop[0], vop[1], vop[2]);
>> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
>> index a4043e4a656..254d172231d 100644
>> --- a/gcc/tree-vectorizer.h
>> +++ b/gcc/tree-vectorizer.h
>> @@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>>  						  tree);
>>  
>>  /* In tree-vect-loop.cc.  */
>> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
>> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>>  /* Used in tree-vect-loop-manip.cc */
>>
  
Robin Dapp Oct. 24, 2023, 7:56 p.m. UTC | #27
Changed as suggested.  The difference to v5 is thus:

+	  if (cond_fn_p)
+	    {
+	      gcall *call = dyn_cast<gcall *> (use_stmt);
+	      unsigned else_pos
+		= internal_fn_else_index (internal_fn (op.code));
+
+	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
+		{
+		  if (j == else_pos)
+		    continue;
+		  if (gimple_call_arg (call, j) == op.ops[opi])
+		    cnt++;
+		}
+	    }
+	  else if (!is_gimple_debug (op_use_stmt)

as well as the newly added internal_fn_else_index function.

The testsuite on riscv is unchanged, and bootstrap and testsuite on power10 are
done; aarch64 and x86 are still running.

Regards
 Robin

From e11ac2b5889558c58ce711d8119ebcd78173ac6c Mon Sep 17 00:00:00 2001
From: Robin Dapp <rdapp@ventanamicro.com>
Date: Wed, 13 Sep 2023 22:19:35 +0200
Subject: [PATCH v6] ifcvt/vect: Emit COND_OP for conditional scalar reduction.

As described in PR111401 we currently emit a COND and a PLUS expression
for conditional reductions.  This makes it difficult to combine both
into a masked reduction statement later.
This patch improves that by directly emitting a COND_ADD/COND_OP during
ifcvt and adjusting some vectorizer code to handle it.

It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
is true.

gcc/ChangeLog:

	PR middle-end/111401
	* internal-fn.cc (internal_fn_else_index): New function.
	* internal-fn.h (internal_fn_else_index): Define.
	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
	if supported.
	(predicate_scalar_phi): Add whitespace.
	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
	(neutral_op_for_reduction): Return -0 for PLUS.
	(check_reduction_path): Don't count else operand in COND_OP.
	(vect_is_simple_reduction): Ditto.
	(vect_create_epilog_for_reduction): Fix whitespace.
	(vectorize_fold_left_reduction): Add COND_OP handling.
	(vectorizable_reduction): Don't count else operand in COND_OP.
	(vect_transform_reduction): Add COND_OP handling.
	* tree-vectorizer.h (neutral_op_for_reduction): Add default
	parameter.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
---
 gcc/internal-fn.cc                            |  58 ++++++
 gcc/internal-fn.h                             |   1 +
 .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 +++++++++++++
 .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++
 .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
 .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
 gcc/tree-if-conv.cc                           |  49 +++--
 gcc/tree-vect-loop.cc                         | 193 ++++++++++++++----
 gcc/tree-vectorizer.h                         |   2 +-
 9 files changed, 536 insertions(+), 55 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 61d5a9e4772..018175261b9 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4697,6 +4697,64 @@ internal_fn_len_index (internal_fn fn)
     }
 }
 
+int
+internal_fn_else_index (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_COND_NEG:
+    case IFN_COND_NOT:
+    case IFN_COND_LEN_NEG:
+    case IFN_COND_LEN_NOT:
+      return 2;
+
+    case IFN_COND_ADD:
+    case IFN_COND_SUB:
+    case IFN_COND_MUL:
+    case IFN_COND_DIV:
+    case IFN_COND_MOD:
+    case IFN_COND_MIN:
+    case IFN_COND_MAX:
+    case IFN_COND_FMIN:
+    case IFN_COND_FMAX:
+    case IFN_COND_AND:
+    case IFN_COND_IOR:
+    case IFN_COND_XOR:
+    case IFN_COND_SHL:
+    case IFN_COND_SHR:
+    case IFN_COND_LEN_ADD:
+    case IFN_COND_LEN_SUB:
+    case IFN_COND_LEN_MUL:
+    case IFN_COND_LEN_DIV:
+    case IFN_COND_LEN_MOD:
+    case IFN_COND_LEN_MIN:
+    case IFN_COND_LEN_MAX:
+    case IFN_COND_LEN_FMIN:
+    case IFN_COND_LEN_FMAX:
+    case IFN_COND_LEN_AND:
+    case IFN_COND_LEN_IOR:
+    case IFN_COND_LEN_XOR:
+    case IFN_COND_LEN_SHL:
+    case IFN_COND_LEN_SHR:
+      return 3;
+
+    case IFN_COND_FMA:
+    case IFN_COND_FMS:
+    case IFN_COND_FNMA:
+    case IFN_COND_FNMS:
+    case IFN_COND_LEN_FMA:
+    case IFN_COND_LEN_FMS:
+    case IFN_COND_LEN_FNMA:
+    case IFN_COND_LEN_FNMS:
+      return 4;
+
+    default:
+      return -1;
+    }
+
+  return -1;
+}
+
 /* If FN takes a vector mask argument, return the index of that argument,
    otherwise return -1.  */
 
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 99de13a0199..7d72f4db2d0 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -237,6 +237,7 @@ extern bool internal_store_fn_p (internal_fn);
 extern bool internal_gather_scatter_fn_p (internal_fn);
 extern int internal_fn_mask_index (internal_fn);
 extern int internal_fn_len_index (internal_fn);
+extern int internal_fn_else_index (internal_fn);
 extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 						    tree, tree, int);
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..7b46e7d8a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res += a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone))
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      res -= a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      cond1[i] = 0;
+      cond2[i] = i & 4 ? 1 : 0;
+      asm volatile ("" ::: "memory");
+    }
+
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..83dbd61b3f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init += a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init -= a[i];
+  return init;
+}
+
+double
+__attribute__ ((noipa))
+foo4 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init *= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo5 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init &= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo6 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init |= a[i];
+  return init;
+}
+
+int
+__attribute__ ((noipa))
+foo7 (int *__restrict a, int init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])
+      init ^= a[i];
+  return init;
+}
+
+#define SZ 125
+
+int
+main ()
+{
+  double res1 = 0, res2 = 0, res3 = 0;
+  double a1[SZ], a2[SZ], a3[SZ];
+  int c1[SZ], c2[SZ], c3[SZ];
+
+  int a4[SZ], a5[SZ], a6[SZ];
+  int res4 = 0, res5 = 0, res6 = 0;
+  int c4[SZ], c5[SZ], c6[SZ];
+
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);
+      a2[i] = i * 3 + (i & 4) - (i & 7);
+      a3[i] = i * 0.05 + (i & 4) - (i & 7);
+      a4[i] = i * 3 + (i & 4) - (i & 7);
+      a5[i] = i * 3 + (i & 4) - (i & 7);
+      a6[i] = i * 3 + (i & 4) - (i & 7);
+      c1[i] = i & 1;
+      c2[i] = i & 2;
+      c3[i] = i & 3;
+      c4[i] = i & 4;
+      c5[i] = i & 5;
+      c6[i] = i & 6;
+      __asm__ volatile ("" : : : "memory");
+    }
+
+  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
+  double ref1 = init1, ref2 = init2, ref3 = init3;
+
+  int init4 = 87, init5 = 11, init6 = -123894344;
+  int ref4 = init4, ref5 = init5, ref6 = init6;
+
+#pragma GCC novector
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+      if (c3[i])
+        ref3 *= a3[i];
+      if (c4[i])
+        ref4 &= a4[i];
+      if (c5[i])
+        ref5 |= a5[i];
+      if (c6[i])
+        ref6 ^= a6[i];
+    }
+
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  res3 = foo4 (a3, init3, c3, SZ);
+  res4 = foo5 (a4, init4, c4, SZ);
+  res5 = foo6 (a5, init5, c5, SZ);
+  res6 = foo7 (a6, init6, c6, SZ);
+
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+  if (res3 != ref3)
+    __builtin_abort ();
+  if (res4 != ref4)
+    __builtin_abort ();
+  if (res5 != ref5)
+    __builtin_abort ();
+  if (res6 != ref6)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
index cc07a047cd5..7be22d60bf2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
index 6d00c404d2a..83beabeff97 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
@@ -3,4 +3,6 @@
 
 #include "reduc_call-1.c"
 
-/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
+/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
+/* { dg-final { scan-assembler-not {vmerge} } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 262765139ff..4fea1000911 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
-
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
+     The COND_OP's else value is OP0, i.e. the result when COND is false.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 40f167d2795..3b28c826b3b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  /* We support MINUS_EXPR by negating the operand.  This also preserves an
+     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
+     (-0.0) == -0.0.  */
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
+   In that case zero is returned rather than -0.0.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4079,12 +4088,37 @@ pop:
       use_operand_p use_p;
       gimple *op_use_stmt;
       unsigned cnt = 0;
+      bool cond_fn_p = op.code.is_internal_fn ()
+	&& (conditional_internal_fn_code (internal_fn (op.code))
+	    != ERROR_MARK);
+
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
-	if (!is_gimple_debug (op_use_stmt)
-	    && (*code != ERROR_MARK
-		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
-	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-	    cnt++;
+	{
+	/* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	   op1 twice (once as definition, once as else) in the same operation.
+	   Allow this.  */
+	  if (cond_fn_p)
+	    {
+	      gcall *call = dyn_cast<gcall *> (use_stmt);
+	      unsigned else_pos
+		= internal_fn_else_index (internal_fn (op.code));
+
+	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
+		{
+		  if (j == else_pos)
+		    continue;
+		  if (gimple_call_arg (call, j) == op.ops[opi])
+		    cnt++;
+		}
+	    }
+	  else if (!is_gimple_debug (op_use_stmt)
+		   && (*code != ERROR_MARK
+		       || flow_bb_inside_loop_p (loop,
+						 gimple_bb (op_use_stmt))))
+	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+	      cnt++;
+	}
+
       if (cnt != 1)
 	{
 	  fail = true;
@@ -4187,8 +4221,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one.  */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6122,7 +6162,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
     }
-  
+
   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
   scalar_type = TREE_TYPE (scalar_dest);
   scalar_results.truncate (0);
@@ -6459,7 +6499,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6941,8 +6981,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6958,17 +6998,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (!code.is_tree_code ())
+    {
+      code = conditional_internal_fn_code (internal_fn (code));
+      gcc_assert (code != ERROR_MARK);
+      is_cond_op = true;
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-			  TYPE_VECTOR_SUBPARTS (vectype_in)));
+    {
+      if (is_cond_op)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "fold-left reduction on SLP not supported.\n");
+	  return false;
+	}
+
+      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+			    TYPE_VECTOR_SUBPARTS (vectype_in)));
+    }
 
-  tree op0 = ops[1 - reduc_index];
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6984,9 +7055,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = gimple_get_lhs (sdef);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -7020,13 +7097,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
@@ -7038,7 +7118,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -7069,8 +7150,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7521,8 +7602,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
-         leading to reduc_def.  */
+	 leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
 	return false;
 
@@ -7721,6 +7807,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If-conversion might have created a conditional operation like
+     IFN_COND_ADD already.  Use the internal code for the following checks.  */
+  if (orig_code.is_internal_fn ())
+    {
+      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
+      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
+    }
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7759,7 +7854,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8143,9 +8238,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 	}
       else if (reduction_type == FOLD_LEFT_REDUCTION
-	       && reduc_fn == IFN_LAST
+	       && internal_fn_mask_index (reduc_fn) == -1
 	       && FLOAT_TYPE_P (vectype_in)
-	       && HONOR_SIGNED_ZEROS (vectype_in)
 	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
 	{
 	  if (dump_enabled_p ())
@@ -8294,6 +8388,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8312,17 +8407,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A binary COND_OP reduction must have the same definition and else
+     value.  */
+  bool cond_fn_p = code.is_internal_fn ()
+    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
+  if (cond_fn_p)
+    {
+      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
+		  || code == IFN_COND_MUL || code == IFN_COND_AND
+		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
+      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+    }
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p);
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8335,14 +8442,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8382,7 +8495,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8395,10 +8508,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p)
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (code.is_internal_fn () && cond_fn_p)
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..254d172231d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */
  
Richard Sandiford Oct. 31, 2023, 9:04 p.m. UTC | #28
Robin Dapp <rdapp.gcc@gmail.com> writes:
> Changed as suggested.  The difference to v5 is thus:
>
> +	  if (cond_fn_p)
> +	    {
> +	      gcall *call = dyn_cast<gcall *> (use_stmt);
> +	      unsigned else_pos
> +		= internal_fn_else_index (internal_fn (op.code));
> +
> +	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
> +		{
> +		  if (j == else_pos)
> +		    continue;
> +		  if (gimple_call_arg (call, j) == op.ops[opi])
> +		    cnt++;
> +		}
> +	    }
> +	  else if (!is_gimple_debug (op_use_stmt)
>
> as well as internal_fn_else_index.
>
> Testsuite on riscv is unchanged, bootstrap and testsuite on power10 done,
> aarch64 and x86 still running.
>
> Regards
>  Robin
>
> From e11ac2b5889558c58ce711d8119ebcd78173ac6c Mon Sep 17 00:00:00 2001
> From: Robin Dapp <rdapp@ventanamicro.com>
> Date: Wed, 13 Sep 2023 22:19:35 +0200
> Subject: [PATCH v6] ifcvt/vect: Emit COND_OP for conditional scalar reduction.
>
> As described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD/COND_OP during
> ifcvt and adjusting some vectorizer code to handle it.
>
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
>
> gcc/ChangeLog:
>
> 	PR middle-end/111401
> 	* internal-fn.cc (internal_fn_else_index): New function.
> 	* internal-fn.h (internal_fn_else_index): Define.
> 	* tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_OP
> 	if supported.
> 	(predicate_scalar_phi): Add whitespace.
> 	* tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_OP.
> 	(neutral_op_for_reduction): Return -0 for PLUS.
> 	(check_reduction_path): Don't count else operand in COND_OP.
> 	(vect_is_simple_reduction): Ditto.
> 	(vect_create_epilog_for_reduction): Fix whitespace.
> 	(vectorize_fold_left_reduction): Add COND_OP handling.
> 	(vectorizable_reduction): Don't count else operand in COND_OP.
> 	(vect_transform_reduction): Add COND_OP handling.
> 	* tree-vectorizer.h (neutral_op_for_reduction): Add default
> 	parameter.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
> 	* gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c: Adjust.
> 	* gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c: Ditto.
>
> ---
>  gcc/internal-fn.cc                            |  58 ++++++
>  gcc/internal-fn.h                             |   1 +
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 +++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         | 139 +++++++++++++
>  .../riscv/rvv/autovec/reduc/reduc_call-2.c    |   4 +-
>  .../riscv/rvv/autovec/reduc/reduc_call-4.c    |   4 +-
>  gcc/tree-if-conv.cc                           |  49 +++--
>  gcc/tree-vect-loop.cc                         | 193 ++++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  9 files changed, 536 insertions(+), 55 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 61d5a9e4772..018175261b9 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4697,6 +4697,64 @@ internal_fn_len_index (internal_fn fn)
>      }
>  }
>  
> +int
> +internal_fn_else_index (internal_fn fn)

The function needs a comment, maybe:

/* If FN is an IFN_COND_* or IFN_COND_LEN_* function, return the index of the
   argument that is used when the condition is false.  Return -1 otherwise.  */

OK for the internal-fn* and tree-if-conv.cc bits (which were the
parts I commented on earlier).  I'll look at cleaning up the
definition of conditional internal functions separately, so that
the list of functions isn't necessary.

Thanks,
Richard

> +{
> +  switch (fn)
> +    {
> +    case IFN_COND_NEG:
> +    case IFN_COND_NOT:
> +    case IFN_COND_LEN_NEG:
> +    case IFN_COND_LEN_NOT:
> +      return 2;
> +
> +    case IFN_COND_ADD:
> +    case IFN_COND_SUB:
> +    case IFN_COND_MUL:
> +    case IFN_COND_DIV:
> +    case IFN_COND_MOD:
> +    case IFN_COND_MIN:
> +    case IFN_COND_MAX:
> +    case IFN_COND_FMIN:
> +    case IFN_COND_FMAX:
> +    case IFN_COND_AND:
> +    case IFN_COND_IOR:
> +    case IFN_COND_XOR:
> +    case IFN_COND_SHL:
> +    case IFN_COND_SHR:
> +    case IFN_COND_LEN_ADD:
> +    case IFN_COND_LEN_SUB:
> +    case IFN_COND_LEN_MUL:
> +    case IFN_COND_LEN_DIV:
> +    case IFN_COND_LEN_MOD:
> +    case IFN_COND_LEN_MIN:
> +    case IFN_COND_LEN_MAX:
> +    case IFN_COND_LEN_FMIN:
> +    case IFN_COND_LEN_FMAX:
> +    case IFN_COND_LEN_AND:
> +    case IFN_COND_LEN_IOR:
> +    case IFN_COND_LEN_XOR:
> +    case IFN_COND_LEN_SHL:
> +    case IFN_COND_LEN_SHR:
> +      return 3;
> +
> +    case IFN_COND_FMA:
> +    case IFN_COND_FMS:
> +    case IFN_COND_FNMA:
> +    case IFN_COND_FNMS:
> +    case IFN_COND_LEN_FMA:
> +    case IFN_COND_LEN_FMS:
> +    case IFN_COND_LEN_FNMA:
> +    case IFN_COND_LEN_FNMS:
> +      return 4;
> +
> +    default:
> +      return -1;
> +    }
> +
> +  return -1;
> +}
> +
>  /* If FN takes a vector mask argument, return the index of that argument,
>     otherwise return -1.  */
>  
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..7d72f4db2d0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -237,6 +237,7 @@ extern bool internal_store_fn_p (internal_fn);
>  extern bool internal_gather_scatter_fn_p (internal_fn);
>  extern int internal_fn_mask_index (internal_fn);
>  extern int internal_fn_len_index (internal_fn);
> +extern int internal_fn_else_index (internal_fn);
>  extern int internal_fn_stored_value_index (internal_fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>  						    tree, tree, int);
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..7b46e7d8a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=gnu99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..83dbd61b3f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,139 @@
> +/* { dg-do run { target { riscv_v } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo4 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init *= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo5 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init &= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo6 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init |= a[i];
> +  return init;
> +}
> +
> +int
> +__attribute__ ((noipa))
> +foo7 (int *__restrict a, int init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init ^= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0, res3 = 0;
> +  double a1[SZ], a2[SZ], a3[SZ];
> +  int c1[SZ], c2[SZ], c3[SZ];
> +
> +  int a4[SZ], a5[SZ], a6[SZ];
> +  int res4 = 0, res5 = 0, res6 = 0;
> +  int c4[SZ], c5[SZ], c6[SZ];
> +
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      a3[i] = i * 0.05 + (i & 4) - (i & 7);
> +      a4[i] = i * 3 + (i & 4) - (i & 7);
> +      a5[i] = i * 3 + (i & 4) - (i & 7);
> +      a6[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 2;
> +      c3[i] = i & 3;
> +      c4[i] = i & 4;
> +      c5[i] = i & 5;
> +      c6[i] = i & 6;
> +      __asm__ volatile ("" : : : "memory");
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2, init3 = 0.1;
> +  double ref1 = init1, ref2 = init2, ref3 = init3;
> +
> +  int init4 = 87, init5 = 11, init6 = -123894344;
> +  int ref4 = init4, ref5 = init5, ref6 = init6;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +      if (c3[i])
> +        ref3 *= a3[i];
> +      if (c4[i])
> +        ref4 &= a4[i];
> +      if (c5[i])
> +        ref5 |= a5[i];
> +      if (c6[i])
> +        ref6 ^= a6[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +  res3 = foo4 (a3, init3, c3, SZ);
> +  res4 = foo5 (a4, init4, c4, SZ);
> +  res5 = foo6 (a5, init5, c5, SZ);
> +  res6 = foo7 (a6, init6, c6, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +  if (res3 != ref3)
> +    __builtin_abort ();
> +  if (res4 != ref4)
> +    __builtin_abort ();
> +  if (res5 != ref5)
> +    __builtin_abort ();
> +  if (res6 != ref6)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 5 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> index cc07a047cd5..7be22d60bf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-2.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler-times {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} 1 } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> index 6d00c404d2a..83beabeff97 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/reduc/reduc_call-4.c
> @@ -3,4 +3,6 @@
>  
>  #include "reduc_call-1.c"
>  
> -/* { dg-final { scan-assembler {vfmacc\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler {vfmadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+} } } */
> +/* { dg-final { scan-assembler {vfadd\.vv\s+v[0-9]+,v[0-9]+,v[0-9]+,v0.t} } } */
> +/* { dg-final { scan-assembler-not {vmerge} } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 262765139ff..4fea1000911 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1856,10 +1856,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +					       NULL, false);
>    gimple_seq stmts = NULL;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1868,19 +1870,36 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>  
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -			    unshare_expr (cond),
> -			    swap ? op_nochange : op1,
> -			    swap ? op1 : op_nochange);
> -
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -		      TREE_TYPE (rhs1), op0, tmp);
> +  /* If possible create a COND_OP instead of a COND_EXPR and an OP_EXPR.
> +     The COND_OP will have a neutral_op else value.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +						     unshare_expr (cond),
> +						     op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +	 of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +				unshare_expr (cond),
> +				swap ? op_nochange : op1,
> +				swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +			  TREE_TYPE (rhs1), op0, tmp);
> +    }
>  
>    if (has_nop)
>      {
> @@ -2292,7 +2311,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>  	{
>  	  /* Convert reduction stmt into vectorizable form.  */
>  	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -					       swap,has_nop, nop_reduc);
> +					       swap, has_nop, nop_reduc);
>  	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>  	}
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 40f167d2795..3b28c826b3b 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3762,7 +3762,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  /* We support MINUS_EXPR by negating the operand.  This also preserves an
> +     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
> +     (-0.0) = -0.0.  */
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3841,23 +3844,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>  
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -			  tree initial_value)
> +			  tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>  	return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +	  return build_real (scalar_type, dconstm0);
> +	else
> +	  return build_zero_cst (scalar_type);
>  
>        case MULT_EXPR:
>  	return build_one_cst (scalar_type);
> @@ -4079,12 +4088,37 @@ pop:
>        use_operand_p use_p;
>        gimple *op_use_stmt;
>        unsigned cnt = 0;
> +      bool cond_fn_p = op.code.is_internal_fn ()
> +	&& (conditional_internal_fn_code (internal_fn (op.code))
> +	    != ERROR_MARK);
> +
>        FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
> -	if (!is_gimple_debug (op_use_stmt)
> -	    && (*code != ERROR_MARK
> -		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
> -	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> -	    cnt++;
> +	{
> +	/* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	   op1 twice (once as definition, once as else) in the same operation.
> +	   Allow this.  */
> +	  if (cond_fn_p)
> +	    {
> +	      gcall *call = dyn_cast<gcall *> (use_stmt);
> +	      unsigned else_pos
> +		= internal_fn_else_index (internal_fn (op.code));
> +
> +	      for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
> +		{
> +		  if (j == else_pos)
> +		    continue;
> +		  if (gimple_call_arg (call, j) == op.ops[opi])
> +		    cnt++;
> +		}
> +	    }
> +	  else if (!is_gimple_debug (op_use_stmt)
> +		   && (*code != ERROR_MARK
> +		       || flow_bb_inside_loop_p (loop,
> +						 gimple_bb (op_use_stmt))))
> +	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> +	      cnt++;
> +	}
> +
>        if (cnt != 1)
>  	{
>  	  fail = true;
> @@ -4187,8 +4221,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>  
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +	 op1 twice (once as definition, once as else) in the same operation.
> +	 Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +	{
> +	  nphi_def_loop_uses++;
> +	  phi_use_stmt = use_stmt;
> +	}
>      }
>  
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6122,7 +6162,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
>        gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
>      }
> -  
> +
>    scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
>    scalar_type = TREE_TYPE (scalar_dest);
>    scalar_results.truncate (0);
> @@ -6459,7 +6499,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>  	    initial_value = reduc_info->reduc_initial_values[0];
>  	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -						 initial_value);
> +						 initial_value, false);
>  	}
>        if (neutral_op)
>  	vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6941,8 +6981,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  			       gimple_stmt_iterator *gsi,
>  			       gimple **vec_stmt, slp_tree slp_node,
>  			       gimple *reduc_def_stmt,
> -			       tree_code code, internal_fn reduc_fn,
> -			       tree ops[3], tree vectype_in,
> +			       code_helper code, internal_fn reduc_fn,
> +			       tree *ops, int num_ops, tree vectype_in,
>  			       int reduc_index, vec_loop_masks *masks,
>  			       vec_loop_lens *lens)
>  {
> @@ -6958,17 +6998,48 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (!code.is_tree_code ())
> +    {
> +      code = conditional_internal_fn_code (internal_fn (code));
> +      gcc_assert (code != ERROR_MARK);
> +      is_cond_op = true;
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>  
>    if (slp_node)
> -    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> -			  TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    {
> +      if (is_cond_op)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +			     "fold-left reduction on SLP not supported.\n");
> +	  return false;
> +	}
> +
> +      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
> +			    TYPE_VECTOR_SUBPARTS (vectype_in)));
> +    }
>  
> -  tree op0 = ops[1 - reduc_index];
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>  
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6984,9 +7055,15 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>  				     op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +					 opmask, &vec_opmask);
>      }
>  
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = gimple_get_lhs (sdef);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>  
> @@ -7020,13 +7097,16 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>  	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +	mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>  	{
>  	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>  				   i, 1);
>  	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>  	  bias = build_int_cst (intQI_type_node, biasval);
> -	  mask = build_minus_one_cst (truth_type_for (vectype_in));
> +	  if (!is_cond_op)
> +	    mask = build_minus_one_cst (truth_type_for (vectype_in));
>  	}
>  
>        /* Handle MINUS by adding the negative.  */
> @@ -7038,7 +7118,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	  def0 = negated;
>  	}
>  
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +	  && mask && mask_reduc_fn == IFN_LAST)
>  	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>  				    vector_identity);
>  
> @@ -7069,8 +7150,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -					     reduc_var, def0);
> +	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +					     tree_code (code), reduc_var, def0);
>  	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>  	  /* Remove the statement, so that we can use the same code paths
>  	     as for statements that we've just created.  */
> @@ -7521,8 +7602,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>  	continue;
>  
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +	 twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +	continue;
> +
>        /* There should be only one cycle def in the stmt, the one
> -         leading to reduc_def.  */
> +	 leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
>  	return false;
>  
> @@ -7721,6 +7807,15 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>  
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (orig_code.is_internal_fn ())
> +    {
> +      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
> +      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
> +    }
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7759,7 +7854,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	{
>  	  if (dump_enabled_p ())
>  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			    "reduction: not commutative/associative");
> +			    "reduction: not commutative/associative\n");
>  	  return false;
>  	}
>      }
> @@ -8143,9 +8238,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  	}
>        else if (reduction_type == FOLD_LEFT_REDUCTION
> -	       && reduc_fn == IFN_LAST
> +	       && internal_fn_mask_index (reduc_fn) == -1
>  	       && FLOAT_TYPE_P (vectype_in)
> -	       && HONOR_SIGNED_ZEROS (vectype_in)
>  	       && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
>  	{
>  	  if (dump_enabled_p ())
> @@ -8294,6 +8388,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8312,17 +8407,29 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>  
> +  /* A binary COND_OP reduction must have the same definition and else
> +     value. */
> +  bool cond_fn_p = code.is_internal_fn ()
> +    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
> +  if (cond_fn_p)
> +    {
> +      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
> +		  || code == IFN_COND_MUL || code == IFN_COND_AND
> +		  || code == IFN_COND_IOR || code == IFN_COND_XOR);
> +      gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +    }
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>  
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p);
>        return vectorize_fold_left_reduction
>  	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -	   lens);
> +	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +	   reduc_index, masks, lens);
>      }
>  
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8335,14 +8442,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>  
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>  		     single_defuse_cycle && reduc_index == 0
>  		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
>  		     single_defuse_cycle && reduc_index == 1
>  		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -		     op.num_ops == 3
> -		     && !(single_defuse_cycle && reduc_index == 2)
> +		     op.num_ops == 4
> +		     || (op.num_ops == 3
> +			 && !(single_defuse_cycle && reduc_index == 2))
>  		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8382,7 +8495,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	}
>        else
>  	{
> -	  if (op.num_ops == 3)
> +	  if (op.num_ops >= 3)
>  	    vop[2] = vec_oprnds2[i];
>  
>  	  if (masked_loop_p && mask_by_cond_expr)
> @@ -8395,10 +8508,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>  	  if (emulated_mixed_dot_prod)
>  	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>  						    vec_dest, vop);
> -	  else if (code.is_internal_fn ())
> +
> +	  else if (code.is_internal_fn () && !cond_fn_p)
>  	    new_stmt = gimple_build_call_internal (internal_fn (code),
>  						   op.num_ops,
>  						   vop[0], vop[1], vop[2]);
> +	  else if (code.is_internal_fn () && cond_fn_p)
> +	    new_stmt = gimple_build_call_internal (internal_fn (code),
> +						   op.num_ops,
> +						   vop[0], vop[1], vop[2],
> +						   vop[1]);
>  	  else
>  	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>  					    vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index a4043e4a656..254d172231d 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2350,7 +2350,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>  						  tree);
>  
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
  
Robin Dapp Oct. 31, 2023, 9:19 p.m. UTC | #29
>> +int
>> +internal_fn_else_index (internal_fn fn)
> 
> The function needs a comment, maybe:
> 
> /* If FN is an IFN_COND_* or IFN_COND_LEN_* function, return the index of the
>    argument that is used when the condition is false.  Return -1 otherwise.  */
> 
> OK for the internal-fn* and tree-if-conv.cc bits (which were the
> parts I commented on earlier).  I'll look at cleaning up the
> definition of conditional internal functions separately, so that
> the list of functions isn't necessary.

Thank you, added the comment (shouldn't have forgotten it in the
first place...).  So there's the vectorizer part left that is not
yet OK'd.  

Regards
 Robin
  
Richard Biener Nov. 2, 2023, 7:48 a.m. UTC | #30
On Tue, 31 Oct 2023, Robin Dapp wrote:

> >> +int
> >> +internal_fn_else_index (internal_fn fn)
> > 
> > The function needs a comment, maybe:
> > 
> > /* If FN is an IFN_COND_* or IFN_COND_LEN_* function, return the index of the
> >    argument that is used when the condition is false.  Return -1 otherwise.  */
> > 
> > OK for the internal-fn* and tree-if-conv.cc bits (which were the
> > parts I commented on earlier).  I'll look at cleaning up the
> > definition of conditional internal functions separately, so that
> > the list of functions isn't necessary.
> 
> Thank you, added the comment (shouldn't have forgotten it in the
> first place...).  So there's the vectorizer part left that is not
> yet OK'd.  

The vectorizer part is OK.

Richard.
  
Andrew Pinski Nov. 2, 2023, 11:26 p.m. UTC | #31
On Wed, Sep 20, 2023 at 6:52 AM Robin Dapp <rdapp.gcc@gmail.com> wrote:
>
> Hi,
>
> as described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
>
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
>
> Related question/change: We only allow PLUS_EXPR in fold_left_reduction_fn
> but have code to handle MINUS_EXPR in vectorize_fold_left_reduction.  I
> suppose that's intentional but it "just works" on riscv and the testsuite
> doesn't change when allowing MINUS_EXPR so I went ahead and did that.
>
> Bootstrapped and regtested on x86 and aarch64.

This caused the gcc.target/i386/avx512f-reduce-op-1.c testcase to start
failing when testing on an x86_64 machine that has avx512f (in my case,
an `Intel(R) Xeon(R) D-2166NT CPU @ 2.00GHz`).  I also reverted the
commit to double-check.

The difference I see in the optimized dump is:
  if (_40 != 3.5e+1) // working
vs
  if (_40 != 6.4e+1) // not working

It is test_epi32_ps that is failing, via the TEST_PS macro and the plus
operation that uses TESTOP:
    TESTOP (add, +, float, ps, 0.0f);                                   \

I have not reduced the testcase any further though.

Thanks,
Andrew Pinski


>
> Regards
>  Robin
>
> gcc/ChangeLog:
>
>         PR middle-end/111401
>         * internal-fn.cc (cond_fn_p): New function.
>         * internal-fn.h (cond_fn_p): Define.
>         * tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
>         if supported.
>         (predicate_scalar_phi): Add whitespace.
>         * tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
>         (neutral_op_for_reduction): Return -0 for PLUS.
>         (vect_is_simple_reduction): Don't count else operand in
>         COND_ADD.
>         (vectorize_fold_left_reduction): Add COND_ADD handling.
>         (vectorizable_reduction): Don't count else operand in COND_ADD.
>         (vect_transform_reduction): Add COND_ADD handling.
>         * tree-vectorizer.h (neutral_op_for_reduction): Add default
>         parameter.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
>         * gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  gcc/internal-fn.cc                            |  38 +++++
>  gcc/internal-fn.h                             |   1 +
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++++++++++++++++++
>  .../riscv/rvv/autovec/cond/pr111401.c         |  61 ++++++++
>  gcc/tree-if-conv.cc                           |  63 ++++++--
>  gcc/tree-vect-loop.cc                         | 130 ++++++++++++----
>  gcc/tree-vectorizer.h                         |   2 +-
>  7 files changed, 394 insertions(+), 42 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0fd34359247..77939890f5a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4241,6 +4241,44 @@ first_commutative_argument (internal_fn fn)
>      }
>  }
>
> +/* Return true if this CODE describes a conditional (masked) internal_fn.  */
> +
> +bool
> +cond_fn_p (code_helper code)
> +{
> +  if (!code.is_fn_code ())
> +    return false;
> +
> +  if (!internal_fn_p ((combined_fn) code))
> +    return false;
> +
> +  internal_fn fn = as_internal_fn ((combined_fn) code);
> +  switch (fn)
> +    {
> +    #undef DEF_INTERNAL_COND_FN
> +    #define DEF_INTERNAL_COND_FN(NAME, F, O, T)                          \
> +    case IFN_COND_##NAME:                                        \
> +    case IFN_COND_LEN_##NAME:                                    \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_COND_FN
> +
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +    #define DEF_INTERNAL_SIGNED_COND_FN(NAME, F, S, SO, UO, T)   \
> +    case IFN_COND_##NAME:                                        \
> +    case IFN_COND_LEN_##NAME:                                    \
> +      return true;
> +    #include "internal-fn.def"
> +    #undef DEF_INTERNAL_SIGNED_COND_FN
> +
> +    default:
> +      return false;
> +    }
> +
> +  return false;
> +}
> +
> +
>  /* Return true if this CODE describes an internal_fn that returns a vector with
>     elements twice as wide as the element size of the input vectors.  */
>
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..f1cc9db29c0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
>  extern int first_commutative_argument (internal_fn);
>  extern bool associative_binary_fn_p (internal_fn);
>  extern bool widening_fn_p (code_helper);
> +extern bool cond_fn_p (code_helper code);
>
>  extern bool set_edom_supported_p (void);
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 00000000000..57c600838ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { dg-do run } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-add-options ieee } */
> +/* { dg-additional-options "-std=c99 -fno-fast-math" } */
> +
> +#include "tree-vect.h"
> +
> +#include <math.h>
> +
> +#define N (VECTOR_BITS * 17)
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_plus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res += a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone))
> +reduc_minus_double (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +double __attribute__ ((noinline, noclone, optimize ("0")))
> +reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
> +{
> +  double res = init;
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      res -= a[i];
> +  return res;
> +}
> +
> +int __attribute__ ((optimize (1)))
> +main ()
> +{
> +  int n = 19;
> +  double a[N];
> +  int cond1[N], cond2[N];
> +
> +  for (int i = 0; i < N; i++)
> +    {
> +      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
> +      cond1[i] = 0;
> +      cond2[i] = i & 4 ? 1 : 0;
> +      asm volatile ("" ::: "memory");
> +    }
> +
> +  double res1 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res2 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +  double res3 = reduc_plus_double (a, -0.0, cond1, n);
> +  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
> +  double res4 = reduc_minus_double (a, -0.0, cond1, n);
> +  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res2 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +  res3 = reduc_plus_double (a, 0.0, cond1, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
> +  res4 = reduc_minus_double (a, 0.0, cond1, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res2 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +  res3 = reduc_plus_double (a, -0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
> +  res4 = reduc_minus_double (a, -0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  res1 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res2 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +  res3 = reduc_plus_double (a, 0.0, cond2, n);
> +  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
> +  res4 = reduc_minus_double (a, 0.0, cond2, n);
> +  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
> +
> +  if (res1 != ref1 || signbit (res1) != signbit (ref1))
> +    __builtin_abort ();
> +  if (res2 != ref2 || signbit (res2) != signbit (ref2))
> +    __builtin_abort ();
> +  if (res3 != ref3 || signbit (res3) != signbit (ref3))
> +    __builtin_abort ();
> +  if (res4 != ref4 || signbit (res4) != signbit (ref4))
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> new file mode 100644
> index 00000000000..1d559ce5391
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> @@ -0,0 +1,61 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
> +
> +double
> +__attribute__ ((noipa))
> +foo2 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init += a[i];
> +  return init;
> +}
> +
> +double
> +__attribute__ ((noipa))
> +foo3 (double *__restrict a, double init, int *__restrict cond, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    if (cond[i])
> +      init -= a[i];
> +  return init;
> +}
> +
> +#define SZ 125
> +
> +__attribute__ ((optimize ("1")))
> +int
> +main ()
> +{
> +  double res1 = 0, res2 = 0;
> +  double a1[SZ], a2[SZ];
> +  int c1[SZ], c2[SZ];
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      a1[i] = i * 3 + (i & 4) - (i & 7);
> +      a2[i] = i * 3 + (i & 4) - (i & 7);
> +      c1[i] = i & 1;
> +      c2[i] = i & 1;
> +    }
> +
> +  double init1 = 2.7, init2 = 8.2;
> +  double ref1 = init1, ref2 = init2;
> +  for (int i = 0; i < SZ; i++)
> +    {
> +      if (c1[i])
> +        ref1 += a1[i];
> +      if (c2[i])
> +        ref2 -= a2[i];
> +    }
> +
> +  res1 = foo2 (a1, init1, c1, SZ);
> +  res2 = foo3 (a2, init2, c2, SZ);
> +
> +  if (res1 != ref1)
> +    __builtin_abort ();
> +  if (res2 != ref2)
> +    __builtin_abort ();
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 799f071965e..425976b0861 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1852,10 +1852,12 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    gimple *new_assign;
>    tree rhs;
>    tree rhs1 = gimple_assign_rhs1 (reduc);
> +  tree lhs = gimple_assign_lhs (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
>    enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> -  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
> +  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
> +                                              NULL, false);
>    gimple_seq stmts = NULL;
>
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1864,19 +1866,52 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>        print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
>      }
>
> -  /* Build cond expression using COND and constant operand
> -     of reduction rhs.  */
> -  c = fold_build_cond_expr (TREE_TYPE (rhs1),
> -                           unshare_expr (cond),
> -                           swap ? op_nochange : op1,
> -                           swap ? op1 : op_nochange);
> +  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
> +     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
> +     a vectorizable call as we can create a COND version of it directly.  */
> +  internal_fn ifn;
> +  ifn = get_conditional_internal_fn (reduction_op);
>
> -  /* Create assignment stmt and insert it at GSI.  */
> -  new_assign = gimple_build_assign (tmp, c);
> -  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> -  rhs = gimple_build (&stmts, reduction_op,
> -                     TREE_TYPE (rhs1), op0, tmp);
> +  bool try_cond_op = true;
> +  gimple *opstmt;
> +  if (TREE_CODE (op1) == SSA_NAME
> +      && (opstmt = SSA_NAME_DEF_STMT (op1))
> +      && is_gimple_call (opstmt))
> +    {
> +      combined_fn cfn = gimple_call_combined_fn (opstmt);
> +      internal_fn ifnop;
> +      reduction_fn_for_scalar_code (cfn, &ifnop);
> +      if (vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
> +                                             (gimple_call_lhs (opstmt))))
> +       try_cond_op = false;
> +    }
> +
> +  if (ifn != IFN_LAST
> +      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
> +      && try_cond_op && !swap)
> +    {
> +      gcall *cond_call = gimple_build_call_internal (ifn, 4,
> +                                                    unshare_expr (cond),
> +                                                    op0, op1, op0);
> +      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
> +      gimple_call_set_lhs (cond_call, tmp);
> +      rhs = tmp;
> +    }
> +  else
> +    {
> +      /* Build cond expression using COND and constant operand
> +        of reduction rhs.  */
> +      c = fold_build_cond_expr (TREE_TYPE (rhs1),
> +                               unshare_expr (cond),
> +                               swap ? op_nochange : op1,
> +                               swap ? op1 : op_nochange);
> +      /* Create assignment stmt and insert it at GSI.  */
> +      new_assign = gimple_build_assign (tmp, c);
> +      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> +      /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +      rhs = gimple_build (&stmts, reduction_op,
> +                         TREE_TYPE (rhs1), op0, tmp);
> +    }
>
>    if (has_nop)
>      {
> @@ -2241,7 +2276,7 @@ predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
>         {
>           /* Convert reduction stmt into vectorizable form.  */
>           rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
> -                                              swap,has_nop, nop_reduc);
> +                                              swap, has_nop, nop_reduc);
>           redundant_ssa_names.safe_push (std::make_pair (res, rhs));
>         }
>        new_stmt = gimple_build_assign (res, rhs);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 23c6e8259e7..94d3cead1e6 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -3672,7 +3672,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
>  static bool
>  fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
>  {
> -  if (code == PLUS_EXPR)
> +  if (code == PLUS_EXPR || code == MINUS_EXPR)
>      {
>        *reduc_fn = IFN_FOLD_LEFT_PLUS;
>        return true;
> @@ -3751,23 +3751,29 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
>     by the introduction of additional X elements, return that X, otherwise
>     return null.  CODE is the code of the reduction and SCALAR_TYPE is type
>     of the scalar elements.  If the reduction has just a single initial value
> -   then INITIAL_VALUE is that value, otherwise it is null.  */
> +   then INITIAL_VALUE is that value, otherwise it is null.
> +   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
> +   In that case no signed zero is returned.  */
>
>  tree
>  neutral_op_for_reduction (tree scalar_type, code_helper code,
> -                         tree initial_value)
> +                         tree initial_value, bool as_initial)
>  {
>    if (code.is_tree_code ())
>      switch (tree_code (code))
>        {
> -      case WIDEN_SUM_EXPR:
>        case DOT_PROD_EXPR:
>        case SAD_EXPR:
> -      case PLUS_EXPR:
>        case MINUS_EXPR:
>        case BIT_IOR_EXPR:
>        case BIT_XOR_EXPR:
>         return build_zero_cst (scalar_type);
> +      case WIDEN_SUM_EXPR:
> +      case PLUS_EXPR:
> +       if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
> +         return build_real (scalar_type, dconstm0);
> +       else
> +         return build_zero_cst (scalar_type);
>
>        case MULT_EXPR:
>         return build_one_cst (scalar_type);
> @@ -4106,8 +4112,14 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
>            return NULL;
>          }
>
> -      nphi_def_loop_uses++;
> -      phi_use_stmt = use_stmt;
> +      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
> +        op1 twice (once as definition, once as else) in the same operation.
> +        Only count it as one. */
> +      if (use_stmt != phi_use_stmt)
> +       {
> +         nphi_def_loop_uses++;
> +         phi_use_stmt = use_stmt;
> +       }
>      }
>
>    tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> @@ -6378,7 +6390,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
>             initial_value = reduc_info->reduc_initial_values[0];
>           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
> -                                                initial_value);
> +                                                initial_value, false);
>         }
>        if (neutral_op)
>         vector_identity = gimple_build_vector_from_val (&seq, vectype,
> @@ -6860,8 +6872,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>                                gimple_stmt_iterator *gsi,
>                                gimple **vec_stmt, slp_tree slp_node,
>                                gimple *reduc_def_stmt,
> -                              tree_code code, internal_fn reduc_fn,
> -                              tree ops[3], tree vectype_in,
> +                              code_helper code, internal_fn reduc_fn,
> +                              tree *ops, int num_ops, tree vectype_in,
>                                int reduc_index, vec_loop_masks *masks,
>                                vec_loop_lens *lens)
>  {
> @@ -6877,17 +6889,40 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>
>    gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
>    gcc_assert (ncopies == 1);
> -  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
> +
> +  bool is_cond_op = false;
> +  if (code.is_tree_code ())
> +    code = tree_code (code);
> +  else
> +    {
> +      gcc_assert (cond_fn_p (code));
> +      is_cond_op = true;
> +      code = conditional_internal_fn_code (internal_fn (code));
> +    }
> +
> +  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
>
>    if (slp_node)
>      gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
>                           TYPE_VECTOR_SUBPARTS (vectype_in)));
>
> -  tree op0 = ops[1 - reduc_index];
> +  /* The operands either come from a binary operation or an IFN_COND operation.
> +     The former is a gimple assign with binary rhs and the latter is a
> +     gimple call with four arguments.  */
> +  gcc_assert (num_ops == 2 || num_ops == 4);
> +  tree op0, opmask;
> +  if (!is_cond_op)
> +    op0 = ops[1 - reduc_index];
> +  else
> +    {
> +      op0 = ops[2];
> +      opmask = ops[0];
> +      gcc_assert (!slp_node);
> +    }
>
>    int group_size = 1;
>    stmt_vec_info scalar_dest_def_info;
> -  auto_vec<tree> vec_oprnds0;
> +  auto_vec<tree> vec_oprnds0, vec_opmask;
>    if (slp_node)
>      {
>        auto_vec<vec<tree> > vec_defs (2);
> @@ -6903,9 +6938,17 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
>                                      op0, &vec_oprnds0);
>        scalar_dest_def_info = stmt_info;
> +
> +      /* For an IFN_COND_OP we also need the vector mask operand.  */
> +      if (is_cond_op)
> +         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
> +                                        opmask, &vec_opmask);
>      }
>
> -  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
> +  gimple *sdef = scalar_dest_def_info->stmt;
> +  tree scalar_dest = is_gimple_call (sdef)
> +                      ? gimple_call_lhs (sdef)
> +                      : gimple_assign_lhs (scalar_dest_def_info->stmt);
>    tree scalar_type = TREE_TYPE (scalar_dest);
>    tree reduc_var = gimple_phi_result (reduc_def_stmt);
>
> @@ -6939,17 +6982,20 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>        tree bias = NULL_TREE;
>        if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>         mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
> +      else if (is_cond_op)
> +       mask = vec_opmask[0];
>        if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>         {
>           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
>                                    i, 1);
>           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>           bias = build_int_cst (intQI_type_node, biasval);
> -         mask = build_minus_one_cst (truth_type_for (vectype_in));
> +         if (!is_cond_op)
> +           mask = build_minus_one_cst (truth_type_for (vectype_in));
>         }
>
>        /* Handle MINUS by adding the negative.  */
> -      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
> +      if (reduc_fn != IFN_LAST && tree_code (code) == MINUS_EXPR)
>         {
>           tree negated = make_ssa_name (vectype_out);
>           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
> @@ -6957,7 +7003,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>           def0 = negated;
>         }
>
> -      if (mask && mask_reduc_fn == IFN_LAST)
> +      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> +         && mask && mask_reduc_fn == IFN_LAST)
>         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
>                                     vector_identity);
>
> @@ -6988,8 +7035,8 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
>         }
>        else
>         {
> -         reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
> -                                            reduc_var, def0);
> +         reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
> +                                            tree_code (code), reduc_var, def0);
>           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
>           /* Remove the statement, so that we can use the same code paths
>              as for statements that we've just created.  */
> @@ -7440,6 +7487,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>        if (i == STMT_VINFO_REDUC_IDX (stmt_info))
>         continue;
>
> +      /* For an IFN_COND_OP we might hit the reduction definition operand
> +        twice (once as definition, once as else).  */
> +      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
> +       continue;
> +
>        /* There should be only one cycle def in the stmt, the one
>           leading to reduc_def.  */
>        if (VECTORIZABLE_CYCLE_DEF (dt))
> @@ -7640,6 +7692,13 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>            when generating the code inside the loop.  */
>
>    code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
> +
> +  /* If conversion might have created a conditional operation like
> +     IFN_COND_ADD already.  Use the internal code for the following checks.  */
> +  if (cond_fn_p (orig_code))
> +      orig_code = conditional_internal_fn_code
> +       (as_internal_fn(combined_fn (orig_code)));
> +
>    STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
>
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
> @@ -7678,7 +7737,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                           "reduction: not commutative/associative");
> +                           "reduction: not commutative/associative\n");
>           return false;
>         }
>      }
> @@ -8213,6 +8272,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>
>    code_helper code = canonicalize_code (op.code, op.type);
>    internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
> +
>    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
>    vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>    bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
> @@ -8231,17 +8291,21 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    if (code == COND_EXPR)
>      gcc_assert (ncopies == 1);
>
> +  /* A COND_OP reduction must have the same definition and else value. */
> +  if (cond_fn_p (code))
> +    gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
> +
>    bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
>
>    vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
>    if (reduction_type == FOLD_LEFT_REDUCTION)
>      {
>        internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
> -      gcc_assert (code.is_tree_code ());
> +      gcc_assert (code.is_tree_code () || cond_fn_p (code));
>        return vectorize_fold_left_reduction
>           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
> -          tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
> -          lens);
> +          code, reduc_fn, op.ops, op.num_ops, vectype_in,
> +          reduc_index, masks, lens);
>      }
>
>    bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
> @@ -8254,14 +8318,20 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>    tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
>    tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>
> +  /* Get NCOPIES vector definitions for all operands except the reduction
> +     definition.  */
>    vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
>                      single_defuse_cycle && reduc_index == 0
>                      ? NULL_TREE : op.ops[0], &vec_oprnds0,
>                      single_defuse_cycle && reduc_index == 1
>                      ? NULL_TREE : op.ops[1], &vec_oprnds1,
> -                    op.num_ops == 3
> -                    && !(single_defuse_cycle && reduc_index == 2)
> +                    op.num_ops == 4
> +                    || (op.num_ops == 3
> +                        && !(single_defuse_cycle && reduc_index == 2))
>                      ? op.ops[2] : NULL_TREE, &vec_oprnds2);
> +
> +  /* For single def-use cycles get one copy of the vectorized reduction
> +     definition.  */
>    if (single_defuse_cycle)
>      {
>        gcc_assert (!slp_node);
> @@ -8301,7 +8371,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>         }
>        else
>         {
> -         if (op.num_ops == 3)
> +         if (op.num_ops >= 3)
>             vop[2] = vec_oprnds2[i];
>
>           if (masked_loop_p && mask_by_cond_expr)
> @@ -8314,10 +8384,16 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
>           if (emulated_mixed_dot_prod)
>             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
>                                                     vec_dest, vop);
> -         else if (code.is_internal_fn ())
> +
> +         else if (code.is_internal_fn () && !cond_fn_p (code))
>             new_stmt = gimple_build_call_internal (internal_fn (code),
>                                                    op.num_ops,
>                                                    vop[0], vop[1], vop[2]);
> +         else if (cond_fn_p (code))
> +           new_stmt = gimple_build_call_internal (internal_fn (code),
> +                                                  op.num_ops,
> +                                                  vop[0], vop[1], vop[2],
> +                                                  vop[1]);
>           else
>             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
>                                             vop[0], vop[1], vop[2]);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index f1d0cd79961..e22067400af 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2319,7 +2319,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
>                                                   tree);
>
>  /* In tree-vect-loop.cc.  */
> -extern tree neutral_op_for_reduction (tree, code_helper, tree);
> +extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
>  extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
>  bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
>  /* Used in tree-vect-loop-manip.cc */
> --
> 2.41.0
  

Patch

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 0fd34359247..77939890f5a 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4241,6 +4241,44 @@  first_commutative_argument (internal_fn fn)
     }
 }
 
+/* Return true if this CODE describes a conditional (masked) internal_fn.  */
+
+bool
+cond_fn_p (code_helper code)
+{
+  if (!code.is_fn_code ())
+    return false;
+
+  if (!internal_fn_p ((combined_fn) code))
+    return false;
+
+  internal_fn fn = as_internal_fn ((combined_fn) code);
+  switch (fn)
+    {
+    #undef DEF_INTERNAL_COND_FN
+    #define DEF_INTERNAL_COND_FN(NAME, F, O, T)			  \
+    case IFN_COND_##NAME:					  \
+    case IFN_COND_LEN_##NAME:					  \
+      return true;
+    #include "internal-fn.def"
+    #undef DEF_INTERNAL_COND_FN
+
+    #undef DEF_INTERNAL_SIGNED_COND_FN
+    #define DEF_INTERNAL_SIGNED_COND_FN(NAME, F, S, SO, UO, T)	  \
+    case IFN_COND_##NAME:					  \
+    case IFN_COND_LEN_##NAME:					  \
+      return true;
+    #include "internal-fn.def"
+    #undef DEF_INTERNAL_SIGNED_COND_FN
+
+    default:
+      return false;
+    }
+
+  return false;
+}
+
+
 /* Return true if this CODE describes an internal_fn that returns a vector with
    elements twice as wide as the element size of the input vectors.  */
 
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 99de13a0199..f1cc9db29c0 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -219,6 +219,7 @@  extern bool commutative_ternary_fn_p (internal_fn);
 extern int first_commutative_argument (internal_fn);
 extern bool associative_binary_fn_p (internal_fn);
 extern bool widening_fn_p (code_helper);
+extern bool cond_fn_p (code_helper code);
 
 extern bool set_edom_supported_p (void);
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
new file mode 100644
index 00000000000..57c600838ee
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
@@ -0,0 +1,141 @@ 
+/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-std=c99 -fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#include <math.h>
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))  /* Kept as a separate call.  */
+reduc_plus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;  /* Accumulate on top of INIT, sign of zero included.  */
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Only elements with a nonzero COND entry are added.  */
+      res += a[i];
+  return res;  /* INIT itself when none of the n COND entries was set.  */
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))  /* Reference.  */
+reduc_plus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;  /* Same starting value as the version under test.  */
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Same loop as reduc_plus_double.  */
+      res += a[i];
+  return res;  /* Expected result that main compares against.  */
+}
+
+double __attribute__ ((noinline, noclone))  /* Kept as a separate call.  */
+reduc_minus_double (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;  /* Accumulate on top of INIT, sign of zero included.  */
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Only elements with a nonzero COND entry are subtracted.  */
+      res -= a[i];
+  return res;  /* INIT itself when none of the n COND entries was set.  */
+}
+
+double __attribute__ ((noinline, noclone, optimize ("0")))  /* Reference.  */
+reduc_minus_double_ref (double *restrict a, double init, int *cond, int n)
+{
+  double res = init;  /* Same starting value as the version under test.  */
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Same loop as reduc_minus_double.  */
+      res -= a[i];
+  return res;  /* Expected result that main compares against.  */
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;  /* Elements reduced per call; N are filled.  */
+  double a[N];
+  int cond1[N], cond2[N];
+
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);  /* Negative for even i.  */
+      cond1[i] = 0;  /* All-false mask.  */
+      cond2[i] = i & 4 ? 1 : 0;  /* Set when bit 2 of i is set.  */
+      asm volatile ("" ::: "memory");  /* Presumably blocks vectorization.  */
+    }
+  /* Initial value -0.0, all-false mask: every result must stay -0.0.  */
+  double res1 = reduc_plus_double (a, -0.0, cond1, n);
+  double ref1 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res2 = reduc_minus_double (a, -0.0, cond1, n);
+  double ref2 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  double res3 = reduc_plus_double (a, -0.0, cond1, n);  /* Repeats res1.  */
+  double ref3 = reduc_plus_double_ref (a, -0.0, cond1, n);
+  double res4 = reduc_minus_double (a, -0.0, cond1, n);  /* Repeats res2.  */
+  double ref4 = reduc_minus_double_ref (a, -0.0, cond1, n);
+  /* Compare value and sign bit; signbit distinguishes -0.0 from 0.0.  */
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+  /* Initial value 0.0, all-false mask.  */
+  res1 = reduc_plus_double (a, 0.0, cond1, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res2 = reduc_minus_double (a, 0.0, cond1, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond1, n);
+  res3 = reduc_plus_double (a, 0.0, cond1, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond1, n);
+  res4 = reduc_minus_double (a, 0.0, cond1, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond1, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+  /* Initial value -0.0, mask COND2 selects some of the n elements.  */
+  res1 = reduc_plus_double (a, -0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res2 = reduc_minus_double (a, -0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, -0.0, cond2, n);
+  res3 = reduc_plus_double (a, -0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, -0.0, cond2, n);
+  res4 = reduc_minus_double (a, -0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, -0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+  /* Initial value 0.0, mask COND2.  */
+  res1 = reduc_plus_double (a, 0.0, cond2, n);
+  ref1 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res2 = reduc_minus_double (a, 0.0, cond2, n);
+  ref2 = reduc_minus_double_ref (a, 0.0, cond2, n);
+  res3 = reduc_plus_double (a, 0.0, cond2, n);
+  ref3 = reduc_plus_double_ref (a, 0.0, cond2, n);
+  res4 = reduc_minus_double (a, 0.0, cond2, n);
+  ref4 = reduc_minus_double_ref (a, 0.0, cond2, n);
+
+  if (res1 != ref1 || signbit (res1) != signbit (ref1))
+    __builtin_abort ();
+  if (res2 != ref2 || signbit (res2) != signbit (ref2))
+    __builtin_abort ();
+  if (res3 != ref3 || signbit (res3) != signbit (ref3))
+    __builtin_abort ();
+  if (res4 != ref4 || signbit (res4) != signbit (ref4))
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
new file mode 100644
index 00000000000..1d559ce5391
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
@@ -0,0 +1,61 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=scalable -fdump-tree-vect-details" } */
+
+double
+__attribute__ ((noipa))  /* No interprocedural optimization with callers.  */
+foo2 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Conditional sum accumulated directly into INIT.  */
+      init += a[i];
+  return init;  /* Unchanged INIT when no COND entry was nonzero.  */
+}
+
+double
+__attribute__ ((noipa))  /* No interprocedural optimization with callers.  */
+foo3 (double *__restrict a, double init, int *__restrict cond, int n)
+{
+  for (int i = 0; i < n; i++)
+    if (cond[i])  /* Conditional difference accumulated directly into INIT.  */
+      init -= a[i];
+  return init;  /* Unchanged INIT when no COND entry was nonzero.  */
+}
+
+#define SZ 125
+
+__attribute__ ((optimize ("1")))
+int
+main ()
+{
+  double res1 = 0, res2 = 0;
+  double a1[SZ], a2[SZ];
+  int c1[SZ], c2[SZ];
+  for (int i = 0; i < SZ; i++)
+    {
+      a1[i] = i * 3 + (i & 4) - (i & 7);  /* Integer-valued inputs.  */
+      a2[i] = i * 3 + (i & 4) - (i & 7);  /* Same data as a1.  */
+      c1[i] = i & 1;  /* Select the odd indices.  */
+      c2[i] = i & 1;  /* Same mask as c1.  */
+    }
+  /* Expected values, accumulated in index order inside main.  */
+  double init1 = 2.7, init2 = 8.2;
+  double ref1 = init1, ref2 = init2;
+  for (int i = 0; i < SZ; i++)
+    {
+      if (c1[i])
+        ref1 += a1[i];
+      if (c2[i])
+        ref2 -= a2[i];
+    }
+  /* Results of the functions under test.  */
+  res1 = foo2 (a1, init1, c1, SZ);
+  res2 = foo3 (a2, init2, c2, SZ);
+  /* Exact equality with the expected values is required.  */
+  if (res1 != ref1)
+    __builtin_abort ();
+  if (res2 != ref2)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-not "VCOND_MASK" "vect" } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 799f071965e..425976b0861 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1852,10 +1852,12 @@  convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   gimple *new_assign;
   tree rhs;
   tree rhs1 = gimple_assign_rhs1 (reduc);
+  tree lhs = gimple_assign_lhs (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
   enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
-  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op,
+					       NULL, false);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1864,19 +1866,52 @@  convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
       print_gimple_stmt (dump_file, reduc, 0, TDF_SLIM);
     }
 
-  /* Build cond expression using COND and constant operand
-     of reduction rhs.  */
-  c = fold_build_cond_expr (TREE_TYPE (rhs1),
-			    unshare_expr (cond),
-			    swap ? op_nochange : op1,
-			    swap ? op1 : op_nochange);
+  /* If possible try to create an IFN_COND_ADD instead of a COND_EXPR and
+     a PLUS_EXPR.  Don't do this if the reduction def operand itself is
+     a vectorizable call as we can create a COND version of it directly.  */
+  internal_fn ifn;
+  ifn = get_conditional_internal_fn (reduction_op);
 
-  /* Create assignment stmt and insert it at GSI.  */
-  new_assign = gimple_build_assign (tmp, c);
-  gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement/logic_operation.  */
-  rhs = gimple_build (&stmts, reduction_op,
-		      TREE_TYPE (rhs1), op0, tmp);
+  bool try_cond_op = true;
+  gimple *opstmt;
+  if (TREE_CODE (op1) == SSA_NAME
+      && (opstmt = SSA_NAME_DEF_STMT (op1))
+      && is_gimple_call (opstmt))
+    {
+      combined_fn cfn = gimple_call_combined_fn (opstmt);
+      internal_fn ifnop;  /* Only meaningful when the lookup below succeeds.  */
+      if (reduction_fn_for_scalar_code (cfn, &ifnop)
+	  && vectorized_internal_fn_supported_p (ifnop, TREE_TYPE
+						 (gimple_call_lhs (opstmt))))
+	try_cond_op = false;
+    }
+
+  if (ifn != IFN_LAST
+      && vectorized_internal_fn_supported_p (ifn, TREE_TYPE (lhs))
+      && try_cond_op && !swap)
+    {
+      gcall *cond_call = gimple_build_call_internal (ifn, 4,
+						     unshare_expr (cond),
+						     op0, op1, op0);
+      gsi_insert_before (gsi, cond_call, GSI_SAME_STMT);
+      gimple_call_set_lhs (cond_call, tmp);
+      rhs = tmp;
+    }
+  else
+    {
+      /* Build cond expression using COND and constant operand
+	 of reduction rhs.  */
+      c = fold_build_cond_expr (TREE_TYPE (rhs1),
+				unshare_expr (cond),
+				swap ? op_nochange : op1,
+				swap ? op1 : op_nochange);
+      /* Create assignment stmt and insert it at GSI.  */
+      new_assign = gimple_build_assign (tmp, c);
+      gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
+      /* Build rhs for unconditional increment/decrement/logic_operation.  */
+      rhs = gimple_build (&stmts, reduction_op,
+			  TREE_TYPE (rhs1), op0, tmp);
+    }
 
   if (has_nop)
     {
@@ -2241,7 +2276,7 @@  predicate_scalar_phi (gphi *phi, gimple_stmt_iterator *gsi)
 	{
 	  /* Convert reduction stmt into vectorizable form.  */
 	  rhs = convert_scalar_cond_reduction (reduc, gsi, cond, op0, op1,
-					       swap,has_nop, nop_reduc);
+					       swap, has_nop, nop_reduc);
 	  redundant_ssa_names.safe_push (std::make_pair (res, rhs));
 	}
       new_stmt = gimple_build_assign (res, rhs);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 23c6e8259e7..94d3cead1e6 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3672,7 +3672,7 @@  vect_analyze_loop (class loop *loop, vec_info_shared *shared)
 static bool
 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
 {
-  if (code == PLUS_EXPR)
+  if (code == PLUS_EXPR || code == MINUS_EXPR)
     {
       *reduc_fn = IFN_FOLD_LEFT_PLUS;
       return true;
@@ -3751,23 +3751,29 @@  reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
    of the scalar elements.  If the reduction has just a single initial value
-   then INITIAL_VALUE is that value, otherwise it is null.  */
+   then INITIAL_VALUE is that value, otherwise it is null.
+   If AS_INITIAL is TRUE the value is supposed to be used as initial value.
+   In that case no signed zero is returned.  */
 
 tree
 neutral_op_for_reduction (tree scalar_type, code_helper code,
-			  tree initial_value)
+			  tree initial_value, bool as_initial)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
       {
-      case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
       case SAD_EXPR:
-      case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
       case BIT_XOR_EXPR:
 	return build_zero_cst (scalar_type);
+      case WIDEN_SUM_EXPR:
+      case PLUS_EXPR:
+	if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
+	  return build_real (scalar_type, dconstm0);
+	else
+	  return build_zero_cst (scalar_type);
 
       case MULT_EXPR:
 	return build_one_cst (scalar_type);
@@ -4106,8 +4112,14 @@  vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
-      nphi_def_loop_uses++;
-      phi_use_stmt = use_stmt;
+      /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
+	 op1 twice (once as definition, once as else) in the same operation.
+	 Only count it as one. */
+      if (use_stmt != phi_use_stmt)
+	{
+	  nphi_def_loop_uses++;
+	  phi_use_stmt = use_stmt;
+	}
     }
 
   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
@@ -6378,7 +6390,7 @@  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	    initial_value = reduc_info->reduc_initial_values[0];
 	  neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
-						 initial_value);
+						 initial_value, false);
 	}
       if (neutral_op)
 	vector_identity = gimple_build_vector_from_val (&seq, vectype,
@@ -6860,8 +6872,8 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 			       gimple_stmt_iterator *gsi,
 			       gimple **vec_stmt, slp_tree slp_node,
 			       gimple *reduc_def_stmt,
-			       tree_code code, internal_fn reduc_fn,
-			       tree ops[3], tree vectype_in,
+			       code_helper code, internal_fn reduc_fn,
+			       tree *ops, int num_ops, tree vectype_in,
 			       int reduc_index, vec_loop_masks *masks,
 			       vec_loop_lens *lens)
 {
@@ -6877,17 +6889,40 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 
   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
   gcc_assert (ncopies == 1);
-  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+
+  bool is_cond_op = false;
+  if (code.is_tree_code ())
+    code = tree_code (code);
+  else
+    {
+      gcc_assert (cond_fn_p (code));
+      is_cond_op = true;
+      code = conditional_internal_fn_code (internal_fn (code));
+    }
+
+  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
 			  TYPE_VECTOR_SUBPARTS (vectype_in)));
 
-  tree op0 = ops[1 - reduc_index];
+  /* The operands either come from a binary operation or an IFN_COND operation.
+     The former is a gimple assign with binary rhs and the latter is a
+     gimple call with four arguments.  */
+  gcc_assert (num_ops == 2 || num_ops == 4);
+  tree op0, opmask;
+  if (!is_cond_op)
+    op0 = ops[1 - reduc_index];
+  else
+    {
+      op0 = ops[2];
+      opmask = ops[0];
+      gcc_assert (!slp_node);
+    }
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
-  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
@@ -6903,9 +6938,17 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 				     op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
+
+      /* For an IFN_COND_OP we also need the vector mask operand.  */
+      if (is_cond_op)
+	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+					 opmask, &vec_opmask);
     }
 
-  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
+  gimple *sdef = scalar_dest_def_info->stmt;
+  tree scalar_dest = is_gimple_call (sdef)
+		       ? gimple_call_lhs (sdef)
+		       : gimple_assign_lhs (scalar_dest_def_info->stmt);
   tree scalar_type = TREE_TYPE (scalar_dest);
   tree reduc_var = gimple_phi_result (reduc_def_stmt);
 
@@ -6939,17 +6982,20 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
       tree bias = NULL_TREE;
       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 	mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
+      else if (is_cond_op)
+	mask = vec_opmask[0];
       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 	{
 	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
 				   i, 1);
 	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
 	  bias = build_int_cst (intQI_type_node, biasval);
-	  mask = build_minus_one_cst (truth_type_for (vectype_in));
+	  if (!is_cond_op)
+	    mask = build_minus_one_cst (truth_type_for (vectype_in));
 	}
 
       /* Handle MINUS by adding the negative.  */
-      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
+      if (reduc_fn != IFN_LAST && tree_code (code) == MINUS_EXPR)
 	{
 	  tree negated = make_ssa_name (vectype_out);
 	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
@@ -6957,7 +7003,8 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	  def0 = negated;
 	}
 
-      if (mask && mask_reduc_fn == IFN_LAST)
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+	  && mask && mask_reduc_fn == IFN_LAST)
 	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
 				    vector_identity);
 
@@ -6988,8 +7035,8 @@  vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
-					     reduc_var, def0);
+	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
+					     tree_code (code), reduc_var, def0);
 	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
 	  /* Remove the statement, so that we can use the same code paths
 	     as for statements that we've just created.  */
@@ -7440,6 +7487,11 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
 	continue;
 
+      /* For an IFN_COND_OP we might hit the reduction definition operand
+	 twice (once as definition, once as else).  */
+      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
+	continue;
+
       /* There should be only one cycle def in the stmt, the one
          leading to reduc_def.  */
       if (VECTORIZABLE_CYCLE_DEF (dt))
@@ -7640,6 +7692,13 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
           when generating the code inside the loop.  */
 
   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
+
+  /* If-conversion might already have created a conditional operation like
+     IFN_COND_ADD.  Use the underlying tree code for the following checks.  */
+  if (cond_fn_p (orig_code))
+      orig_code = conditional_internal_fn_code
+	(as_internal_fn(combined_fn (orig_code)));
+
   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -7678,7 +7737,7 @@  vectorizable_reduction (loop_vec_info loop_vinfo,
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			    "reduction: not commutative/associative");
+			    "reduction: not commutative/associative\n");
 	  return false;
 	}
     }
@@ -8213,6 +8272,7 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 
   code_helper code = canonicalize_code (op.code, op.type);
   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
+
   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
@@ -8231,17 +8291,21 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
   if (code == COND_EXPR)
     gcc_assert (ncopies == 1);
 
+  /* A COND_OP reduction must have the same definition and else value. */
+  if (cond_fn_p (code))
+    gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
+
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
   if (reduction_type == FOLD_LEFT_REDUCTION)
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-      gcc_assert (code.is_tree_code ());
+      gcc_assert (code.is_tree_code () || cond_fn_p (code));
       return vectorize_fold_left_reduction
 	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
-	   tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
-	   lens);
+	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
+	   reduc_index, masks, lens);
     }
 
   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
@@ -8254,14 +8318,20 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* Get NCOPIES vector definitions for all operands except the reduction
+     definition.  */
   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
 		     single_defuse_cycle && reduc_index == 0
 		     ? NULL_TREE : op.ops[0], &vec_oprnds0,
 		     single_defuse_cycle && reduc_index == 1
 		     ? NULL_TREE : op.ops[1], &vec_oprnds1,
-		     op.num_ops == 3
-		     && !(single_defuse_cycle && reduc_index == 2)
+		     op.num_ops == 4
+		     || (op.num_ops == 3
+			 && !(single_defuse_cycle && reduc_index == 2))
 		     ? op.ops[2] : NULL_TREE, &vec_oprnds2);
+
+  /* For single def-use cycles get one copy of the vectorized reduction
+     definition.  */
   if (single_defuse_cycle)
     {
       gcc_assert (!slp_node);
@@ -8301,7 +8371,7 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 	}
       else
 	{
-	  if (op.num_ops == 3)
+	  if (op.num_ops >= 3)
 	    vop[2] = vec_oprnds2[i];
 
 	  if (masked_loop_p && mask_by_cond_expr)
@@ -8314,10 +8384,16 @@  vect_transform_reduction (loop_vec_info loop_vinfo,
 	  if (emulated_mixed_dot_prod)
 	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
 						    vec_dest, vop);
-	  else if (code.is_internal_fn ())
+
+	  else if (code.is_internal_fn () && !cond_fn_p (code))
 	    new_stmt = gimple_build_call_internal (internal_fn (code),
 						   op.num_ops,
 						   vop[0], vop[1], vop[2]);
+	  else if (cond_fn_p (code))
+	    new_stmt = gimple_build_call_internal (internal_fn (code),
+						   op.num_ops,
+						   vop[0], vop[1], vop[2],
+						   vop[1]);
 	  else
 	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
 					    vop[0], vop[1], vop[2]);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f1d0cd79961..e22067400af 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2319,7 +2319,7 @@  extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.cc.  */
-extern tree neutral_op_for_reduction (tree, code_helper, tree);
+extern tree neutral_op_for_reduction (tree, code_helper, tree, bool = true);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.cc */