Handle in-order reductions when SLP vectorizing non-loops

Message ID 20230809132301.0F2F138582B0@sourceware.org

Commit Message

Richard Biener Aug. 9, 2023, 1:22 p.m. UTC
The following teaches the non-loop reduction vectorization code to
handle non-associatable reductions.  Using the existing FOLD_LEFT_PLUS
internal functions might be possible, but I'd have to convince myself
that +0.0 + x[0] is a safe extra operation in every rounding mode
(I also have no way to test the resulting code).
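
For concreteness, a minimal illustration (mine, not part of the patch)
of why an extra leading +0.0 is not obviously value-preserving: IEEE
signed zeros make it observable.

#include <stdio.h>

/* Under the default round-to-nearest mode, +0.0 + (-0.0) yields +0.0,
   flipping the sign of a zero result; under FE_DOWNWARD the same sum
   yields -0.0.  So the extra leading add is rounding-mode dependent.  */
int
main (void)
{
  double x0 = -0.0;
  double r = +0.0 + x0;
  printf ("x0 = %g, +0.0 + x0 = %g\n", x0, r);  /* prints -0 and 0 */
  return 0;
}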

The reduction code is now extra lame in that it lacks any way to do
associatable reductions without direct optab support but always
supports in-order reductions by open-coding them.

I'll also note the pre-existing issue that every associatable
operation now triggers at least a two-lane SLP reduction discovery
attempt.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

Richard.

gcc/ChangeLog:

	* tree-vectorizer.h (vect_expand_fold_left): Export.
	* tree-vect-loop.cc (vect_expand_fold_left): Likewise.
	Support NULL lhs.
	* tree-vect-slp.cc (vect_slp_linearize_chain): Add a flag
	to indicate whether we can associate, if not only follow
	the left associative chain.
	(vect_build_slp_tree_2): Adjust.
	(vect_slp_check_for_constructors): Use needs_fold_left_reduction_p
	instead of open-coding, properly linearize the chain according
	to that and avoid sorting.
	(vectorizable_bb_reduc_epilogue): For fold-left reductions
	require a single vector definition and at most one remaining op.
	Adjust costing.
	(vectorize_slp_instance_root_stmt): Vectorize fold-left
	reductions with vect_expand_fold_left.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/bb-slp-75.c: New testcase.
	* gcc.dg/vect/bb-slp-46.c: Adjust.
	* gcc.dg/vect/vect-reduc-in-order-1.c: Disable SLP vectorization.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-46.c         |   2 +-
 gcc/testsuite/gcc.dg/vect/bb-slp-75.c         |  20 ++++
 .../gcc.dg/vect/vect-reduc-in-order-1.c       |   2 +-
 gcc/tree-vect-loop.cc                         |  21 ++--
 gcc/tree-vect-slp.cc                          | 111 ++++++++++++------
 gcc/tree-vectorizer.h                         |   2 +
 6 files changed, 111 insertions(+), 47 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-75.c
  

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-46.c b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c
index 98b29062a19..4eceea44efc 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-46.c
@@ -15,7 +15,7 @@  int foo ()
   a[1] = tem1;
   a[2] = tem2;
   a[3] = tem3;
-  return temx + temy;
+  return temx / temy;
 }
 
 /* We should extract the live lane from the vectorized add rather than
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-75.c b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c
new file mode 100644
index 00000000000..d7f91089c87
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double
+not_vectorizable (double *x)
+{
+  double r = x[0];
+  return r + x[2] + x[1];
+}
+
+double
+vectorizable (double *x)
+{
+  double r = x[0];
+  return r + x[1] + x[2];
+}
+
+/* We can vectorize the in-order reduction in vectorizable but not the one
+   in not_vectorizable since we cannot handle the gap after x[2].  */
+/* { dg-final { scan-tree-dump-times "optimized: basic block part vectorized" 1 "slp2" { target vect_hw_misalign } } } */
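
A side note on the testcase (my sketch, not part of it): "in order"
means the vectorized code must keep the adds associating to the left;
the reference semantics of vectorizable are:

double
vectorizable_reference (double *x)
{
  double r = x[0];
  r = r + x[1];  /* first in-order step */
  r = r + x[2];  /* second in-order step; reassociating to
                    x[0] + (x[1] + x[2]) could change the FP result */
  return r;
}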
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c
index 4c17f2c1978..c1c853cb93b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c
@@ -1,7 +1,7 @@ 
 /* { dg-xfail-run-if "" { { i?86-*-* x86_64-*-* } && ia32 } } */
 /* { dg-require-effective-target vect_double } */
 /* { dg-add-options ieee } */
-/* { dg-additional-options "-fno-fast-math" } */
+/* { dg-additional-options "-fno-fast-math -fno-tree-slp-vectorize" } */
 
 #include "tree-vect.h"
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bf8d677b584..1cd6bb43194 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6769,11 +6769,11 @@  merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
 }
 
 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
-   order, starting with LHS.  Insert the extraction statements before GSI and
-   associate the new scalar SSA names with variable SCALAR_DEST.
+   order, starting with LHS if not NULL.  Insert the extraction statements
+   before GSI and associate the new scalar SSA names with variable SCALAR_DEST.
    Return the SSA name for the result.  */
 
-static tree
+tree
 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
 		       tree_code code, tree lhs, tree vector_rhs)
 {
@@ -6796,11 +6796,16 @@  vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
       gimple_assign_set_lhs (stmt, rhs);
       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
 
-      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
-      tree new_name = make_ssa_name (scalar_dest, stmt);
-      gimple_assign_set_lhs (stmt, new_name);
-      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
-      lhs = new_name;
+      if (lhs)
+	{
+	  stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
+	  tree new_name = make_ssa_name (scalar_dest, stmt);
+	  gimple_assign_set_lhs (stmt, new_name);
+	  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+	  lhs = new_name;
+	}
+      else
+	lhs = rhs;
     }
   return lhs;
 }
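
To illustrate the change (a hand-written sketch, assuming a two-lane
double vector and a NULL initial LHS; the real expansion emits GIMPLE,
extracting lanes via BIT_FIELD_REF):

/* Scalar shape of what vect_expand_fold_left open-codes for
   vector_rhs = {v0, v1} when lhs starts out NULL.  */
double
fold_left_sketch (double v0, double v1)
{
  double lhs = v0;  /* NULL lhs: the first lane seeds the accumulator */
  lhs = lhs + v1;   /* remaining lanes accumulate left-to-right */
  return lhs;
}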
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index f02921564c9..69e6ef2baa7 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1480,9 +1480,10 @@  dt_sort_cmp (const void *op1_, const void *op2_, void *)
   return (int)op1->code - (int)op2->code;
 }
 
-/* Linearize the associatable expression chain at START with the
-   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
-   filling CHAIN with the result and using WORKLIST as intermediate storage.
+/* Linearize the associatable (if ASSOCIATABLE is true) expression chain at
+   START with the associatable operation CODE (where PLUS_EXPR also allows
+   MINUS_EXPR), filling CHAIN with the result and using WORKLIST as
+   intermediate storage.
    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    stmts, starting with START.  */
@@ -1493,7 +1494,7 @@  vect_slp_linearize_chain (vec_info *vinfo,
 			  vec<chain_op_t> &chain,
 			  enum tree_code code, gimple *start,
 			  gimple *&code_stmt, gimple *&alt_code_stmt,
-			  vec<gimple *> *chain_stmts)
+			  vec<gimple *> *chain_stmts, bool associatable)
 {
   /* For each lane linearize the addition/subtraction (or other
      uniform associatable operation) expression tree.  */
@@ -1526,6 +1527,7 @@  vect_slp_linearize_chain (vec_info *vinfo,
 	  gimple *use_stmt;
 	  use_operand_p use_p;
 	  if (dt == vect_internal_def
+	      && (opnum == 1 || associatable)
 	      && single_imm_use (op, &use_p, &use_stmt)
 	      && is_gimple_assign (def_stmt_info->stmt)
 	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
@@ -1924,7 +1926,7 @@  vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 	  gimple *op_stmt = NULL, *other_op_stmt = NULL;
 	  vect_slp_linearize_chain (vinfo, worklist, chain, code,
 				    stmts[lane]->stmt, op_stmt, other_op_stmt,
-				    NULL);
+				    NULL, true);
 	  if (!op_stmt_info && op_stmt)
 	    op_stmt_info = vinfo->lookup_stmt (op_stmt);
 	  if (!other_op_stmt_info && other_op_stmt)
@@ -6399,17 +6401,31 @@  vectorizable_bb_reduc_epilogue (slp_instance instance,
     reduc_code = PLUS_EXPR;
   internal_fn reduc_fn;
   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
+  bool fl;
   if (!vectype
-      || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
-      || reduc_fn == IFN_LAST
-      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
-				     TREE_TYPE (vectype)))
-    return false;
+				     TREE_TYPE (vectype))
+      || (!(fl = needs_fold_left_reduction_p (TREE_TYPE (vectype), reduc_code))
+	  && (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+	      || reduc_fn == IFN_LAST
+	      || !direct_internal_fn_supported_p (reduc_fn,
+						  vectype, OPTIMIZE_FOR_BOTH)))
+      || (fl
+	  && (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype),
+			SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)))
+	      || SLP_INSTANCE_REMAIN_STMTS (instance).length () > 1)))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: basic block reduction epilogue "
+			 "operation unsupported.\n");
+      return false;
+    }
 
   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
      cost log2 vector operations plus shuffles and one extraction.  */
-  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
+  unsigned steps = (fl ? vect_nunits_for_cost (vectype)
+		    : floor_log2 (vect_nunits_for_cost (vectype)));
   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
 		    vectype, 0, vect_body);
   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
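
To make the adjusted costing concrete (a sketch assuming power-of-two
lane counts; bb_reduc_steps is an illustrative helper, not a GCC
function):

#include <stdio.h>

/* Steps charged per record_stmt_cost call above: a fold-left reduction
   pays one step per lane, a tree reduction pays floor_log2 (nunits).  */
static unsigned
bb_reduc_steps (unsigned nunits, int fold_left)
{
  return fold_left ? nunits : (unsigned) __builtin_ctz (nunits);
}

int
main (void)
{
  /* A four-lane vector: 2 steps as a tree, 4 steps in order.  */
  printf ("tree: %u, fold-left: %u\n",
          bb_reduc_steps (4, 0), bb_reduc_steps (4, 1));
  return 0;
}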
@@ -7220,13 +7236,6 @@  vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	}
       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
 	       && (associative_tree_code (code) || code == MINUS_EXPR)
-	       /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
-		  checks pessimize a two-element reduction.  PR54400.
-		  ???  In-order reduction could be handled if we only
-		  traverse one operand chain in vect_slp_linearize_chain.  */
-	       && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
-		   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-		       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
 	       /* Ops with constants at the tail can be stripped here.  */
 	       && TREE_CODE (rhs) == SSA_NAME
 	       && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
@@ -7247,17 +7256,27 @@  vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
 	  gimple *code_stmt = NULL, *alt_code_stmt = NULL;
 	  if (code == MINUS_EXPR)
 	    code = PLUS_EXPR;
+	  /* ???  For TYPE_OVERFLOW_UNDEFINED we could resort to
+	     unsigned arithmetic.  */
+	  bool associatable
+	    = !needs_fold_left_reduction_p (TREE_TYPE (rhs), code);
 	  internal_fn reduc_fn;
-	  if (!reduction_fn_for_scalar_code (code, &reduc_fn)
-	      || reduc_fn == IFN_LAST)
+	  if (associatable
+	      && (!reduction_fn_for_scalar_code (code, &reduc_fn)
+		  || reduc_fn == IFN_LAST))
 	    continue;
 	  vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
-				    /* ??? */
-				    code_stmt, alt_code_stmt, &chain_stmts);
+				    code_stmt, alt_code_stmt, &chain_stmts,
+				    associatable);
 	  if (chain.length () > 1)
 	    {
-	      /* Sort the chain according to def_type and operation.  */
-	      chain.sort (dt_sort_cmp, bb_vinfo);
+	      /* Sort the chain according to def_type and operation if
+		 allowed.  Otherwise reverse so an in-order reduction
+		 of the final vector(s) is in the correct order.  */
+	      if (associatable)
+		chain.sort (dt_sort_cmp, bb_vinfo);
+	      else
+		chain.reverse ();
 	      /* ???  Now we'd want to strip externals and constants
 		 but record those to be handled in the epilogue.  */
 	      /* ???  For now do not allow mixing ops or externs/constants.  */
@@ -9137,22 +9156,40 @@  vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
       if (reduc_code == MINUS_EXPR)
 	reduc_code = PLUS_EXPR;
       gimple_seq epilogue = NULL;
-      /* We may end up with more than one vector result, reduce them
-	 to one vector.  */
       tree vec_def = vec_defs[0];
-      for (unsigned i = 1; i < vec_defs.length (); ++i)
-	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
-				vec_def, vec_defs[i]);
-      vec_defs.release ();
-      /* ???  Support other schemes than direct internal fn.  */
-      internal_fn reduc_fn;
-      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
-	  || reduc_fn == IFN_LAST)
-	gcc_unreachable ();
-      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
-				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+      bool fl_reduction
+	= needs_fold_left_reduction_p (TREE_TYPE (TREE_TYPE (vec_def)),
+				       reduc_code);
+      /* ???  Support other schemes than direct internal fn and open-coded
+	 fold-left reduction.  */
+      tree scalar_type = TREE_TYPE (TREE_TYPE (vec_def));
+      tree scalar_def;
+      if (fl_reduction)
+	{
+	  gcc_assert (vec_defs.length () == 1);
+	  gimple_seq fl_stmts = NULL;
+	  gimple_stmt_iterator gsi = gsi_last (fl_stmts);
+	  scalar_def = vect_expand_fold_left (&gsi, scalar_type, reduc_code,
+					      NULL_TREE, vec_def);
+	  gimple_seq_add_seq (&epilogue, fl_stmts);
+	}
+      else
+	{
+	  /* We may end up with more than one vector result, reduce them
+	     to one vector.  */
+	  for (unsigned i = 1; i < vec_defs.length (); ++i)
+	    vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
+				    vec_def, vec_defs[i]);
+	  vec_defs.release ();
+	  internal_fn reduc_fn;
+	  reduction_fn_for_scalar_code (reduc_code, &reduc_fn);
+	  scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
+				     scalar_type, vec_def);
+	}
       if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
 	{
+	  if (fl_reduction)
+	    gcc_assert (SLP_INSTANCE_REMAIN_STMTS (instance).length () == 1);
 	  tree rem_def = NULL_TREE;
 	  for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
 	    if (!rem_def)
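
Tying the pieces together, a hypothetical sketch of the outcome for
the bb-slp-75 vectorizable function on a two-lane double vector target
(not actual vectorizer output):

double
vectorizable_sketch (double *x)
{
  double v0 = x[0], v1 = x[1];  /* stands in for the V2DF vector load */
  /* open-coded fold-left reduction of the single vector definition */
  double s = v0;
  s = s + v1;
  /* the single REMAIN_STMTS operand is folded in last, keeping order */
  return s + x[2];
}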
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 5987a327332..e93f3892eb0 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2347,6 +2347,8 @@  extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
 extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
+tree vect_expand_fold_left (gimple_stmt_iterator *, tree,
+			    tree_code, tree, tree);
 
 /* Drive for loop transformation stage.  */
 extern class loop *vect_transform_loop (loop_vec_info, gimple *);