[8/21] middle-end: update vectorizable_live_reduction with support for multiple exits and different exits

Message ID ZUiYLW1r3kBr9Vvh@arm.com
State Unresolved
Headers
Series None |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Tamar Christina Nov. 6, 2023, 7:39 a.m. UTC
  Hi All,

This adds support to vectorizable_live_reduction to handle multiple exits by
doing a search for which exit the live value should be materialized in.

Additionally which value in the index we're after depends on whether the exit
it's materialized in is an early exit or whether the loop's main exit is
different from the loop's natural one (i.e. the one with the same src block as
the latch).

In those two cases we want the first rather than the last value as we're going
to restart the iteration in the scalar loop.  For VLA this means we need to
reverse both the mask and vector since there's only a way to get the last
active element and not the first.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index c123398aad207082384a2079c5234033c3d825ea..55d6aee3d29151e6b528f6fdde15c693e5bdd847 100644




--
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index c123398aad207082384a2079c5234033c3d825ea..55d6aee3d29151e6b528f6fdde15c693e5bdd847 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10503,12 +10503,56 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+      /* A value can only be live in one exit.  So figure out which one.  */
+      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      /* Check if we have a loop where the chosen exit is not the main exit,
+	 in these cases for an early break we restart the iteration the vector code
+	 did.  For the live values we want the value at the start of the iteration
+	 rather than at the end.  */
+      bool inverted_ctrl_p = false;
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	    if (!is_gimple_debug (use_stmt)
+		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	      {
+		basic_block use_bb = gimple_bb (use_stmt);
+		for (auto edge : get_loop_exit_edges (loop))
+		  {
+		    /* Alternative exits can have an intermediate BB in
+		       between to update the IV.  In those cases we need to
+		       look one block further.  */
+		    if (use_bb == edge->dest
+			|| (single_succ_p (edge->dest)
+			    && use_bb == single_succ (edge->dest)))
+		      {
+			exit_e = edge;
+			goto found;
+		      }
+		  }
+	      }
+found:
+	  /* If the edge isn't a single pred then split the edge so we have a
+	     location to place the live operations.  Perhaps we should always
+	     split during IV updating.  But this way the CFG is cleaner to
+	     follow.  */
+	  inverted_ctrl_p = !vect_is_loop_exit_latch_pred (exit_e, loop);
+	  if (!single_pred_p (exit_e->dest))
+	    exit_e = single_pred_edge (split_edge (exit_e));
+
+	  /* For early exit where the exit is not in the BB that leads to the
+	     latch then we're restarting the iteration in the scalar loop. So
+	     get the first live value.  */
+	  if (inverted_ctrl_p)
+	    bitstart = build_zero_cst (TREE_TYPE (bitstart));
+	}
+
+      basic_block exit_bb = exit_e->dest;
       gcc_assert (single_pred_p (exit_bb));
 
       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
+      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
 
       gimple_seq stmts = NULL;
       tree new_tree;
@@ -10539,6 +10583,12 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 					  len, bias_minus_one);
 
+	  /* This needs to implement extraction of the first index, but not sure
+	     how the LEN stuff works.  At the moment we shouldn't get here since
+	     there's no LEN support for early breaks.  But guard this so there's
+	     no incorrect codegen.  */
+	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
 	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
 	  tree scalar_res
 	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
@@ -10563,8 +10613,37 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 					  &LOOP_VINFO_MASKS (loop_vinfo),
 					  1, vectype, 0);
 	  gimple_seq_add_seq (&stmts, tem);
-	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
+	  tree scalar_res;
+
+	  /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	     instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+	  if (inverted_ctrl_p && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	    {
+	      auto gsi_stmt = gsi_last (stmts);
+
+	       /* First create the permuted mask.  */
+	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	      tree perm_dest = copy_ssa_name (mask);
+	      gimple *perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+					   mask, perm_mask);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      mask = perm_dest;
+
+	       /* Then permute the vector contents.  */
+	      tree perm_elem = perm_mask_for_reverse (vectype);
+	      perm_dest = copy_ssa_name (vec_lhs_phi);
+	      perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+					   vec_lhs_phi, perm_elem);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      vec_lhs_phi = perm_dest;
+	    }
+
+	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				     mask, vec_lhs_phi);
 
 	  /* Convert the extracted vector element to the scalar type.  */
 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 42cebb92789247434a91cb8e74c0557e75d1ea2c..36aeca60a22cfaea8d3b43348000d75de1d525c7 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1765,7 +1765,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d8b532c4b8ca92a856368a686598859fab9d40e9..a570cf113adb8e11e5383d4ba7600bddaddbd8c4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2245,6 +2245,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  

Comments

Tamar Christina Nov. 15, 2023, 12:05 a.m. UTC | #1
Patch updated to trunk.

This adds support to vectorizable_live_reduction to handle multiple exits by
doing a search for which exit the live value should be materialized in.

Additionally which value in the index we're after depends on whether the exit
it's materialized in is an early exit or whether the loop's main exit is
different from the loop's natural one (i.e. the one with the same src block as
the latch).

In those two cases we want the first rather than the last value as we're going
to restart the iteration in the scalar loop.  For VLA this means we need to
reverse both the mask and vector since there's only a way to get the last
active element and not the first.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4cf7f65dc164db27a498b31fe7ce0d9af3f3e299..2476e59ef488fd0a3b296ced7b0d4d3e76a3634f 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10627,12 +10627,60 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+      /* A value can only be live in one exit.  So figure out which one.  */
+      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      /* Check if we have a loop where the chosen exit is not the main exit,
+	 in these cases for an early break we restart the iteration the vector code
+	 did.  For the live values we want the value at the start of the iteration
+	 rather than at the end.  */
+      bool restart_loop = false;
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	    if (!is_gimple_debug (use_stmt)
+		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	      {
+		basic_block use_bb = gimple_bb (use_stmt);
+		for (auto edge : get_loop_exit_edges (loop))
+		  {
+		    /* Alternative exits can have an intermediate BB in
+		       between to update the IV.  In those cases we need to
+		       look one block further.  */
+		    if (use_bb == edge->dest
+			|| (single_succ_p (edge->dest)
+			    && use_bb == single_succ (edge->dest)))
+		      {
+			exit_e = edge;
+			goto found;
+		      }
+		  }
+	      }
+found:
+	  /* If the edge isn't a single pred then split the edge so we have a
+	     location to place the live operations.  Perhaps we should always
+	     split during IV updating.  But this way the CFG is cleaner to
+	     follow.  */
+	  restart_loop = !vect_is_loop_exit_latch_pred (exit_e, loop);
+	  if (!single_pred_p (exit_e->dest))
+	    exit_e = single_pred_edge (split_edge (exit_e));
+
+	  /* For early exit where the exit is not in the BB that leads to the
+	     latch then we're restarting the iteration in the scalar loop. So
+	     get the first live value.  */
+	  if (restart_loop)
+	    {
+	      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+	      vec_lhs = gimple_get_lhs (vec_stmt);
+	      bitstart = build_zero_cst (TREE_TYPE (bitstart));
+	    }
+	}
+
+      basic_block exit_bb = exit_e->dest;
       gcc_assert (single_pred_p (exit_bb));
 
       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
+      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
 
       gimple_seq stmts = NULL;
       tree new_tree;
@@ -10663,6 +10711,12 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 					  len, bias_minus_one);
 
+	  /* This needs to implement extraction of the first index, but not sure
+	     how the LEN stuff works.  At the moment we shouldn't get here since
+	     there's no LEN support for early breaks.  But guard this so there's
+	     no incorrect codegen.  */
+	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
 	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
 	  tree scalar_res
 	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
@@ -10687,8 +10741,37 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 					  &LOOP_VINFO_MASKS (loop_vinfo),
 					  1, vectype, 0);
 	  gimple_seq_add_seq (&stmts, tem);
-	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
+	  tree scalar_res;
+
+	  /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	     instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+	  if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	    {
+	      auto gsi_stmt = gsi_last (stmts);
+
+	       /* First create the permuted mask.  */
+	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	      tree perm_dest = copy_ssa_name (mask);
+	      gimple *perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+					   mask, perm_mask);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      mask = perm_dest;
+
+	       /* Then permute the vector contents.  */
+	      tree perm_elem = perm_mask_for_reverse (vectype);
+	      perm_dest = copy_ssa_name (vec_lhs_phi);
+	      perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+					   vec_lhs_phi, perm_elem);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      vec_lhs_phi = perm_dest;
+	    }
+
+	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				     mask, vec_lhs_phi);
 
 	  /* Convert the extracted vector element to the scalar type.  */
 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
@@ -10708,26 +10791,36 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (stmts)
 	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-      /* Remove existing phis that copy from lhs and create copies
-	 from new_tree.  */
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
+      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
+      bool single_use = true;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
 	{
-	  gimple *phi = gsi_stmt (gsi);
-	  if ((gimple_phi_arg_def (phi, 0) == lhs))
+	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	    continue;
+
+	  gcc_assert (single_use);
+	  if (is_a <gphi *> (use_stmt)
+	      && gimple_phi_arg_def (as_a <gphi *> (use_stmt), 0) == lhs)
 	    {
+	      /* Remove existing phis that copy from lhs and create copies
+		 from new_tree.  */
+	      gphi *phi = as_a <gphi *> (use_stmt);
+	      auto gsi = gsi_for_phi (phi);
 	      remove_phi_node (&gsi, false);
 	      tree lhs_phi = gimple_phi_result (phi);
 	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
 	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
 	    }
 	  else
-	    gsi_next (&gsi);
+	    {
+	      /* Or just update the use in place if not a phi.  */
+	      use_operand_p use_p;
+	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+		SET_USE (use_p, new_tree);
+	      update_stmt (use_stmt);
+	    }
+	  single_use = false;
 	}
-
-      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
-      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
-	gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
     }
   else
     {
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 3a22bf02f5ab16ded0af61cd1d719a98b8982144..7c3d6d196e122d67f750dfef6d615aabc6c28281 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1774,7 +1774,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b9a71a0b5f5407417e8366b0df132df20c7f60aa..f261fc74b8795b4516b17155441d25baaf8c22ae 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2246,6 +2246,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  
Richard Biener Nov. 15, 2023, 1:41 p.m. UTC | #2
On Wed, 15 Nov 2023, Tamar Christina wrote:

> Patch updated to trunk.
> 
> This adds support to vectorizable_live_reduction to handle multiple exits by

vectorizable_live_operation, but I do wonder how you handle reductions?

> doing a search for which exit the live value should be materialized in.
> 
> Additinally which value in the index we're after depends on whether the exit
> it's materialized in is an early exit or whether the loop's main exit is
> different from the loop's natural one (i.e. the one with the same src block as
> the latch).
> 
> In those two cases we want the first rather than the last value as we're going
> to restart the iteration in the scalar loop.  For VLA this means we need to
> reverse both the mask and vector since there's only a way to get the last
> active element and not the first.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 4cf7f65dc164db27a498b31fe7ce0d9af3f3e299..2476e59ef488fd0a3b296ced7b0d4d3e76a3634f 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10627,12 +10627,60 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	   lhs' = new_tree;  */
>  
>        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> +      /* A value can only be live in one exit.  So figure out which one.  */

Well, a value can be live across multiple exits!

> +      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +      /* Check if we have a loop where the chosen exit is not the main exit,
> +	 in these cases for an early break we restart the iteration the vector code
> +	 did.  For the live values we want the value at the start of the iteration
> +	 rather than at the end.  */
> +      bool restart_loop = false;
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> +	    if (!is_gimple_debug (use_stmt)
> +		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +	      {

In fact when you get here you know the use is in a LC PHI.  Use
FOR_EACH_IMM_USE_FAST and you can get at the edge
via phi_arg_index_from_use and gimple_phi_arg_edge.

As said you have to process all exits the value is live on, not only
the first.

> +		basic_block use_bb = gimple_bb (use_stmt);
> +		for (auto edge : get_loop_exit_edges (loop))
> +		  {
> +		    /* Alternative exits can have an intermediate BB in
> +		       between to update the IV.  In those cases we need to
> +		       look one block further.  */
> +		    if (use_bb == edge->dest
> +			|| (single_succ_p (edge->dest)
> +			    && use_bb == single_succ (edge->dest)))
> +		      {
> +			exit_e = edge;
> +			goto found;
> +		      }
> +		  }
> +	      }
> +found:
> +	  /* If the edge isn't a single pred then split the edge so we have a
> +	     location to place the live operations.  Perhaps we should always
> +	     split during IV updating.  But this way the CFG is cleaner to
> +	     follow.  */
> +	  restart_loop = !vect_is_loop_exit_latch_pred (exit_e, loop);
> +	  if (!single_pred_p (exit_e->dest))
> +	    exit_e = single_pred_edge (split_edge (exit_e));
> +
> +	  /* For early exit where the exit is not in the BB that leads to the
> +	     latch then we're restarting the iteration in the scalar loop. So
> +	     get the first live value.  */
> +	  if (restart_loop)
> +	    {
> +	      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +	      vec_lhs = gimple_get_lhs (vec_stmt);
> +	      bitstart = build_zero_cst (TREE_TYPE (bitstart));

No, this doesn't work for SLP.  Note this also gets you the "first" live
value _after_ the vector iteration.  Btw, I fail to see why you need
to handle STMT_VINFO_LIVE at all for the early exits - this is
scalar values live _after_ all iterations of the loop, thus it's
provided by the scalar epilog that always runs when we exit the vector
loop early.

The story is different for reductions though (unless we fail to support
early breaks for those at the moment).

Richard.


> +	    }
> +	}
> +
> +      basic_block exit_bb = exit_e->dest;
>        gcc_assert (single_pred_p (exit_bb));
>  
>        tree vec_lhs_phi = copy_ssa_name (vec_lhs);
>        gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
> +      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
>  
>        gimple_seq stmts = NULL;
>        tree new_tree;
> @@ -10663,6 +10711,12 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>  					  len, bias_minus_one);
>  
> +	  /* This needs to implement extraction of the first index, but not sure
> +	     how the LEN stuff works.  At the moment we shouldn't get here since
> +	     there's no LEN support for early breaks.  But guard this so there's
> +	     no incorrect codegen.  */
> +	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> +
>  	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
>  	  tree scalar_res
>  	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> @@ -10687,8 +10741,37 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  					  &LOOP_VINFO_MASKS (loop_vinfo),
>  					  1, vectype, 0);
>  	  gimple_seq_add_seq (&stmts, tem);
> -	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -					  mask, vec_lhs_phi);
> +	  tree scalar_res;
> +
> +	  /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> +	     instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +	  if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	    {
> +	      auto gsi_stmt = gsi_last (stmts);
> +
> +	       /* First create the permuted mask.  */
> +	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> +	      tree perm_dest = copy_ssa_name (mask);
> +	      gimple *perm_stmt
> +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> +					   mask, perm_mask);
> +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> +					   &gsi_stmt);
> +	      mask = perm_dest;
> +
> +	       /* Then permute the vector contents.  */
> +	      tree perm_elem = perm_mask_for_reverse (vectype);
> +	      perm_dest = copy_ssa_name (vec_lhs_phi);
> +	      perm_stmt
> +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> +					   vec_lhs_phi, perm_elem);
> +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> +					   &gsi_stmt);
> +	      vec_lhs_phi = perm_dest;
> +	    }
> +
> +	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +				     mask, vec_lhs_phi);
>  
>  	  /* Convert the extracted vector element to the scalar type.  */
>  	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> @@ -10708,26 +10791,36 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (stmts)
>  	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
>  
> -      /* Remove existing phis that copy from lhs and create copies
> -	 from new_tree.  */
> -      gimple_stmt_iterator gsi;
> -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> +      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> +      bool single_use = true;
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
>  	{
> -	  gimple *phi = gsi_stmt (gsi);
> -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> +	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +	    continue;
> +
> +	  gcc_assert (single_use);
> +	  if (is_a <gphi *> (use_stmt)
> +	      && gimple_phi_arg_def (as_a <gphi *> (use_stmt), 0) == lhs)
>  	    {
> +	      /* Remove existing phis that copy from lhs and create copies
> +		 from new_tree.  */
> +	      gphi *phi = as_a <gphi *> (use_stmt);
> +	      auto gsi = gsi_for_phi (phi);
>  	      remove_phi_node (&gsi, false);
>  	      tree lhs_phi = gimple_phi_result (phi);
>  	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
>  	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
>  	    }
>  	  else
> -	    gsi_next (&gsi);
> +	    {
> +	      /* Or just update the use in place if not a phi.  */
> +	      use_operand_p use_p;
> +	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> +		SET_USE (use_p, new_tree);
> +	      update_stmt (use_stmt);
> +	    }
> +	  single_use = false;
>  	}
> -
> -      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> -      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> -	gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
>      }
>    else
>      {
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3a22bf02f5ab16ded0af61cd1d719a98b8982144..7c3d6d196e122d67f750dfef6d615aabc6c28281 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1774,7 +1774,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
>  /* If the target supports a permute mask that reverses the elements in
>     a vector of type VECTYPE, return that mask, otherwise return null.  */
>  
> -static tree
> +tree
>  perm_mask_for_reverse (tree vectype)
>  {
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index b9a71a0b5f5407417e8366b0df132df20c7f60aa..f261fc74b8795b4516b17155441d25baaf8c22ae 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2246,6 +2246,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
>  				enum vect_def_type *,
>  				tree *, stmt_vec_info * = NULL);
>  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> +extern tree perm_mask_for_reverse (tree);
>  extern bool supportable_widening_operation (vec_info*, code_helper,
>  					    stmt_vec_info, tree, tree,
>  					    code_helper*, code_helper*,
>
  
Tamar Christina Nov. 15, 2023, 2:26 p.m. UTC | #3
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, November 15, 2023 1:42 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 8/21]middle-end: update vectorizable_live_reduction
> with support for multiple exits and different exits
> 
> On Wed, 15 Nov 2023, Tamar Christina wrote:
> 
> > Patch updated to trunk.
> >
> > This adds support to vectorizable_live_reduction to handle multiple
> > exits by
> 
> vectorizable_live_operation, but I do wonder how you handle reductions?

In the testcases I have reductions all seem to work fine, since reductions are
placed in the merge block between the two loops and always have the
"value so far from full loop iterations".  These will just be used as seed for the
scalar loop for any partial iterations.

> 
> > doing a search for which exit the live value should be materialized in.
> >
> > Additinally which value in the index we're after depends on whether
> > the exit it's materialized in is an early exit or whether the loop's
> > main exit is different from the loop's natural one (i.e. the one with
> > the same src block as the latch).
> >
> > In those two cases we want the first rather than the last value as
> > we're going to restart the iteration in the scalar loop.  For VLA this
> > means we need to reverse both the mask and vector since there's only a
> > way to get the last active element and not the first.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> >
> 4cf7f65dc164db27a498b31fe7ce0d9af3f3e299..2476e59ef488fd0a3b296c
> ed7b0d
> > 4d3e76a3634f 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -10627,12 +10627,60 @@ vectorizable_live_operation (vec_info
> *vinfo, stmt_vec_info stmt_info,
> >  	   lhs' = new_tree;  */
> >
> >        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > +      /* A value can only be live in one exit.  So figure out which
> > + one.  */
> 
> Well, a value can be live across multiple exits!

The same value can only be live across multiple early exits, no?  In which
case they'll all still be in the same block, as all the early exits end in the same
merge block.

So this code is essentially just figuring out if you're an early or normal exit.
Perhaps the comment is unclear.

> 
> > +      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > +      /* Check if we have a loop where the chosen exit is not the main exit,
> > +	 in these cases for an early break we restart the iteration the vector
> code
> > +	 did.  For the live values we want the value at the start of the iteration
> > +	 rather than at the end.  */
> > +      bool restart_loop = false;
> > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > +	{
> > +	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > +	    if (!is_gimple_debug (use_stmt)
> > +		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > +	      {
> 
> In fact when you get here you know the use is in a LC PHI.  Use
> FOR_EACH_IMM_USE_FAST and you can get at the edge via
> phi_arg_index_from_use and gimple_phi_arg_edge.
> 
> As said you have to process all exits the value is live on, not only the first.
> 
> > +		basic_block use_bb = gimple_bb (use_stmt);
> > +		for (auto edge : get_loop_exit_edges (loop))
> > +		  {
> > +		    /* Alternative exits can have an intermediate BB in
> > +		       between to update the IV.  In those cases we need to
> > +		       look one block further.  */
> > +		    if (use_bb == edge->dest
> > +			|| (single_succ_p (edge->dest)
> > +			    && use_bb == single_succ (edge->dest)))
> > +		      {
> > +			exit_e = edge;
> > +			goto found;
> > +		      }
> > +		  }
> > +	      }
> > +found:
> > +	  /* If the edge isn't a single pred then split the edge so we have a
> > +	     location to place the live operations.  Perhaps we should always
> > +	     split during IV updating.  But this way the CFG is cleaner to
> > +	     follow.  */
> > +	  restart_loop = !vect_is_loop_exit_latch_pred (exit_e, loop);
> > +	  if (!single_pred_p (exit_e->dest))
> > +	    exit_e = single_pred_edge (split_edge (exit_e));
> > +
> > +	  /* For early exit where the exit is not in the BB that leads to the
> > +	     latch then we're restarting the iteration in the scalar loop. So
> > +	     get the first live value.  */
> > +	  if (restart_loop)
> > +	    {
> > +	      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> > +	      vec_lhs = gimple_get_lhs (vec_stmt);
> > +	      bitstart = build_zero_cst (TREE_TYPE (bitstart));
> 
> No, this doesn't work for SLP.  Note this also gets you the "first" live value
> _after_ the vector iteration.

Yes we're after the first value for a full vector iteration.  In the initial iteration
the seed vector is always started from the initial value of the PHI node, no?

> Btw, I fail to see why you need to handle
> STMT_VINFO_LIVE at all for the early exits - this is scalar values live _after_ all
> iterations of the loop, thus it's provided by the scalar epilog that always runs
> when we exit the vector loop early.

In the patch of last year I basically exited here with return true, and did not bother
vectorizing them at all and instead adjusted them during the vect_update_ivs_after_vectorizer
just like we normally would..  But you didn't seem to like that approach.

If we take that approach again then the only thing needing to be changed here is
to ignore the live operations inside an early exit block.

The reason they appear is that if you have something like

If (foo)
  return I;

when we redirect the edge `i` ends up in the block between the two loops, and `i` is also
the loop counter.

Would you prefer I use last year's approach instead? I.e. just ignore them and recalculate
any loop IVs as normal?

Thanks,
Tamar

> 
> The story is different for reductions though (unless we fail to support early
> breaks for those at the moment).
> 
> Richard.
> 
> 
> > +	    }
> > +	}
> > +
> > +      basic_block exit_bb = exit_e->dest;
> >        gcc_assert (single_pred_p (exit_bb));
> >
> >        tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> >        gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> > -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx,
> vec_lhs);
> > +      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
> >
> >        gimple_seq stmts = NULL;
> >        tree new_tree;
> > @@ -10663,6 +10711,12 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >  	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> >  					  len, bias_minus_one);
> >
> > +	  /* This needs to implement extraction of the first index, but not sure
> > +	     how the LEN stuff works.  At the moment we shouldn't get here
> since
> > +	     there's no LEN support for early breaks.  But guard this so there's
> > +	     no incorrect codegen.  */
> > +	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> > +
> >  	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> >  	  tree scalar_res
> >  	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> @@
> > -10687,8 +10741,37 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >  					  &LOOP_VINFO_MASKS (loop_vinfo),
> >  					  1, vectype, 0);
> >  	  gimple_seq_add_seq (&stmts, tem);
> > -	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
> scalar_type,
> > -					  mask, vec_lhs_phi);
> > +	  tree scalar_res;
> > +
> > +	  /* For an inverted control flow with early breaks we want
> EXTRACT_FIRST
> > +	     instead of EXTRACT_LAST.  Emulate by reversing the vector and
> mask. */
> > +	  if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > +	    {
> > +	      auto gsi_stmt = gsi_last (stmts);
> > +
> > +	       /* First create the permuted mask.  */
> > +	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> > +	      tree perm_dest = copy_ssa_name (mask);
> > +	      gimple *perm_stmt
> > +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> > +					   mask, perm_mask);
> > +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> > +					   &gsi_stmt);
> > +	      mask = perm_dest;
> > +
> > +	       /* Then permute the vector contents.  */
> > +	      tree perm_elem = perm_mask_for_reverse (vectype);
> > +	      perm_dest = copy_ssa_name (vec_lhs_phi);
> > +	      perm_stmt
> > +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> vec_lhs_phi,
> > +					   vec_lhs_phi, perm_elem);
> > +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> > +					   &gsi_stmt);
> > +	      vec_lhs_phi = perm_dest;
> > +	    }
> > +
> > +	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > +				     mask, vec_lhs_phi);
> >
> >  	  /* Convert the extracted vector element to the scalar type.  */
> >  	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res); @@
> > -10708,26 +10791,36 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >        if (stmts)
> >  	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> >
> > -      /* Remove existing phis that copy from lhs and create copies
> > -	 from new_tree.  */
> > -      gimple_stmt_iterator gsi;
> > -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> > +      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> > +      bool single_use = true;
> > +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> >  	{
> > -	  gimple *phi = gsi_stmt (gsi);
> > -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> > +	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > +	    continue;
> > +
> > +	  gcc_assert (single_use);
> > +	  if (is_a <gphi *> (use_stmt)
> > +	      && gimple_phi_arg_def (as_a <gphi *> (use_stmt), 0) == lhs)
> >  	    {
> > +	      /* Remove existing phis that copy from lhs and create copies
> > +		 from new_tree.  */
> > +	      gphi *phi = as_a <gphi *> (use_stmt);
> > +	      auto gsi = gsi_for_phi (phi);
> >  	      remove_phi_node (&gsi, false);
> >  	      tree lhs_phi = gimple_phi_result (phi);
> >  	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> >  	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> >  	    }
> >  	  else
> > -	    gsi_next (&gsi);
> > +	    {
> > +	      /* Or just update the use in place if not a phi.  */
> > +	      use_operand_p use_p;
> > +	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > +		SET_USE (use_p, new_tree);
> > +	      update_stmt (use_stmt);
> > +	    }
> > +	  single_use = false;
> >  	}
> > -
> > -      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> > -      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > -	gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
> >      }
> >    else
> >      {
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> >
> 3a22bf02f5ab16ded0af61cd1d719a98b8982144..7c3d6d196e122d67f750
> dfef6d61
> > 5aabc6c28281 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -1774,7 +1774,7 @@ compare_step_with_zero (vec_info *vinfo,
> > stmt_vec_info stmt_info)
> >  /* If the target supports a permute mask that reverses the elements in
> >     a vector of type VECTYPE, return that mask, otherwise return null.
> > */
> >
> > -static tree
> > +tree
> >  perm_mask_for_reverse (tree vectype)
> >  {
> >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); diff --git
> > a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> >
> b9a71a0b5f5407417e8366b0df132df20c7f60aa..f261fc74b8795b4516b17
> 155441d
> > 25baaf8c22ae 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2246,6 +2246,7 @@ extern bool vect_is_simple_use (vec_info *,
> stmt_vec_info, slp_tree,
> >  				enum vect_def_type *,
> >  				tree *, stmt_vec_info * = NULL);
> >  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> > +extern tree perm_mask_for_reverse (tree);
> >  extern bool supportable_widening_operation (vec_info*, code_helper,
> >  					    stmt_vec_info, tree, tree,
> >  					    code_helper*, code_helper*,
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
  
Richard Biener Nov. 16, 2023, 11:16 a.m. UTC | #4
On Wed, 15 Nov 2023, Tamar Christina wrote:

> 
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, November 15, 2023 1:42 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH 8/21]middle-end: update vectorizable_live_reduction
> > with support for multiple exits and different exits
> > 
> > On Wed, 15 Nov 2023, Tamar Christina wrote:
> > 
> > > Patch updated to trunk.
> > >
> > > This adds support to vectorizable_live_reduction to handle multiple
> > > exits by
> > 
> > vectorizable_live_operation, but I do wonder how you handle reductions?
> 
> In the testcases I have reductions all seem to work fine, since reductions are
> Placed in the merge block between the two loops and always have the
> "value so far from full loop iterations".

Is that so?  A simple testcase shows

  <bb 3> [local count: 1063004408]:
  # sum_9 = PHI <sum_6(5), 0.0(2)>
  # i_11 = PHI <i_7(5), 0(2)>
  # ivtmp_8 = PHI <ivtmp_4(5), 1024(2)>
  # vect_sum_9.4_3 = PHI <vect_sum_6.8_14(5), { 0.0, 0.0 }(2)>
  # vectp_x.5_2 = PHI <vectp_x.5_12(5), &x(2)>
  # ivtmp_17 = PHI <ivtmp_18(5), 0(2)>
  vect__1.7_13 = MEM <vector(2) double> [(double *)vectp_x.5_2];
  _1 = x[i_11];
  vect_sum_6.8_14 = vect__1.7_13 + vect_sum_9.4_3;
  sum_6 = _1 + sum_9;
  i_7 = i_11 + 1;
  ivtmp_4 = ivtmp_8 - 1;
  vectp_x.5_12 = vectp_x.5_2 + 16;
  ivtmp_18 = ivtmp_17 + 1;
  if (ivtmp_18 < 511)
    goto <bb 5>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 5> [local count: 1052266995]:
  goto <bb 3>; [100.00%]

  <bb 4> [local count: 10737416]:
  # sum_10 = PHI <sum_6(3)>
  # vect_sum_6.8_15 = PHI <vect_sum_6.8_14(3)>
  _16 = .REDUC_PLUS (vect_sum_6.8_15);

so we're using the updated value vect_sum_6.8_14, not the
start value vect_sum_9.4_3 to compute the result.  For an
early exit I would have expected we need to do .REDUC_PLUS
on vect_sum_9.4_3 instead?

What I see when doing an experiment on your branch is that
you keep the scalar reduction variable live in the vectorized
loop, resulting in wrong code as that will no longer compute
the sum of all elements but just the first N (the IV increment
will also not be adjusted).

double x[1024];
int a[1024];
double __attribute__((noipa)) foo  ()
{
  double sum = 0.0;
  for (int i = 0 ; i < 1023; ++i)
    {
      sum += x[i];
      if (a[i])
        break;
    }
  return sum;
}

int main()
{
  for (int i = 0; i < 1024; ++i)
    x[i] = i;
  a[19] = 1;
  if (foo () != 190.)
    __builtin_abort ();
  return 0;
}

is miscompiled on x86_64 with -msse4.1 -Ofast because of that. 

>  These will just be used as seed for the
> Scalar loop for any partial iterations.


> > 
> > > doing a search for which exit the live value should be materialized in.
> > >
> > > Additinally which value in the index we're after depends on whether
> > > the exit it's materialized in is an early exit or whether the loop's
> > > main exit is different from the loop's natural one (i.e. the one with
> > > the same src block as the latch).
> > >
> > > In those two cases we want the first rather than the last value as
> > > we're going to restart the iteration in the scalar loop.  For VLA this
> > > means we need to reverse both the mask and vector since there's only a
> > > way to get the last active element and not the first.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> > >
> > 4cf7f65dc164db27a498b31fe7ce0d9af3f3e299..2476e59ef488fd0a3b296c
> > ed7b0d
> > > 4d3e76a3634f 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -10627,12 +10627,60 @@ vectorizable_live_operation (vec_info
> > *vinfo, stmt_vec_info stmt_info,
> > >  	   lhs' = new_tree;  */
> > >
> > >        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > > -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > +      /* A value can only be live in one exit.  So figure out which
> > > + one.  */
> > 
> > Well, a value can be live across multiple exits!
> 
> The same value can only be live across multiple early exits no?  In which
> case they'll all still be in the same block as all the early exits end in the same
> merge block.

A value can be live on the normal exit as well though we wouldn't
need its value there.  I think besides reductions we advance
all inductions in vect_update_ivs_after_vectorizer, we don't
have code to "continue" an induction in the epilogue.

> So this code is essentially just figuring out if you're an early or normal exit.
> Perhaps the comment is unclear..
> 
> > 
> > > +      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > > +      /* Check if we have a loop where the chosen exit is not the main exit,
> > > +	 in these cases for an early break we restart the iteration the vector
> > code
> > > +	 did.  For the live values we want the value at the start of the iteration
> > > +	 rather than at the end.  */
> > > +      bool restart_loop = false;
> > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > +	{
> > > +	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > > +	    if (!is_gimple_debug (use_stmt)
> > > +		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > > +	      {
> > 
> > In fact when you get here you know the use is in a LC PHI.  Use
> > FOR_EACH_IMM_USE_FAST and you can get at the edge via
> > phi_arg_index_from_use and gimple_phi_arg_edge.
> > 
> > As said you have to process all exits the value is live on, not only the first.
> > 
> > > +		basic_block use_bb = gimple_bb (use_stmt);
> > > +		for (auto edge : get_loop_exit_edges (loop))
> > > +		  {
> > > +		    /* Alternative exits can have an intermediate BB in
> > > +		       between to update the IV.  In those cases we need to
> > > +		       look one block further.  */
> > > +		    if (use_bb == edge->dest
> > > +			|| (single_succ_p (edge->dest)
> > > +			    && use_bb == single_succ (edge->dest)))
> > > +		      {
> > > +			exit_e = edge;
> > > +			goto found;
> > > +		      }
> > > +		  }
> > > +	      }
> > > +found:
> > > +	  /* If the edge isn't a single pred then split the edge so we have a
> > > +	     location to place the live operations.  Perhaps we should always
> > > +	     split during IV updating.  But this way the CFG is cleaner to
> > > +	     follow.  */
> > > +	  restart_loop = !vect_is_loop_exit_latch_pred (exit_e, loop);
> > > +	  if (!single_pred_p (exit_e->dest))
> > > +	    exit_e = single_pred_edge (split_edge (exit_e));
> > > +
> > > +	  /* For early exit where the exit is not in the BB that leads to the
> > > +	     latch then we're restarting the iteration in the scalar loop. So
> > > +	     get the first live value.  */
> > > +	  if (restart_loop)
> > > +	    {
> > > +	      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> > > +	      vec_lhs = gimple_get_lhs (vec_stmt);
> > > +	      bitstart = build_zero_cst (TREE_TYPE (bitstart));
> > 
> > No, this doesn't work for SLP.  Note this also gets you the "first" live value
> > _after_ the vector iteration.
> 
> Yes we're after the first value for a full vector iteration.  In the initial iteration that
> value the seed vector is always started from the initial value of the PHI node no?
> 
> > Btw, I fail to see why you need to handle
> > STMT_VINFO_LIVE at all for the early exits - this is scalar values live _after_ all
> > iterations of the loop, thus it's provided by the scalar epilog that always runs
> > when we exit the vector loop early.
> 
> In the patch of last year I basically exited here with return true, and did not bother
> vectorizing them at all and instead adjusted them during the vect_update_ivs_after_vectorizer
> just like we normally would..  But you didn't seem to like that approach.
> 
> If we take that approach again then the only thing needing to be changed here is
> to ignore the live operations inside an early exit block.

I think that's the appropriate approach for any exit that will always
lead to an epilogue loop (we're only creating dead code).  I wonder
what will happen if you just leave it alone, just handling the
main IV edge as before?

> The reason they appear is that if you have something like
> 
> If (foo)
>   return I;
> 
> when we redirect the edge `i` ends up in the block between the two loops, and I Is also
> the loop counter.
> 
> Would you prefer I use last year's approach instead? i.e. just ignore them and recalculate
> Any loop IVs as normal?

Yes.

Richard.

> 
> Thanks,
> Tamar
> 
> > 
> > The story is different for reductions though (unless we fail to support early
> > breaks for those at the moment).
> > 
> > Richard.
> > 
> > 
> > > +	    }
> > > +	}
> > > +
> > > +      basic_block exit_bb = exit_e->dest;
> > >        gcc_assert (single_pred_p (exit_bb));
> > >
> > >        tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> > >        gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> > > -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx,
> > vec_lhs);
> > > +      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
> > >
> > >        gimple_seq stmts = NULL;
> > >        tree new_tree;
> > > @@ -10663,6 +10711,12 @@ vectorizable_live_operation (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> > >  	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > >  					  len, bias_minus_one);
> > >
> > > +	  /* This needs to implement extraction of the first index, but not sure
> > > +	     how the LEN stuff works.  At the moment we shouldn't get here
> > since
> > > +	     there's no LEN support for early breaks.  But guard this so there's
> > > +	     no incorrect codegen.  */
> > > +	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> > > +
> > >  	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> > >  	  tree scalar_res
> > >  	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > @@
> > > -10687,8 +10741,37 @@ vectorizable_live_operation (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> > >  					  &LOOP_VINFO_MASKS (loop_vinfo),
> > >  					  1, vectype, 0);
> > >  	  gimple_seq_add_seq (&stmts, tem);
> > > -	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
> > scalar_type,
> > > -					  mask, vec_lhs_phi);
> > > +	  tree scalar_res;
> > > +
> > > +	  /* For an inverted control flow with early breaks we want
> > EXTRACT_FIRST
> > > +	     instead of EXTRACT_LAST.  Emulate by reversing the vector and
> > mask. */
> > > +	  if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > > +	    {
> > > +	      auto gsi_stmt = gsi_last (stmts);
> > > +
> > > +	       /* First create the permuted mask.  */
> > > +	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> > > +	      tree perm_dest = copy_ssa_name (mask);
> > > +	      gimple *perm_stmt
> > > +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> > > +					   mask, perm_mask);
> > > +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> > > +					   &gsi_stmt);
> > > +	      mask = perm_dest;
> > > +
> > > +	       /* Then permute the vector contents.  */
> > > +	      tree perm_elem = perm_mask_for_reverse (vectype);
> > > +	      perm_dest = copy_ssa_name (vec_lhs_phi);
> > > +	      perm_stmt
> > > +		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> > vec_lhs_phi,
> > > +					   vec_lhs_phi, perm_elem);
> > > +	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> > > +					   &gsi_stmt);
> > > +	      vec_lhs_phi = perm_dest;
> > > +	    }
> > > +
> > > +	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > > +				     mask, vec_lhs_phi);
> > >
> > >  	  /* Convert the extracted vector element to the scalar type.  */
> > >  	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res); @@
> > > -10708,26 +10791,36 @@ vectorizable_live_operation (vec_info *vinfo,
> > stmt_vec_info stmt_info,
> > >        if (stmts)
> > >  	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> > >
> > > -      /* Remove existing phis that copy from lhs and create copies
> > > -	 from new_tree.  */
> > > -      gimple_stmt_iterator gsi;
> > > -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> > > +      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> > > +      bool single_use = true;
> > > +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > >  	{
> > > -	  gimple *phi = gsi_stmt (gsi);
> > > -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> > > +	  if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > > +	    continue;
> > > +
> > > +	  gcc_assert (single_use);
> > > +	  if (is_a <gphi *> (use_stmt)
> > > +	      && gimple_phi_arg_def (as_a <gphi *> (use_stmt), 0) == lhs)
> > >  	    {
> > > +	      /* Remove existing phis that copy from lhs and create copies
> > > +		 from new_tree.  */
> > > +	      gphi *phi = as_a <gphi *> (use_stmt);
> > > +	      auto gsi = gsi_for_phi (phi);
> > >  	      remove_phi_node (&gsi, false);
> > >  	      tree lhs_phi = gimple_phi_result (phi);
> > >  	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > >  	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > >  	    }
> > >  	  else
> > > -	    gsi_next (&gsi);
> > > +	    {
> > > +	      /* Or just update the use in place if not a phi.  */
> > > +	      use_operand_p use_p;
> > > +	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > > +		SET_USE (use_p, new_tree);
> > > +	      update_stmt (use_stmt);
> > > +	    }
> > > +	  single_use = false;
> > >  	}
> > > -
> > > -      /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> > > -      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > > -	gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
> > >      }
> > >    else
> > >      {
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > >
> > 3a22bf02f5ab16ded0af61cd1d719a98b8982144..7c3d6d196e122d67f750
> > dfef6d61
> > > 5aabc6c28281 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -1774,7 +1774,7 @@ compare_step_with_zero (vec_info *vinfo,
> > > stmt_vec_info stmt_info)
> > >  /* If the target supports a permute mask that reverses the elements in
> > >     a vector of type VECTYPE, return that mask, otherwise return null.
> > > */
> > >
> > > -static tree
> > > +tree
> > >  perm_mask_for_reverse (tree vectype)
> > >  {
> > >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); diff --git
> > > a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> > >
> > b9a71a0b5f5407417e8366b0df132df20c7f60aa..f261fc74b8795b4516b17
> > 155441d
> > > 25baaf8c22ae 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -2246,6 +2246,7 @@ extern bool vect_is_simple_use (vec_info *,
> > stmt_vec_info, slp_tree,
> > >  				enum vect_def_type *,
> > >  				tree *, stmt_vec_info * = NULL);
> > >  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> > > +extern tree perm_mask_for_reverse (tree);
> > >  extern bool supportable_widening_operation (vec_info*, code_helper,
> > >  					    stmt_vec_info, tree, tree,
> > >  					    code_helper*, code_helper*,
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
>
  
Tamar Christina Nov. 20, 2023, 9:57 p.m. UTC | #5
Good morning,

Here's the respun patch,  as discussed we now use reductions and inductions rather than scalar values:

This adds support to vectorizable_live_reduction to handle multiple exits by
doing a search for which exit the live value should be materialized in.

Additionally which value in the index we're after depends on whether the exit
it's materialized in is an early exit or whether the loop's main exit is
different from the loop's natural one (i.e. the one with the same src block as
the latch).

In those two cases we want the first rather than the last value as we're going
to restart the iteration in the scalar loop.  For VLA this means we need to
reverse both the mask and vector since there's only a way to get the last
active element and not the first.

For inductions and multiple exits:
  - we test if the target will support vectorizing the induction
  - mark all inductions in the loop as relevant
  - emit code for non-live inductions during codegen
  - induction during an early exit gets the first element rather than last.

For reductions and multiple exits:
  - Reductions for early exits reduces the reduction definition statement
    rather than the reduction step.  This allows us to get the value at the
    start of the iteration.
  - The peeling layout means that we just have to update one block, the merge
    block.  We expect all the reductions to be the same but we leave it up to
    the value numbering to clean up any duplicate code as we iterate over all
    edges.

These two changes fix the reduction codegen given before which has been added
to the testsuite for early vect.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
	(vect_analyze_loop_operations): Check if target supports vectorizing IV.
	(vect_transform_loop): Call vectorizable_live_operation for non-live
	inductions or reductions.
	(find_connected_edge, vectorizable_live_operation_1): New.
	(vect_create_epilog_for_reduction): Support reductions in early break.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	(vect_stmt_relevant_p): Mark all inductions when early break as being
	relevant.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
	(vect_iv_increment_position): New.
	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 139311142b376d4eaa7ef8765608220b1eb92b31..af216d3dcb8a502639898ff67cb86948a7f140a4 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
    INSERT_AFTER is set to true if the increment should be inserted after
    *BSI.  */
 
-static void
+void
 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
 			    bool *insert_after)
 {
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b1c34c4c3aaf8bdf9bf52d5a726836936de772b6 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
 					      -1, false, &cost_vec);
 
+	  /* Check if we can perform the operation for early break if we force
+	     the live operation.  */
+	  if (ok
+	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+	      && !STMT_VINFO_LIVE_P (stmt_info)
+	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
+					      -1, false, &cost_vec);
+
           if (!ok)
 	    return opt_result::failure_at (phi,
 					   "not vectorized: relevant phi not "
@@ -5842,6 +5851,10 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
      (counting from 0)
+   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
+     exit this edge is always the main loop exit.
+   MAIN_EXIT_P indicates whether we are updating the main exit or an alternate
+     exit.  This determines whether we use the final or original value.
 
    This function:
    1. Completes the reduction def-use cycles.
@@ -5882,7 +5895,9 @@ static void
 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 				  stmt_vec_info stmt_info,
 				  slp_tree slp_node,
-				  slp_instance slp_node_instance)
+				  slp_instance slp_node_instance,
+				  edge loop_exit,
+				  bool main_exit_p = true)
 {
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -6053,7 +6068,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Create an induction variable.  */
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
 		 insert_after, &indx_before_incr, &indx_after_incr);
 
@@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
          Store them in NEW_PHIS.  */
   if (double_reduc)
     loop = outer_loop;
-  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+  /* We need to reduce values in all exits.  */
+  exit_bb = loop_exit->dest;
   exit_gsi = gsi_after_labels (exit_bb);
   reduc_inputs.create (slp_node ? vec_num : ncopies);
+  vec <gimple *> vec_stmts;
+  if (main_exit_p)
+    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);
+  else
+    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (rdef_info));
+
   for (unsigned i = 0; i < vec_num; i++)
     {
       gimple_seq stmts = NULL;
       if (slp_node)
 	def = vect_get_slp_vect_def (slp_node, i);
       else
-	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
+	def = gimple_get_lhs (vec_stmts[0]);
       for (j = 0; j < ncopies; j++)
 	{
 	  tree new_def = copy_ssa_name (def);
 	  phi = create_phi_node (new_def, exit_bb);
 	  if (j)
-	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
+	    def = gimple_get_lhs (vec_stmts[j]);
+	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
 	  new_def = gimple_convert (&stmts, vectype, new_def);
 	  reduc_inputs.quick_push (new_def);
 	}
@@ -6885,7 +6907,20 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-		SET_USE (use_p, scalar_result);
+		{
+		  gimple *stmt = USE_STMT (use_p);
+		  if (main_exit_p)
+		    SET_USE (use_p, scalar_result);
+		  else if (is_a <gphi *> (stmt))
+		    {
+		      /* If an early exit only update usages in the merge
+			 block.  */
+		      edge merge_e = single_succ_edge (loop_exit->dest);
+		      if (gimple_bb (stmt) != merge_e->dest)
+			continue;
+		      SET_PHI_ARG_DEF (stmt, merge_e->dest_idx, scalar_result);
+		    }
+		}
 	      update_stmt (use_stmt);
 	    }
         }
@@ -10481,6 +10516,156 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* Function vectorizable_live_operation_1.
+
+   Helper function for vectorizable_live_operation.  */
+
+tree
+vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
+			       stmt_vec_info stmt_info, edge exit_e,
+			       tree vectype, int ncopies, slp_tree slp_node,
+			       tree bitsize, tree bitstart, tree vec_lhs,
+			       tree lhs_type, bool restart_loop,
+			       gimple_stmt_iterator *exit_gsi)
+{
+  basic_block exit_bb = exit_e->dest;
+  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
+    SET_PHI_ARG_DEF (phi, i, vec_lhs);
+
+  gimple_seq stmts = NULL;
+  tree new_tree;
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      /* Emit:
+
+	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	 where VEC_LHS is the vectorized live-out result and MASK is
+	 the loop mask for the final iteration.  */
+      gcc_assert (ncopies == 1 && !slp_node);
+      gimple_seq tem = NULL;
+      gimple_stmt_iterator gsi = gsi_last (tem);
+      tree len = vect_get_loop_len (loop_vinfo, &gsi,
+				    &LOOP_VINFO_LENS (loop_vinfo),
+				    1, vectype, 0, 0);
+
+      /* BIAS - 1.  */
+      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+      tree bias_minus_one
+	= int_const_binop (MINUS_EXPR,
+			   build_int_cst (TREE_TYPE (len), biasval),
+			   build_one_cst (TREE_TYPE (len)));
+
+      /* LAST_INDEX = LEN + (BIAS - 1).  */
+      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+				     len, bias_minus_one);
+
+      /* This needs to implement extraction of the first index, but not sure
+	 how the LEN stuff works.  At the moment we shouldn't get here since
+	 there's no LEN support for early breaks.  But guard this so there's
+	 no incorrect codegen.  */
+      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+      tree scalar_res
+	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			vec_lhs_phi, last_index);
+
+      /* Convert the extracted vector element to the scalar type.  */
+      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+    }
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Emit:
+
+	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
+	 where VEC_LHS is the vectorized live-out result and MASK is
+	 the loop mask for the final iteration.  */
+      gcc_assert (!slp_node);
+      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+      gimple_seq tem = NULL;
+      gimple_stmt_iterator gsi = gsi_last (tem);
+      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
+				      &LOOP_VINFO_MASKS (loop_vinfo),
+				      1, vectype, 0);
+      tree scalar_res;
+
+      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  /* First create the permuted mask.  */
+	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	  tree perm_dest = copy_ssa_name (mask);
+	  gimple *perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+				       mask, perm_mask);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  mask = perm_dest;
+
+	  /* Then permute the vector contents.  */
+	  tree perm_elem = perm_mask_for_reverse (vectype);
+	  perm_dest = copy_ssa_name (vec_lhs_phi);
+	  perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+				       vec_lhs_phi, perm_elem);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  vec_lhs_phi = perm_dest;
+	}
+
+      gimple_seq_add_seq (&stmts, tem);
+
+      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				 mask, vec_lhs_phi);
+
+      /* Convert the extracted vector element to the scalar type.  */
+      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+    }
+  else
+    {
+      tree bftype = TREE_TYPE (vectype);
+      if (VECTOR_BOOLEAN_TYPE_P (vectype))
+	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+				       &stmts, true, NULL_TREE);
+    }
+
+  *exit_gsi = gsi_after_labels (exit_bb);
+  if (stmts)
+    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
+
+  return new_tree;
+}
+
+/* Find the edge that's the final one in the path from SRC to DEST and
+   return it.  There must be at most one forwarder edge between SRC and DEST.  */
+
+static edge
+find_connected_edge (edge src, basic_block dest)
+{
+   if (src->dest == dest)
+     return src;
+
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, dest->preds)
+    {
+      if (src->dest == e->src)
+	return e;
+    }
+
+  return NULL;
+}
+
 /* Function vectorizable_live_operation.
 
    STMT_INFO computes a value that is used outside the loop.  Check if
@@ -10505,7 +10690,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   int vec_entry = 0;
   poly_uint64 vec_index = 0;
 
-  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
+  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
+	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
 
   /* If a stmt of a reduction is live, vectorize it via
      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
@@ -10530,8 +10716,22 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
 	return true;
+
+      /* If early break we only have to materialize the reduction on the merge
+	 block, but we have to find an alternate exit first.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
+	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
+	      vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
+						slp_node, slp_node_instance,
+						exit, false);
+	}
+
       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
-					slp_node_instance);
+					slp_node_instance,
+					LOOP_VINFO_IV_EXIT (loop_vinfo));
+
       return true;
     }
 
@@ -10683,103 +10883,62 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-      gcc_assert (single_pred_p (exit_bb));
-
-      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
-
-      gimple_seq stmts = NULL;
-      tree new_tree;
-      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
-	{
-	  /* Emit:
-
-	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
-
-	     where VEC_LHS is the vectorized live-out result and MASK is
-	     the loop mask for the final iteration.  */
-	  gcc_assert (ncopies == 1 && !slp_node);
-	  gimple_seq tem = NULL;
-	  gimple_stmt_iterator gsi = gsi_last (tem);
-	  tree len
-	    = vect_get_loop_len (loop_vinfo, &gsi,
-				 &LOOP_VINFO_LENS (loop_vinfo),
-				 1, vectype, 0, 0);
-
-	  /* BIAS - 1.  */
-	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-	  tree bias_minus_one
-	    = int_const_binop (MINUS_EXPR,
-			       build_int_cst (TREE_TYPE (len), biasval),
-			       build_one_cst (TREE_TYPE (len)));
-
-	  /* LAST_INDEX = LEN + (BIAS - 1).  */
-	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
-					  len, bias_minus_one);
-
-	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
-	  tree scalar_res
-	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
-			    vec_lhs_phi, last_index);
-
-	  /* Convert the extracted vector element to the scalar type.  */
-	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
-	}
-      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-	{
-	  /* Emit:
-
-	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
-
-	     where VEC_LHS is the vectorized live-out result and MASK is
-	     the loop mask for the final iteration.  */
-	  gcc_assert (ncopies == 1 && !slp_node);
-	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-	  gimple_seq tem = NULL;
-	  gimple_stmt_iterator gsi = gsi_last (tem);
-	  tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
-					  &LOOP_VINFO_MASKS (loop_vinfo),
-					  1, vectype, 0);
-	  gimple_seq_add_seq (&stmts, tem);
-	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
-
-	  /* Convert the extracted vector element to the scalar type.  */
-	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
-	}
-      else
-	{
-	  tree bftype = TREE_TYPE (vectype);
-	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
-	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-	  new_tree = build3 (BIT_FIELD_REF, bftype,
-			     vec_lhs_phi, bitsize, bitstart);
-	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
-					   &stmts, true, NULL_TREE);
-	}
+      /* Check if we have a loop where the chosen exit is not the main exit.
+	 In these cases for an early break we restart the iteration that the
+	 vector code performed.  For the live values we want the value at the
+	 start of the iteration rather than at the end.  */
+      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      bool restart_loop = !vect_is_loop_exit_latch_pred (main_e, loop);
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	if (!is_gimple_debug (use_stmt)
+	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	  {
+	    basic_block use_bb = gimple_bb (use_stmt);
+	    if (!is_a <gphi *> (use_stmt))
+	      continue;
+	    for (auto exit_e : get_loop_exit_edges (loop))
+	      {
+		/* See if this exit leads to the value.  */
+		edge dest_e = find_connected_edge (exit_e, use_bb);
+		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
+		  continue;
 
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      if (stmts)
-	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+		gimple *tmp_vec_stmt = vec_stmt;
+		tree tmp_vec_lhs = vec_lhs;
+		tree tmp_bitstart = bitstart;
+		/* For early exit where the exit is not in the BB that leads
+		   to the latch then we're restarting the iteration in the
+		   scalar loop.  So get the first live value.  */
+		if (restart_loop || exit_e != main_e)
+		  {
+		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
+		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
+		  }
 
-      /* Remove existing phis that copy from lhs and create copies
-	 from new_tree.  */
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
-	{
-	  gimple *phi = gsi_stmt (gsi);
-	  if ((gimple_phi_arg_def (phi, 0) == lhs))
-	    {
-	      remove_phi_node (&gsi, false);
-	      tree lhs_phi = gimple_phi_result (phi);
-	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-	    }
-	  else
-	    gsi_next (&gsi);
-	}
+		gimple_stmt_iterator exit_gsi;
+		tree new_tree
+		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
+						   exit_e, vectype, ncopies,
+						   slp_node, bitsize,
+						   tmp_bitstart, tmp_vec_lhs,
+						   lhs_type, restart_loop,
+						   &exit_gsi);
+
+		/* Use the empty block on the exit to materialize the new stmts
+		   so we can update the PHI here.  */
+		if (gimple_phi_num_args (use_stmt) == 1)
+		  {
+		    auto gsi = gsi_for_stmt (use_stmt);
+		    remove_phi_node (&gsi, false);
+		    tree lhs_phi = gimple_phi_result (use_stmt);
+		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+		  }
+		else
+		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
+	      }
+	  }
 
       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
@@ -11797,6 +11956,21 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
 	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
+	      /* If vectorizing early break we must also vectorize the use of
+		 the PHIs as a live operation.  */
+	      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		  && !STMT_VINFO_LIVE_P (stmt_info)
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+			 "----> vectorizing early break reduc or induc phi: %G",
+			 (gimple *) phi);
+		  bool done
+		    = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
+						   NULL, -1, true, NULL);
+		  gcc_assert (done);
+		}
 	    }
 	}
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index fe38beb4fa1d9f8593445354f56ba52e10a040cd..f1b6a13395f286f9997530bbe57cda3a00502f8f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
    - it has uses outside the loop.
    - it has vdefs (it alters memory).
    - control stmts in the loop (except for the exit condition).
+   - it is an induction and we have multiple exits.
 
    CHECKME: what other side effects would the vectorizer allow?  */
 
@@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	}
     }
 
+  /* Check if it's an induction and we have multiple exits.  In this case
+     there will be a use after peeling which is needed for the alternate exit.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+    {
+      if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "vec_stmt_relevant_p: induction forced for "
+			   "early break.\n");
+      *relevant = vect_used_in_scope;
+
+    }
+
   if (*live_p && *relevant == vect_unused_in_scope
       && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
     {
@@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 22a8c3d384d7ae1ca93079b64f2d40821b4a3c56..cfd6756492e4af460c2f5669ecccc82b1089cfe4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2225,6 +2225,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
 extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
+extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
@@ -2246,6 +2247,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  
Tamar Christina Nov. 24, 2023, 10:20 a.m. UTC | #6
Good morning,

This is a respun patch with a fix for VLA.

This adds support to vectorizable_live_reduction to handle multiple exits by
doing a search for which exit the live value should be materialized in.

Additionally, which value in the index we're after depends on whether the exit
it's materialized in is an early exit or whether the loop's main exit is
different from the loop's natural one (i.e. the one with the same src block as
the latch).

In those two cases we want the first rather than the last value as we're going
to restart the iteration in the scalar loop.  For VLA this means we need to
reverse both the mask and vector since there's only a way to get the last
active element and not the first.

For inductions and multiple exits:
  - we test if the target will support vectorizing the induction
  - mark all inductions in the loop as relevant
  - generate code for the non-live inductions during the loop transform
  - induction during an early exit gets the first element rather than last.

For reductions and multiple exits:
  - Reductions for early exits reduces the reduction definition statement
    rather than the reduction step.  This allows us to get the value at the
    start of the iteration.
  - The peeling layout means that we just have to update one block, the merge
    block.  We expect all the reductions to be the same but we leave it up to
    the value numbering to clean up any duplicate code as we iterate over all
    edges.

These two changes fix the reduction codegen given before which has been added
to the testsuite for early vect.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
	(vect_analyze_loop_operations): Check if target supports vectorizing IV.
	(vect_transform_loop): Call vectorizable_live_operation for non-live
	inductions or reductions.
	(find_connected_edge, vectorizable_live_operation_1): New.
	(vect_create_epilog_for_reduction): Support reductions in early break.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	(vect_stmt_relevant_p): Mark all inductions when early break as being
	relevant.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
	(vect_iv_increment_position): New.
	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401aecd4e7eaaaa9e2b8a0 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
    INSERT_AFTER is set to true if the increment should be inserted after
    *BSI.  */
 
-static void
+void
 vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
 			    bool *insert_after)
 {
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e6983382190792602cb25af1 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
 	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
 					      -1, false, &cost_vec);
 
+	  /* Check if we can perform the operation for early break if we force
+	     the live operation.  */
+	  if (ok
+	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+	      && !STMT_VINFO_LIVE_P (stmt_info)
+	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
+					      -1, false, &cost_vec);
+
           if (!ok)
 	    return opt_result::failure_at (phi,
 					   "not vectorized: relevant phi not "
@@ -5842,6 +5851,10 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
      (counting from 0)
+   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
+     exit this edge is always the main loop exit.
+   MAIN_EXIT_P indicates whether we are updating the main exit or an alternate
+     exit.  This determines whether we use the final or original value.
 
    This function:
    1. Completes the reduction def-use cycles.
@@ -5882,7 +5895,9 @@ static void
 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 				  stmt_vec_info stmt_info,
 				  slp_tree slp_node,
-				  slp_instance slp_node_instance)
+				  slp_instance slp_node_instance,
+				  edge loop_exit,
+				  bool main_exit_p = true)
 {
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -6053,7 +6068,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Create an induction variable.  */
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
 		 insert_after, &indx_before_incr, &indx_after_incr);
 
@@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
          Store them in NEW_PHIS.  */
   if (double_reduc)
     loop = outer_loop;
-  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+  /* We need to reduce values in all exits.  */
+  exit_bb = loop_exit->dest;
   exit_gsi = gsi_after_labels (exit_bb);
   reduc_inputs.create (slp_node ? vec_num : ncopies);
+  vec <gimple *> vec_stmts;
+  if (main_exit_p)
+    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);
+  else
+    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (rdef_info));
+
   for (unsigned i = 0; i < vec_num; i++)
     {
       gimple_seq stmts = NULL;
       if (slp_node)
 	def = vect_get_slp_vect_def (slp_node, i);
       else
-	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
+	def = gimple_get_lhs (vec_stmts[0]);
       for (j = 0; j < ncopies; j++)
 	{
 	  tree new_def = copy_ssa_name (def);
 	  phi = create_phi_node (new_def, exit_bb);
 	  if (j)
-	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
+	    def = gimple_get_lhs (vec_stmts[j]);
+	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
 	  new_def = gimple_convert (&stmts, vectype, new_def);
 	  reduc_inputs.quick_push (new_def);
 	}
@@ -6885,7 +6907,20 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-		SET_USE (use_p, scalar_result);
+		{
+		  gimple *stmt = USE_STMT (use_p);
+		  if (main_exit_p)
+		    SET_USE (use_p, scalar_result);
+		  else if (is_a <gphi *> (stmt))
+		    {
+		      /* For an early exit, only update usages in the merge
+			 block.  */
+		      edge merge_e = single_succ_edge (loop_exit->dest);
+		      if (gimple_bb (stmt) != merge_e->dest)
+			continue;
+		      SET_PHI_ARG_DEF (stmt, merge_e->dest_idx, scalar_result);
+		    }
+		}
 	      update_stmt (use_stmt);
 	    }
         }
@@ -10481,6 +10516,156 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* Function vectorizable_live_operation_1.
+
+   Helper function for vectorizable_live_operation.  */
+
+tree
+vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
+			       stmt_vec_info stmt_info, edge exit_e,
+			       tree vectype, int ncopies, slp_tree slp_node,
+			       tree bitsize, tree bitstart, tree vec_lhs,
+			       tree lhs_type, bool restart_loop,
+			       gimple_stmt_iterator *exit_gsi)
+{
+  basic_block exit_bb = exit_e->dest;
+  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
+    SET_PHI_ARG_DEF (phi, i, vec_lhs);
+
+  gimple_seq stmts = NULL;
+  tree new_tree;
+  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      /* Emit:
+
+	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+	 where VEC_LHS is the vectorized live-out result and MASK is
+	 the loop mask for the final iteration.  */
+      gcc_assert (ncopies == 1 && !slp_node);
+      gimple_seq tem = NULL;
+      gimple_stmt_iterator gsi = gsi_last (tem);
+      tree len = vect_get_loop_len (loop_vinfo, &gsi,
+				    &LOOP_VINFO_LENS (loop_vinfo),
+				    1, vectype, 0, 0);
+
+      /* BIAS - 1.  */
+      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+      tree bias_minus_one
+	= int_const_binop (MINUS_EXPR,
+			   build_int_cst (TREE_TYPE (len), biasval),
+			   build_one_cst (TREE_TYPE (len)));
+
+      /* LAST_INDEX = LEN + (BIAS - 1).  */
+      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+				     len, bias_minus_one);
+
+      /* This needs to implement extraction of the first index, but not sure
+	 how the LEN stuff works.  At the moment we shouldn't get here since
+	 there's no LEN support for early breaks.  But guard this so there's
+	 no incorrect codegen.  */
+      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
+      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
+      tree scalar_res
+	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+			vec_lhs_phi, last_index);
+
+      /* Convert the extracted vector element to the scalar type.  */
+      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+    }
+  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Emit:
+
+	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
+	 where VEC_LHS is the vectorized live-out result and MASK is
+	 the loop mask for the final iteration.  */
+      gcc_assert (!slp_node);
+      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+      gimple_seq tem = NULL;
+      gimple_stmt_iterator gsi = gsi_last (tem);
+      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
+				      &LOOP_VINFO_MASKS (loop_vinfo),
+				      1, vectype, 0);
+      tree scalar_res;
+
+      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  /* First create the permuted mask.  */
+	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	  tree perm_dest = copy_ssa_name (mask);
+	  gimple *perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+				       mask, perm_mask);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  mask = perm_dest;
+
+	  /* Then permute the vector contents.  */
+	  tree perm_elem = perm_mask_for_reverse (vectype);
+	  perm_dest = copy_ssa_name (vec_lhs_phi);
+	  perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+				       vec_lhs_phi, perm_elem);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  vec_lhs_phi = perm_dest;
+	}
+
+      gimple_seq_add_seq (&stmts, tem);
+
+      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				 mask, vec_lhs_phi);
+
+      /* Convert the extracted vector element to the scalar type.  */
+      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+    }
+  else
+    {
+      tree bftype = TREE_TYPE (vectype);
+      if (VECTOR_BOOLEAN_TYPE_P (vectype))
+	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+				       &stmts, true, NULL_TREE);
+    }
+
+  *exit_gsi = gsi_after_labels (exit_bb);
+  if (stmts)
+    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
+
+  return new_tree;
+}
+
+/* Find the edge that's the final one in the path from SRC to DEST and
+   return it.  There must be at most one forwarder edge between SRC and DEST.  */
+
+static edge
+find_connected_edge (edge src, basic_block dest)
+{
+   if (src->dest == dest)
+     return src;
+
+  edge e;
+  edge_iterator ei;
+
+  FOR_EACH_EDGE (e, ei, dest->preds)
+    {
+      if (src->dest == e->src)
+	return e;
+    }
+
+  return NULL;
+}
+
 /* Function vectorizable_live_operation.
 
    STMT_INFO computes a value that is used outside the loop.  Check if
@@ -10505,7 +10690,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   int vec_entry = 0;
   poly_uint64 vec_index = 0;
 
-  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
+  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
+	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
 
   /* If a stmt of a reduction is live, vectorize it via
      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
@@ -10530,8 +10716,22 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
 	return true;
+
+      /* If early break we only have to materialize the reduction on the merge
+	 block, but we have to find an alternate exit first.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
+	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
+	      vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
+						slp_node, slp_node_instance,
+						exit, false);
+	}
+
       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
-					slp_node_instance);
+					slp_node_instance,
+					LOOP_VINFO_IV_EXIT (loop_vinfo));
+
       return true;
     }
 
@@ -10683,103 +10883,63 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-      gcc_assert (single_pred_p (exit_bb));
-
-      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
-
-      gimple_seq stmts = NULL;
-      tree new_tree;
-      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
-	{
-	  /* Emit:
-
-	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
-
-	     where VEC_LHS is the vectorized live-out result and MASK is
-	     the loop mask for the final iteration.  */
-	  gcc_assert (ncopies == 1 && !slp_node);
-	  gimple_seq tem = NULL;
-	  gimple_stmt_iterator gsi = gsi_last (tem);
-	  tree len
-	    = vect_get_loop_len (loop_vinfo, &gsi,
-				 &LOOP_VINFO_LENS (loop_vinfo),
-				 1, vectype, 0, 0);
-
-	  /* BIAS - 1.  */
-	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-	  tree bias_minus_one
-	    = int_const_binop (MINUS_EXPR,
-			       build_int_cst (TREE_TYPE (len), biasval),
-			       build_one_cst (TREE_TYPE (len)));
-
-	  /* LAST_INDEX = LEN + (BIAS - 1).  */
-	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
-					  len, bias_minus_one);
-
-	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
-	  tree scalar_res
-	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
-			    vec_lhs_phi, last_index);
-
-	  /* Convert the extracted vector element to the scalar type.  */
-	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
-	}
-      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-	{
-	  /* Emit:
-
-	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
-
-	     where VEC_LHS is the vectorized live-out result and MASK is
-	     the loop mask for the final iteration.  */
-	  gcc_assert (ncopies == 1 && !slp_node);
-	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-	  gimple_seq tem = NULL;
-	  gimple_stmt_iterator gsi = gsi_last (tem);
-	  tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
-					  &LOOP_VINFO_MASKS (loop_vinfo),
-					  1, vectype, 0);
-	  gimple_seq_add_seq (&stmts, tem);
-	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
-
-	  /* Convert the extracted vector element to the scalar type.  */
-	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
-	}
-      else
-	{
-	  tree bftype = TREE_TYPE (vectype);
-	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
-	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-	  new_tree = build3 (BIT_FIELD_REF, bftype,
-			     vec_lhs_phi, bitsize, bitstart);
-	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
-					   &stmts, true, NULL_TREE);
-	}
+      /* Check if we have a loop where the chosen exit is not the main exit,
+	 in these cases for an early break we restart the iteration that the
+	 vector code did.  For the live values we want the value at the start
+	 of the iteration rather than at the end.  */
+      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	if (!is_gimple_debug (use_stmt)
+	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	  {
+	    basic_block use_bb = gimple_bb (use_stmt);
+	    if (!is_a <gphi *> (use_stmt))
+	      continue;
+	    for (auto exit_e : get_loop_exit_edges (loop))
+	      {
+		/* See if this exit leads to the value.  */
+		edge dest_e = find_connected_edge (exit_e, use_bb);
+		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
+		  continue;
 
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      if (stmts)
-	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+		gimple *tmp_vec_stmt = vec_stmt;
+		tree tmp_vec_lhs = vec_lhs;
+		tree tmp_bitstart = bitstart;
+		/* For early exit where the exit is not in the BB that leads
+		   to the latch then we're restarting the iteration in the
+		   scalar loop.  So get the first live value.  */
+		restart_loop = restart_loop || exit_e != main_e;
+		if (restart_loop)
+		  {
+		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
+		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
+		  }
 
-      /* Remove existing phis that copy from lhs and create copies
-	 from new_tree.  */
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
-	{
-	  gimple *phi = gsi_stmt (gsi);
-	  if ((gimple_phi_arg_def (phi, 0) == lhs))
-	    {
-	      remove_phi_node (&gsi, false);
-	      tree lhs_phi = gimple_phi_result (phi);
-	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-	    }
-	  else
-	    gsi_next (&gsi);
-	}
+		gimple_stmt_iterator exit_gsi;
+		tree new_tree
+		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
+						   exit_e, vectype, ncopies,
+						   slp_node, bitsize,
+						   tmp_bitstart, tmp_vec_lhs,
+						   lhs_type, restart_loop,
+						   &exit_gsi);
+
+		/* Use the empty block on the exit to materialize the new stmts
+		   so we can update the PHI here.  */
+		if (gimple_phi_num_args (use_stmt) == 1)
+		  {
+		    auto gsi = gsi_for_stmt (use_stmt);
+		    remove_phi_node (&gsi, false);
+		    tree lhs_phi = gimple_phi_result (use_stmt);
+		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+		  }
+		else
+		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
+	      }
+	  }
 
       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
@@ -11797,6 +11957,21 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
 	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
+	      /* If vectorizing early break we must also vectorize the use of
+		 the PHIs as a live operation.  */
+	      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		  && !STMT_VINFO_LIVE_P (stmt_info)
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+			 "----> vectorizing early break reduc or induc phi: %G",
+			 (gimple *) phi);
+		  bool done
+		    = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
+						   NULL, -1, true, NULL);
+		  gcc_assert (done);
+		}
 	    }
 	}
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index fe38beb4fa1d9f8593445354f56ba52e10a040cd..f1b6a13395f286f9997530bbe57cda3a00502f8f 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
    - it has uses outside the loop.
    - it has vdefs (it alters memory).
    - control stmts in the loop (except for the exit condition).
+   - it is an induction and we have multiple exits.
 
    CHECKME: what other side effects would the vectorizer allow?  */
 
@@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	}
     }
 
+  /* Check if it's an induction and multiple exits.  In this case there will be
+     a usage later on after peeling which is needed for the alternate exit.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+    {
+      if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "vec_stmt_relevant_p: induction forced for "
+			   "early break.\n");
+      *relevant = vect_used_in_scope;
+
+    }
+
   if (*live_p && *relevant == vect_unused_in_scope
       && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
     {
@@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 076a698eb4288f68e81f91923f7e3e8d181ad685..de673ae56eac455c9560a29d7f3792b6c3c49f3b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2227,6 +2227,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info);
 extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
 extern edge vec_init_loop_exit_info (class loop *);
 extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
+extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
 
 /* In tree-vect-stmts.cc.  */
 extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
@@ -2248,6 +2249,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  
Richard Biener Nov. 24, 2023, 1:23 p.m. UTC | #7
On Fri, 24 Nov 2023, Tamar Christina wrote:

> Good morning,
> 
> This is a respun patch with a fix for VLA.
> 
> This adds support to vectorizable_live_reduction to handle multiple exits by
> doing a search for which exit the live value should be materialized in.
> 
> Additionally which value in the index we're after depends on whether the exit
> it's materialized in is an early exit or whether the loop's main exit is
> different from the loop's natural one (i.e. the one with the same src block as
> the latch).
> 
> In those two cases we want the first rather than the last value as we're going
> to restart the iteration in the scalar loop.  For VLA this means we need to
> reverse both the mask and vector since there's only a way to get the last
> active element and not the first.
> 
> For inductions and multiple exits:
>   - we test if the target will support vectorizing the induction
>   - mark all inductions in the loop as relevant
>   - for codegen of non-live inductions during codegen
>   - induction during an early exit gets the first element rather than last.
> 
> For reductions and multiple exits:
>   - Reductions for early exits reduces the reduction definition statement
>     rather than the reduction step.  This allows us to get the value at the
>     start of the iteration.
>   - The peeling layout means that we just have to update one block, the merge
>     block.  We expect all the reductions to be the same but we leave it up to
>     the value numbering to clean up any duplicate code as we iterate over all
>     edges.
> 
> These two changes fix the reduction codegen given before which has been added
> to the testsuite for early vect.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> 	(vect_analyze_loop_operations): Check if target supports vectorizing IV.
> 	(vect_transform_loop): Call vectorizable_live_operation for non-live
> 	inductions or reductions.
> 	(find_connected_edge, vectorizable_live_operation_1): New.
> 	(vect_create_epilog_for_reduction): Support reductions in early break.
> 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> 	relevant.
> 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> 	(vect_iv_increment_position): New.
> 	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401aecd4e7eaaaa9e2b8a0 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
>     INSERT_AFTER is set to true if the increment should be inserted after
>     *BSI.  */
>  
> -static void
> +void
>  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
>  			    bool *insert_after)
>  {
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e6983382190792602cb25af1 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
>  	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
>  					      -1, false, &cost_vec);
>  
> +	  /* Check if we can perform the operation for early break if we force
> +	     the live operation.  */
> +	  if (ok
> +	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +	      && !STMT_VINFO_LIVE_P (stmt_info)
> +	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
> +					      -1, false, &cost_vec);

can you add && !PURE_SLP_STMT?

> +
>            if (!ok)
>  	    return opt_result::failure_at (phi,
>  					   "not vectorized: relevant phi not "
> @@ -5842,6 +5851,10 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>     SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
>     REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
>       (counting from 0)
> +   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
> +     exit this edge is always the main loop exit.
> +   MAIN_EXIT_P indicates whether we are updating the main exit or an alternate
> +     exit.  This determines whether we use the final or original value.
>  
>     This function:
>     1. Completes the reduction def-use cycles.
> @@ -5882,7 +5895,9 @@ static void
>  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  				  stmt_vec_info stmt_info,
>  				  slp_tree slp_node,
> -				  slp_instance slp_node_instance)
> +				  slp_instance slp_node_instance,
> +				  edge loop_exit,
> +				  bool main_exit_p = true)

isn't main_exit_p computable from 'loop_exit' by comparing that to
the one recorded in loop_vinfo?  If so please do that instead of passing
in another argument.

>  {
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>    gcc_assert (reduc_info->is_reduc_info);
> @@ -6053,7 +6068,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        /* Create an induction variable.  */
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
>        create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
>  		 insert_after, &indx_before_incr, &indx_after_incr);
>  
> @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           Store them in NEW_PHIS.  */
>    if (double_reduc)
>      loop = outer_loop;
> -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> +  /* We need to reduce values in all exits.  */
> +  exit_bb = loop_exit->dest;
>    exit_gsi = gsi_after_labels (exit_bb);
>    reduc_inputs.create (slp_node ? vec_num : ncopies);
> +  vec <gimple *> vec_stmts;
> +  if (main_exit_p)
> +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);
> +  else
> +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (rdef_info));

both would be wrong for SLP, also I think you need to look at
STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))?  For SLP the
PHI SLP node is reached via slp_node_instance->reduc_phis.

I think an overall better structure would be to add a

vect_get_vect_def (stmt_vec_info, slp_tree, unsigned);

abstracting SLP and non-SLP and doing

  for (unsigned i = 0; i < vec_num * ncopies; ++i)
    {
      def = vect_get_vect_def (stmt_info, slp_node, i);
...
    }

and then adjusting stmt_info/slp_node according to main_exit_p?
(would be nice to transition stmt_info->vec_stmts to stmt_info->vec_defs)

That said, wherever possible please think of SLP ;)

> +
>    for (unsigned i = 0; i < vec_num; i++)
>      {
>        gimple_seq stmts = NULL;
>        if (slp_node)
>  	def = vect_get_slp_vect_def (slp_node, i);
>        else
> -	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
> +	def = gimple_get_lhs (vec_stmts[0]);
>        for (j = 0; j < ncopies; j++)
>  	{
>  	  tree new_def = copy_ssa_name (def);
>  	  phi = create_phi_node (new_def, exit_bb);
>  	  if (j)
> -	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> -	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
> +	    def = gimple_get_lhs (vec_stmts[j]);
> +	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
>  	  new_def = gimple_convert (&stmts, vectype, new_def);
>  	  reduc_inputs.quick_push (new_def);
>  	}
> @@ -6885,7 +6907,20 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
>  	    {
>  	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> -		SET_USE (use_p, scalar_result);
> +		{
> +		  gimple *stmt = USE_STMT (use_p);
> +		  if (main_exit_p)
> +		    SET_USE (use_p, scalar_result);
> +		  else if (is_a <gphi *> (stmt))
> +		    {
> +		      /* If an early exit only update usages in the merge
> +			 block.  */

shouldn't that be the only use at this point anyway?  You only
update uses in PHI nodes btw. and you can use SET_USE, maybe
you wanted to check that
gimple_phi_arg_edge (stmt, phi_arg_index_from_use (use_p)) == merge_e
instead?

That said, the comment could be more precise

Are we calling vect_create_epilog_for_reduction for each early exit?
I suppose not?

> +		      edge merge_e = single_succ_edge (loop_exit->dest);
> +		      if (gimple_bb (stmt) != merge_e->dest)
> +			continue;
> +		      SET_PHI_ARG_DEF (stmt, merge_e->dest_idx, scalar_result);
> +		    }
> +		}
>  	      update_stmt (use_stmt);
>  	    }
>          }
> @@ -10481,6 +10516,156 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>    return true;
>  }
>  
> +/* Function vectorizable_live_operation_1.
> +
> +   helper function for vectorizable_live_operation.  */
> +
> +tree
> +vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> +			       stmt_vec_info stmt_info, edge exit_e,
> +			       tree vectype, int ncopies, slp_tree slp_node,
> +			       tree bitsize, tree bitstart, tree vec_lhs,
> +			       tree lhs_type, bool restart_loop,
> +			       gimple_stmt_iterator *exit_gsi)
> +{
> +  basic_block exit_bb = exit_e->dest;
> +  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> +
> +  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> +  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> +  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
> +    SET_PHI_ARG_DEF (phi, i, vec_lhs);
> +
> +  gimple_seq stmts = NULL;
> +  tree new_tree;
> +  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +    {
> +      /* Emit:
> +
> +	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +	 where VEC_LHS is the vectorized live-out result and MASK is
> +	 the loop mask for the final iteration.  */
> +      gcc_assert (ncopies == 1 && !slp_node);
> +      gimple_seq tem = NULL;
> +      gimple_stmt_iterator gsi = gsi_last (tem);
> +      tree len = vect_get_loop_len (loop_vinfo, &gsi,
> +				    &LOOP_VINFO_LENS (loop_vinfo),
> +				    1, vectype, 0, 0);
> +
> +      /* BIAS - 1.  */
> +      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +      tree bias_minus_one
> +	= int_const_binop (MINUS_EXPR,
> +			   build_int_cst (TREE_TYPE (len), biasval),
> +			   build_one_cst (TREE_TYPE (len)));
> +
> +      /* LAST_INDEX = LEN + (BIAS - 1).  */
> +      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +				     len, bias_minus_one);
> +
> +      /* This needs to implement extraction of the first index, but not sure
> +	 how the LEN stuff works.  At the moment we shouldn't get here since
> +	 there's no LEN support for early breaks.  But guard this so there's
> +	 no incorrect codegen.  */
> +      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> +
> +      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +      tree scalar_res
> +	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +			vec_lhs_phi, last_index);
> +
> +      /* Convert the extracted vector element to the scalar type.  */
> +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +    }
> +  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +    {
> +      /* Emit:
> +
> +	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +
> +	 where VEC_LHS is the vectorized live-out result and MASK is
> +	 the loop mask for the final iteration.  */
> +      gcc_assert (!slp_node);
> +      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> +      gimple_seq tem = NULL;
> +      gimple_stmt_iterator gsi = gsi_last (tem);
> +      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
> +				      &LOOP_VINFO_MASKS (loop_vinfo),
> +				      1, vectype, 0);
> +      tree scalar_res;
> +
> +      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> +	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  /* First create the permuted mask.  */
> +	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> +	  tree perm_dest = copy_ssa_name (mask);
> +	  gimple *perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> +				       mask, perm_mask);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  mask = perm_dest;
> +
> +	  /* Then permute the vector contents.  */
> +	  tree perm_elem = perm_mask_for_reverse (vectype);
> +	  perm_dest = copy_ssa_name (vec_lhs_phi);
> +	  perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> +				       vec_lhs_phi, perm_elem);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  vec_lhs_phi = perm_dest;
> +	}
> +
> +      gimple_seq_add_seq (&stmts, tem);
> +
> +      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +				 mask, vec_lhs_phi);
> +
> +      /* Convert the extracted vector element to the scalar type.  */
> +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +    }
> +  else
> +    {
> +      tree bftype = TREE_TYPE (vectype);
> +      if (VECTOR_BOOLEAN_TYPE_P (vectype))
> +	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> +      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
> +      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> +				       &stmts, true, NULL_TREE);
> +    }
> +
> +  *exit_gsi = gsi_after_labels (exit_bb);
> +  if (stmts)
> +    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
> +
> +  return new_tree;
> +}
> +
> +/* Find the edge that's the final one in the path from SRC to DEST and
> +   return it.  There must be at most one intervening forwarder block
> +   between SRC->dest and DEST.  */
> +
> +static edge
> +find_connected_edge (edge src, basic_block dest)
> +{
> +   if (src->dest == dest)
> +     return src;
> +
> +  edge e;
> +  edge_iterator ei;
> +
> +  FOR_EACH_EDGE (e, ei, dest->preds)
> +    {
> +      if (src->dest == e->src)
> +	return e;
> +    }

isn't that just find_edge (src->dest, dest)?

> +  return NULL;
> +}
> +
>  /* Function vectorizable_live_operation.
>  
>     STMT_INFO computes a value that is used outside the loop.  Check if
> @@ -10505,7 +10690,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    int vec_entry = 0;
>    poly_uint64 vec_index = 0;
>  
> -  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
> +  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
> +	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
>  
>    /* If a stmt of a reduction is live, vectorize it via
>       vect_create_epilog_for_reduction.  vectorizable_reduction assessed
> @@ -10530,8 +10716,22 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
>  	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
>  	return true;
> +
> +      /* If early break we only have to materialize the reduction on the merge
> +	 block, but we have to find an alternate exit first.  */
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> +	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
> +	      vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
> +						slp_node, slp_node_instance,
> +						exit, false);

Hmm, for each one.  But we only need a single reduction epilogue, no?
In the merge block?

> +	}
> +
>        vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
> -					slp_node_instance);
> +					slp_node_instance,
> +					LOOP_VINFO_IV_EXIT (loop_vinfo));
> +
>        return true;
>      }
>  
> @@ -10683,103 +10883,63 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	   lhs' = new_tree;  */
>  
>        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -      gcc_assert (single_pred_p (exit_bb));
> -
> -      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> -      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
> -
> -      gimple_seq stmts = NULL;
> -      tree new_tree;
> -      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> -	{
> -	  /* Emit:
> -
> -	       SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> -
> -	     where VEC_LHS is the vectorized live-out result and MASK is
> -	     the loop mask for the final iteration.  */
> -	  gcc_assert (ncopies == 1 && !slp_node);
> -	  gimple_seq tem = NULL;
> -	  gimple_stmt_iterator gsi = gsi_last (tem);
> -	  tree len
> -	    = vect_get_loop_len (loop_vinfo, &gsi,
> -				 &LOOP_VINFO_LENS (loop_vinfo),
> -				 1, vectype, 0, 0);
> -
> -	  /* BIAS - 1.  */
> -	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -	  tree bias_minus_one
> -	    = int_const_binop (MINUS_EXPR,
> -			       build_int_cst (TREE_TYPE (len), biasval),
> -			       build_one_cst (TREE_TYPE (len)));
> -
> -	  /* LAST_INDEX = LEN + (BIAS - 1).  */
> -	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> -					  len, bias_minus_one);
> -
> -	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> -	  tree scalar_res
> -	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> -			    vec_lhs_phi, last_index);
> -
> -	  /* Convert the extracted vector element to the scalar type.  */
> -	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> -	}
> -      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> -	{
> -	  /* Emit:
> -
> -	       SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> -
> -	     where VEC_LHS is the vectorized live-out result and MASK is
> -	     the loop mask for the final iteration.  */
> -	  gcc_assert (ncopies == 1 && !slp_node);
> -	  tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> -	  gimple_seq tem = NULL;
> -	  gimple_stmt_iterator gsi = gsi_last (tem);
> -	  tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
> -					  &LOOP_VINFO_MASKS (loop_vinfo),
> -					  1, vectype, 0);
> -	  gimple_seq_add_seq (&stmts, tem);
> -	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -					  mask, vec_lhs_phi);
> -
> -	  /* Convert the extracted vector element to the scalar type.  */
> -	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> -	}
> -      else
> -	{
> -	  tree bftype = TREE_TYPE (vectype);
> -	  if (VECTOR_BOOLEAN_TYPE_P (vectype))
> -	    bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> -	  new_tree = build3 (BIT_FIELD_REF, bftype,
> -			     vec_lhs_phi, bitsize, bitstart);
> -	  new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> -					   &stmts, true, NULL_TREE);
> -	}
> +      /* Check if we have a loop where the chosen exit is not the main exit,
> +	 in these cases for an early break we restart the iteration that the
> +	 vector code did.  For the live values we want the value at the start
> +	 of the iteration rather than at the end.  */
> +      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> +	if (!is_gimple_debug (use_stmt)
> +	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +	  {
> +	    basic_block use_bb = gimple_bb (use_stmt);
> +	    if (!is_a <gphi *> (use_stmt))
> +	      continue;
> +	    for (auto exit_e : get_loop_exit_edges (loop))
> +	      {
> +		/* See if this exit leads to the value.  */
> +		edge dest_e = find_connected_edge (exit_e, use_bb);
> +		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
> +		  continue;
>  
> -      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> -      if (stmts)
> -	gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> +		gimple *tmp_vec_stmt = vec_stmt;
> +		tree tmp_vec_lhs = vec_lhs;
> +		tree tmp_bitstart = bitstart;
> +		/* For early exit where the exit is not in the BB that leads
> +		   to the latch then we're restarting the iteration in the
> +		   scalar loop.  So get the first live value.  */
> +		restart_loop = restart_loop || exit_e != main_e;
> +		if (restart_loop)
> +		  {
> +		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> +		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> +		  }
>  
> -      /* Remove existing phis that copy from lhs and create copies
> -	 from new_tree.  */
> -      gimple_stmt_iterator gsi;
> -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> -	{
> -	  gimple *phi = gsi_stmt (gsi);
> -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> -	    {
> -	      remove_phi_node (&gsi, false);
> -	      tree lhs_phi = gimple_phi_result (phi);
> -	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> -	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> -	    }
> -	  else
> -	    gsi_next (&gsi);
> -	}
> +		gimple_stmt_iterator exit_gsi;
> +		tree new_tree
> +		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> +						   exit_e, vectype, ncopies,
> +						   slp_node, bitsize,
> +						   tmp_bitstart, tmp_vec_lhs,
> +						   lhs_type, restart_loop,
> +						   &exit_gsi);
> +
> +		/* Use the empty block on the exit to materialize the new stmts
> +		   so we can update the PHI here.  */
> +		if (gimple_phi_num_args (use_stmt) == 1)
> +		  {
> +		    auto gsi = gsi_for_stmt (use_stmt);
> +		    remove_phi_node (&gsi, false);
> +		    tree lhs_phi = gimple_phi_result (use_stmt);
> +		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> +		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> +		  }
> +		else
> +		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
> +	      }
> +	  }

Difficult to see what changed due to the split out, guess it'll be ok.

>        /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
>        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> @@ -11797,6 +11957,21 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>  	      if (dump_enabled_p ())
>  		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
>  	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
> +	      /* If vectorizing early break we must also vectorize the use of
> +		 the PHIs as a live operation.  */
> +	      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		  && !STMT_VINFO_LIVE_P (stmt_info)
> +		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +		{
> +		  if (dump_enabled_p ())
> +		    dump_printf_loc (MSG_NOTE, vect_location,
> +			 "----> vectorizing early break reduc or induc phi: %G",
> +			 (gimple *) phi);
> +		  bool done
> +		    = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> +						   NULL, -1, true, NULL);

you should be able to amend can_vectorize_live_stmts instead by
adding || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && vect_induction_def),
then we keep it at one place also where we'd handle the SLP case.

> +		  gcc_assert (done);
> +		}
>  	    }
>  	}
>  
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index fe38beb4fa1d9f8593445354f56ba52e10a040cd..f1b6a13395f286f9997530bbe57cda3a00502f8f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
>     - it has uses outside the loop.
>     - it has vdefs (it alters memory).
>     - control stmts in the loop (except for the exit condition).
> +   - it is an induction and we have multiple exits.
>  
>     CHECKME: what other side effects would the vectorizer allow?  */
>  
> @@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>  	}
>      }
>  
> +  /* Check if it's an induction and multiple exits.  In this case there will be
> +     a usage later on after peeling which is needed for the alternate exit.  */
> +  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +    {
> +      if (dump_enabled_p ())
> +	  dump_printf_loc (MSG_NOTE, vect_location,
> +			   "vec_stmt_relevant_p: induction forced for "
> +			   "early break.\n");
> +      *relevant = vect_used_in_scope;
> +

I think you should instead set *live_p?

> +    }
> +
>    if (*live_p && *relevant == vect_unused_in_scope
>        && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
>      {
> @@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
>  /* If the target supports a permute mask that reverses the elements in
>     a vector of type VECTYPE, return that mask, otherwise return null.  */
>  
> -static tree
> +tree
>  perm_mask_for_reverse (tree vectype)
>  {
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 076a698eb4288f68e81f91923f7e3e8d181ad685..de673ae56eac455c9560a29d7f3792b6c3c49f3b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2227,6 +2227,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info);
>  extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
>  extern edge vec_init_loop_exit_info (class loop *);
>  extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
> +extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
>  
>  /* In tree-vect-stmts.cc.  */
>  extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
> @@ -2248,6 +2249,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
>  				enum vect_def_type *,
>  				tree *, stmt_vec_info * = NULL);
>  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> +extern tree perm_mask_for_reverse (tree);
>  extern bool supportable_widening_operation (vec_info*, code_helper,
>  					    stmt_vec_info, tree, tree,
>  					    code_helper*, code_helper*,
>
  
Tamar Christina Nov. 27, 2023, 10:47 p.m. UTC | #8
>
> > This is a respun patch with a fix for VLA.
> >
> > This adds support to vectorizable_live_reduction to handle multiple
> > exits by doing a search for which exit the live value should be materialized in.
> >
> > Additionally which value in the index we're after depends on whether
> > the exit it's materialized in is an early exit or whether the loop's
> > main exit is different from the loop's natural one (i.e. the one with
> > the same src block as the latch).
> >
> > In those two cases we want the first rather than the last value as
> > we're going to restart the iteration in the scalar loop.  For VLA this
> > means we need to reverse both the mask and vector since there's only a
> > way to get the last active element and not the first.
> >
> > For inductions and multiple exits:
> >   - we test if the target will support vectorizing the induction
> >   - mark all inductions in the loop as relevant
> >   - for codegen of non-live inductions during codegen
> >   - induction during an early exit gets the first element rather than last.
> >
> > For reductions and multiple exits:
> >   - Reductions for early exits reduces the reduction definition statement
> >     rather than the reduction step.  This allows us to get the value at the
> >     start of the iteration.
> >   - The peeling layout means that we just have to update one block, the
> merge
> >     block.  We expect all the reductions to be the same but we leave it up to
> >     the value numbering to clean up any duplicate code as we iterate over all
> >     edges.
> >
> > These two changes fix the reduction codegen given before which has
> > been added to the testsuite for early vect.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > 	(vect_analyze_loop_operations): Check if target supports vectorizing
> IV.
> > 	(vect_transform_loop): Call vectorizable_live_operation for non-live
> > 	inductions or reductions.
> > 	(find_connected_edge, vectorizable_live_operation_1): New.
> > 	(vect_create_epilog_for_reduction): Support reductions in early break.
> > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> > 	relevant.
> > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> > 	(vect_iv_increment_position): New.
> > 	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index
> >
> 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401
> aecd4e7
> > eaaaa9e2b8a0 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type,
> gimple_seq *seq,
> >     INSERT_AFTER is set to true if the increment should be inserted after
> >     *BSI.  */
> >
> > -static void
> > +void
> >  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
> >  			    bool *insert_after)
> >  {
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> >
> 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e698
> 33821907
> > 92602cb25af1 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info
> loop_vinfo)
> >  	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> NULL,
> >  					      -1, false, &cost_vec);
> >
> > +	  /* Check if we can perform the operation for early break if we force
> > +	     the live operation.  */
> > +	  if (ok
> > +	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +	      && !STMT_VINFO_LIVE_P (stmt_info)
> > +	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> > +	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> NULL,
> > +					      -1, false, &cost_vec);
> 
> can you add && !PURE_SLP_STMT?
> 

I've cleaned up the patch a bit more, so these hunks are now all gone.

> > @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction
> (loop_vec_info loop_vinfo,
> >           Store them in NEW_PHIS.  */
> >    if (double_reduc)
> >      loop = outer_loop;
> > -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > +  /* We need to reduce values in all exits.  */  exit_bb =
> > + loop_exit->dest;
> >    exit_gsi = gsi_after_labels (exit_bb);
> >    reduc_inputs.create (slp_node ? vec_num : ncopies);
> > +  vec <gimple *> vec_stmts;
> > +  if (main_exit_p)
> > +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);  else
> > +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF
> > + (rdef_info));
> 
> both would be wrong for SLP, also I think you need to look at
> STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))?  For SLP the PHI SLP
> node is reached via slp_node_instance->reduc_phis.
> 
> I think an overall better structure would be to add a
> 
> vect_get_vect_def (stmt_vec_info, slp_tree, unsigned);
> 
> abstracting SLP and non-SLP and doing
> 
>   for (unsigned i = 0; i < vec_num * ncopies; ++i)
>     {
>       def = vect_get_vect_def (stmt_info, slp_node, i); ...
>     }
> 
> and then adjusting stmt_info/slp_node according to main_exit_p?

Done.

> (would be nice to transition stmt_info->vec_stmts to stmt_info->vec_defs)

True. I guess since the plan is to remove non-SLP next year this'll just go away anyway.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation,
	vectorizable_live_operation_1): Support early exits.
	(can_vectorize_live_stmts): Call vectorizable_live_operation for non-live
	inductions or reductions.
	(find_connected_edge, vect_get_vect_def): New.
	(vect_create_epilog_for_reduction): Support reductions in early break.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	(vect_stmt_relevant_p): Mark all inductions when early break as being
	live.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index f38cc47551488525b15c2be758cac8291dbefb3a..4e48217a31e59318c2ea8e5ab63b06ba19840cbd 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3346,6 +3346,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
 	  bb_before_epilog = loop_preheader_edge (epilog)->src;
 	}
+
       /* If loop is peeled for non-zero constant times, now niters refers to
 	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
 	 overflows.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index df5e1d28fac2ce35e71decdec0d8e31fb75557f5..90041d1e138afb08c0116f48f517fe0fcc615557 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5831,6 +5831,34 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
   return new_temp;
 }
 
+/* Retrieves the defining statement to be used for a reduction.
+   For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
+   the reduction definitions.  */
+
+tree
+vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
+		   slp_instance slp_node_instance, bool main_exit_p, unsigned i,
+		   vec <gimple *> &vec_stmts)
+{
+  tree def;
+
+  if (slp_node)
+    {
+      if (!main_exit_p)
+        slp_node = slp_node_instance->reduc_phis;
+      def = vect_get_slp_vect_def (slp_node, i);
+    }
+  else
+    {
+      if (!main_exit_p)
+	reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
+      vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
+      def = gimple_get_lhs (vec_stmts[0]);
+    }
+
+  return def;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -5842,6 +5870,8 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
      (counting from 0)
+   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
+     exit this edge is always the main loop exit.
 
    This function:
    1. Completes the reduction def-use cycles.
@@ -5882,7 +5912,8 @@ static void
 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 				  stmt_vec_info stmt_info,
 				  slp_tree slp_node,
-				  slp_instance slp_node_instance)
+				  slp_instance slp_node_instance,
+				  edge loop_exit)
 {
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -5891,6 +5922,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      loop-closed PHI of the inner loop which we remember as
      def for the reduction PHI generation.  */
   bool double_reduc = false;
+  bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
   stmt_vec_info rdef_info = stmt_info;
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
     {
@@ -6053,7 +6085,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Create an induction variable.  */
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
 		 insert_after, &indx_before_incr, &indx_after_incr);
 
@@ -6132,23 +6164,23 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
          Store them in NEW_PHIS.  */
   if (double_reduc)
     loop = outer_loop;
-  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+  /* We need to reduce values in all exits.  */
+  exit_bb = loop_exit->dest;
   exit_gsi = gsi_after_labels (exit_bb);
   reduc_inputs.create (slp_node ? vec_num : ncopies);
+  vec <gimple *> vec_stmts;
   for (unsigned i = 0; i < vec_num; i++)
     {
       gimple_seq stmts = NULL;
-      if (slp_node)
-	def = vect_get_slp_vect_def (slp_node, i);
-      else
-	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
+      def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
+			       main_exit_p, i, vec_stmts);
       for (j = 0; j < ncopies; j++)
 	{
 	  tree new_def = copy_ssa_name (def);
 	  phi = create_phi_node (new_def, exit_bb);
 	  if (j)
-	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
+	    def = gimple_get_lhs (vec_stmts[j]);
+	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
 	  new_def = gimple_convert (&stmts, vectype, new_def);
 	  reduc_inputs.quick_push (new_def);
 	}
@@ -6882,10 +6914,33 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 	    }
 
           scalar_result = scalar_results[k];
+	  edge merge_e = loop_exit;
+	  if (!main_exit_p)
+	    merge_e = single_succ_edge (loop_exit->dest);
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
 	    {
 	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-		SET_USE (use_p, scalar_result);
+		{
+		  if (main_exit_p)
+		    SET_USE (use_p, scalar_result);
+		  else
+		    {
+		      /* When multiple exits the same SSA name can appear in
+			 both the main and the early exits.  The meaning of the
+			 reduction however is not the same.  In the main exit
+			 case the meaning is "get the last value" and in the
+			 early exit case it means "get the first value".  As
+			 such we should only update the value for the exit
+			 attached to loop_exit.  To make this easier we always
+			 call vect_create_epilog_for_reduction on the early
+			 exit main block first.  As such for the main exit we
+			 no longer have to perform the BB check.  */
+		      gphi *stmt = as_a <gphi *> (USE_STMT (use_p));
+		      int idx = phi_arg_index_from_use (use_p);
+		      if (gimple_phi_arg_edge (stmt, idx) == merge_e)
+			SET_USE (use_p, scalar_result);
+		    }
+		}
 	      update_stmt (use_stmt);
 	    }
         }
@@ -10481,15 +10536,17 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   return true;
 }
 
-
 /* Function vectorizable_live_operation_1.
+
    helper function for vectorizable_live_operation.  */
+
 tree
 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
 			       stmt_vec_info stmt_info, edge exit_e,
 			       tree vectype, int ncopies, slp_tree slp_node,
 			       tree bitsize, tree bitstart, tree vec_lhs,
-			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
+			       tree lhs_type, bool restart_loop,
+			       gimple_stmt_iterator *exit_gsi)
 {
   basic_block exit_bb = exit_e->dest;
   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
@@ -10504,7 +10561,9 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
       /* Emit:
+
 	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
 	 where VEC_LHS is the vectorized live-out result and MASK is
 	 the loop mask for the final iteration.  */
       gcc_assert (ncopies == 1 && !slp_node);
@@ -10513,15 +10572,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree len = vect_get_loop_len (loop_vinfo, &gsi,
 				    &LOOP_VINFO_LENS (loop_vinfo),
 				    1, vectype, 0, 0);
+
       /* BIAS - 1.  */
       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
       tree bias_minus_one
 	= int_const_binop (MINUS_EXPR,
 			   build_int_cst (TREE_TYPE (len), biasval),
 			   build_one_cst (TREE_TYPE (len)));
+
       /* LAST_INDEX = LEN + (BIAS - 1).  */
       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 				     len, bias_minus_one);
+
       /* This needs to implement extraction of the first index, but not sure
 	 how the LEN stuff works.  At the moment we shouldn't get here since
 	 there's no LEN support for early breaks.  But guard this so there's
@@ -10532,13 +10594,16 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree scalar_res
 	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
 			vec_lhs_phi, last_index);
+
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
     }
   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       /* Emit:
+
 	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
 	 where VEC_LHS is the vectorized live-out result and MASK is
 	 the loop mask for the final iteration.  */
       gcc_assert (!slp_node);
@@ -10548,10 +10613,38 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
 				      &LOOP_VINFO_MASKS (loop_vinfo),
 				      1, vectype, 0);
+      tree scalar_res;
+
+      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  /* First create the permuted mask.  */
+	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	  tree perm_dest = copy_ssa_name (mask);
+	  gimple *perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+				       mask, perm_mask);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  mask = perm_dest;
+
+	  /* Then permute the vector contents.  */
+	  tree perm_elem = perm_mask_for_reverse (vectype);
+	  perm_dest = copy_ssa_name (vec_lhs_phi);
+	  perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+				       vec_lhs_phi, perm_elem);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  vec_lhs_phi = perm_dest;
+	}
 
       gimple_seq_add_seq (&stmts, tem);
-       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-				       mask, vec_lhs_phi);
+
+      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				 mask, vec_lhs_phi);
+
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
     }
@@ -10564,12 +10657,26 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
 				       &stmts, true, NULL_TREE);
     }
+
   *exit_gsi = gsi_after_labels (exit_bb);
   if (stmts)
     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
+
   return new_tree;
 }
 
+/* Find the edge that's the final one in the path from SRC to DEST and
+   return it.  The path may contain at most one forwarder block between them.  */
+
+static edge
+find_connected_edge (edge src, basic_block dest)
+{
+   if (src->dest == dest)
+     return src;
+
+  return find_edge (src->dest, dest);
+}
+
 /* Function vectorizable_live_operation.
 
    STMT_INFO computes a value that is used outside the loop.  Check if
@@ -10594,7 +10701,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   int vec_entry = 0;
   poly_uint64 vec_index = 0;
 
-  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
+  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
+	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
 
   /* If a stmt of a reduction is live, vectorize it via
      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
@@ -10619,8 +10727,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
 	return true;
+
+      /* If early break we only have to materialize the reduction on the merge
+	 block, but we have to find an alternate exit first.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
+	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
+	      {
+		vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
+						  slp_node, slp_node_instance,
+						  exit);
+		break;
+	      }
+	}
+
       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
-					slp_node_instance);
+					slp_node_instance,
+					LOOP_VINFO_IV_EXIT (loop_vinfo));
+
       return true;
     }
 
@@ -10772,37 +10897,63 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-      gcc_assert (single_pred_p (exit_bb));
-
-      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
-
-      gimple_stmt_iterator exit_gsi;
-      tree new_tree
-	= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
-					 LOOP_VINFO_IV_EXIT (loop_vinfo),
-					 vectype, ncopies, slp_node, bitsize,
-					 bitstart, vec_lhs, lhs_type,
-					 &exit_gsi);
-
-      /* Remove existing phis that copy from lhs and create copies
-	 from new_tree.  */
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
-	{
-	  gimple *phi = gsi_stmt (gsi);
-	  if ((gimple_phi_arg_def (phi, 0) == lhs))
-	    {
-	      remove_phi_node (&gsi, false);
-	      tree lhs_phi = gimple_phi_result (phi);
-	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-	    }
-	  else
-	    gsi_next (&gsi);
-	}
+      /* Check if we have a loop where the chosen exit is not the main exit.
+	 In these cases, for an early break, we restart the iteration that the
+	 vector code was performing.  For the live values we want the value at
+	 the start of the iteration rather than at the end.  */
+      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	if (!is_gimple_debug (use_stmt)
+	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	  {
+	    basic_block use_bb = gimple_bb (use_stmt);
+	    if (!is_a <gphi *> (use_stmt))
+	      continue;
+	    for (auto exit_e : get_loop_exit_edges (loop))
+	      {
+		/* See if this exit leads to the value.  */
+		edge dest_e = find_connected_edge (exit_e, use_bb);
+		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
+		  continue;
+
+		gimple *tmp_vec_stmt = vec_stmt;
+		tree tmp_vec_lhs = vec_lhs;
+		tree tmp_bitstart = bitstart;
+		/* For early exit where the exit is not in the BB that leads
+		   to the latch then we're restarting the iteration in the
+		   scalar loop.  So get the first live value.  */
+		restart_loop = restart_loop || exit_e != main_e;
+		if (restart_loop)
+		  {
+		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
+		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
+		  }
+
+		gimple_stmt_iterator exit_gsi;
+		tree new_tree
+		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
+						   exit_e, vectype, ncopies,
+						   slp_node, bitsize,
+						   tmp_bitstart, tmp_vec_lhs,
+						   lhs_type, restart_loop,
+						   &exit_gsi);
+
+		/* Use the empty block on the exit to materialize the new stmts
+		   so we can update the PHI here.  */
+		if (gimple_phi_num_args (use_stmt) == 1)
+		  {
+		    auto gsi = gsi_for_stmt (use_stmt);
+		    remove_phi_node (&gsi, false);
+		    tree lhs_phi = gimple_phi_result (use_stmt);
+		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+		  }
+		else
+		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
+	      }
+	  }
 
       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index fe38beb4fa1d9f8593445354f56ba52e10a040cd..27221c6e8e86034050b562ee5c15992827a8d2cb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
    - it has uses outside the loop.
    - it has vdefs (it alters memory).
    - control stmts in the loop (except for the exit condition).
+   - it is an induction and we have multiple exits.
 
    CHECKME: what other side effects would the vectorizer allow?  */
 
@@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	}
     }
 
+  /* Check if it's an induction and we have multiple exits.  In this case there
+     will be a usage later on after peeling which is needed for the alternate
+     exit.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+    {
+      if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "vec_stmt_relevant_p: induction forced for "
+			   "early break.\n");
+      *live_p = true;
+
+    }
+
   if (*live_p && *relevant == vect_unused_in_scope
       && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
     {
@@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -12720,20 +12734,27 @@ can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
 			  bool vec_stmt_p,
 			  stmt_vector_for_cost *cost_vec)
 {
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   if (slp_node)
     {
       stmt_vec_info slp_stmt_info;
       unsigned int i;
       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
 	{
-	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
+	  if ((STMT_VINFO_LIVE_P (slp_stmt_info)
+	       || (loop_vinfo
+		   && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		   && STMT_VINFO_DEF_TYPE (slp_stmt_info)
+			== vect_induction_def))
 	      && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
 					       slp_node_instance, i,
 					       vec_stmt_p, cost_vec))
 	    return false;
 	}
     }
-  else if (STMT_VINFO_LIVE_P (stmt_info)
+  else if ((STMT_VINFO_LIVE_P (stmt_info)
+	    || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
 	   && !vectorizable_live_operation (vinfo, stmt_info,
 					    slp_node, slp_node_instance, -1,
 					    vec_stmt_p, cost_vec))
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index de60da31e2a3030a7fbc302d3f676af9683fd019..fd4b0a787e6128b43c5ca2b0612f55845e6b3cef 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2248,6 +2248,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  
Richard Biener Nov. 29, 2023, 1:28 p.m. UTC | #9
On Mon, 27 Nov 2023, Tamar Christina wrote:

>  >
> > > This is a respun patch with a fix for VLA.
> > >
> > > This adds support to vectorizable_live_reduction to handle multiple
> > > exits by doing a search for which exit the live value should be materialized in.
> > >
> > > Additionally which value in the index we're after depends on whether
> > > the exit it's materialized in is an early exit or whether the loop's
> > > main exit is different from the loop's natural one (i.e. the one with
> > > the same src block as the latch).
> > >
> > > In those two cases we want the first rather than the last value as
> > > we're going to restart the iteration in the scalar loop.  For VLA this
> > > means we need to reverse both the mask and vector since there's only a
> > > way to get the last active element and not the first.
> > >
> > > For inductions and multiple exits:
> > >   - we test if the target will support vectorizing the induction
> > >   - mark all inductions in the loop as relevant
> > >   - for codegen of non-live inductions during codegen
> > >   - induction during an early exit gets the first element rather than last.
> > >
> > > For reductions and multiple exits:
> > >   - Reductions for early exits reduces the reduction definition statement
> > >     rather than the reduction step.  This allows us to get the value at the
> > >     start of the iteration.
> > >   - The peeling layout means that we just have to update one block, the
> > merge
> > >     block.  We expect all the reductions to be the same but we leave it up to
> > >     the value numbering to clean up any duplicate code as we iterate over all
> > >     edges.
> > >
> > > These two changes fix the reduction codegen given before which has
> > > been added to the testsuite for early vect.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > > 	(vect_analyze_loop_operations): Check if target supports vectorizing
> > IV.
> > > 	(vect_transform_loop): Call vectorizable_live_operation for non-live
> > > 	inductions or reductions.
> > > 	(find_connected_edge, vectorizable_live_operation_1): New.
> > > 	(vect_create_epilog_for_reduction): Support reductions in early break.
> > > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > > 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> > > 	relevant.
> > > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> > > 	(vect_iv_increment_position): New.
> > > 	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > > index
> > >
> > 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401
> > aecd4e7
> > > eaaaa9e2b8a0 100644
> > > --- a/gcc/tree-vect-loop-manip.cc
> > > +++ b/gcc/tree-vect-loop-manip.cc
> > > @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type,
> > gimple_seq *seq,
> > >     INSERT_AFTER is set to true if the increment should be inserted after
> > >     *BSI.  */
> > >
> > > -static void
> > > +void
> > >  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
> > >  			    bool *insert_after)
> > >  {
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> > >
> > 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e698
> > 33821907
> > > 92602cb25af1 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info
> > loop_vinfo)
> > >  	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> > NULL,
> > >  					      -1, false, &cost_vec);
> > >
> > > +	  /* Check if we can perform the operation for early break if we force
> > > +	     the live operation.  */
> > > +	  if (ok
> > > +	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > +	      && !STMT_VINFO_LIVE_P (stmt_info)
> > > +	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> > > +	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> > NULL,
> > > +					      -1, false, &cost_vec);
> > 
> > can you add && !PURE_SLP_STMT?
> > 
> 
> I've cleaned up the patch a bit more, so these hunks are now all gone.
> 
> > > @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction
> > (loop_vec_info loop_vinfo,
> > >           Store them in NEW_PHIS.  */
> > >    if (double_reduc)
> > >      loop = outer_loop;
> > > -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > +  /* We need to reduce values in all exits.  */  exit_bb =
> > > + loop_exit->dest;
> > >    exit_gsi = gsi_after_labels (exit_bb);
> > >    reduc_inputs.create (slp_node ? vec_num : ncopies);
> > > +  vec <gimple *> vec_stmts;
> > > +  if (main_exit_p)
> > > +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);  else
> > > +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF
> > > + (rdef_info));
> > 
> > both would be wrong for SLP, also I think you need to look at
> > STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))?  For SLP the PHI SLP
> > node is reached via slp_node_instance->reduc_phis.
> > 
> > I think an overall better structure would be to add a
> > 
> > vect_get_vect_def (stmt_vec_info, slp_tree, unsigned);
> > 
> > abstracting SLP and non-SLP and doing
> > 
> >   for (unsigned i = 0; i < vec_num * ncopies; ++i)
> >     {
> >       def = vect_get_vect_def (stmt_info, slp_node, i); ...
> >     }
> > 
> > and then adjusting stmt_info/slp_node according to main_exit_p?
> 
> Done.
> 
> > (would be nice to transition stmt_info->vec_stmts to stmt_info->vec_defs)
> 
> True. I guess since the plan is to remove non-SLP next year this'll just go away anyway.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation,
> 	vectorizable_live_operation_1): Support early exits.
> 	(can_vectorize_live_stmts): Call vectorizable_live_operation for non-live
> 	inductions or reductions.
> 	(find_connected_edge, vect_get_vect_def): New.
> 	(vect_create_epilog_for_reduction): Support reductions in early break.
> 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> 	live.
> 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f38cc47551488525b15c2be758cac8291dbefb3a..4e48217a31e59318c2ea8e5ab63b06ba19840cbd 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -3346,6 +3346,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
>  	  bb_before_epilog = loop_preheader_edge (epilog)->src;
>  	}
> +
>        /* If loop is peeled for non-zero constant times, now niters refers to
>  	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
>  	 overflows.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index df5e1d28fac2ce35e71decdec0d8e31fb75557f5..90041d1e138afb08c0116f48f517fe0fcc615557 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5831,6 +5831,34 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>    return new_temp;
>  }
>  
> +/* Retrieves the defining statement to be used for a reduction.
> +   For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
> +   the reduction definitions.  */
> +
> +tree
> +vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
> +		   slp_instance slp_node_instance, bool main_exit_p, unsigned i,
> +		   vec <gimple *> &vec_stmts)
> +{
> +  tree def;
> +
> +  if (slp_node)
> +    {
> +      if (!main_exit_p)
> +        slp_node = slp_node_instance->reduc_phis;
> +      def = vect_get_slp_vect_def (slp_node, i);
> +    }
> +  else
> +    {
> +      if (!main_exit_p)
> +	reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
> +      vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
> +      def = gimple_get_lhs (vec_stmts[0]);
> +    }
> +
> +  return def;
> +}
> +
>  /* Function vect_create_epilog_for_reduction
>  
>     Create code at the loop-epilog to finalize the result of a reduction
> @@ -5842,6 +5870,8 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>     SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
>     REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
>       (counting from 0)
> +   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
> +     exit this edge is always the main loop exit.
>  
>     This function:
>     1. Completes the reduction def-use cycles.
> @@ -5882,7 +5912,8 @@ static void
>  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  				  stmt_vec_info stmt_info,
>  				  slp_tree slp_node,
> -				  slp_instance slp_node_instance)
> +				  slp_instance slp_node_instance,
> +				  edge loop_exit)
>  {
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>    gcc_assert (reduc_info->is_reduc_info);
> @@ -5891,6 +5922,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>       loop-closed PHI of the inner loop which we remember as
>       def for the reduction PHI generation.  */
>    bool double_reduc = false;
> +  bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
>    stmt_vec_info rdef_info = stmt_info;
>    if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
>      {
> @@ -6053,7 +6085,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        /* Create an induction variable.  */
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
>        create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
>  		 insert_after, &indx_before_incr, &indx_after_incr);
>  
> @@ -6132,23 +6164,23 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           Store them in NEW_PHIS.  */
>    if (double_reduc)
>      loop = outer_loop;
> -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> +  /* We need to reduce values in all exits.  */
> +  exit_bb = loop_exit->dest;
>    exit_gsi = gsi_after_labels (exit_bb);
>    reduc_inputs.create (slp_node ? vec_num : ncopies);
> +  vec <gimple *> vec_stmts;
>    for (unsigned i = 0; i < vec_num; i++)
>      {
>        gimple_seq stmts = NULL;
> -      if (slp_node)
> -	def = vect_get_slp_vect_def (slp_node, i);
> -      else
> -	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
> +      def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
> +			       main_exit_p, i, vec_stmts);
>        for (j = 0; j < ncopies; j++)
>  	{
>  	  tree new_def = copy_ssa_name (def);
>  	  phi = create_phi_node (new_def, exit_bb);
>  	  if (j)
> -	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> -	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
> +	    def = gimple_get_lhs (vec_stmts[j]);
> +	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
>  	  new_def = gimple_convert (&stmts, vectype, new_def);
>  	  reduc_inputs.quick_push (new_def);
>  	}
> @@ -6882,10 +6914,33 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  	    }
>  
>            scalar_result = scalar_results[k];
> +	  edge merge_e = loop_exit;
> +	  if (!main_exit_p)
> +	    merge_e = single_succ_edge (loop_exit->dest);
>            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
>  	    {
>  	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> -		SET_USE (use_p, scalar_result);
> +		{
> +		  if (main_exit_p)
> +		    SET_USE (use_p, scalar_result);
> +		  else
> +		    {
> +		      /* With multiple exits the same SSA name can appear in
> +			 both the main and the early exits.  The meaning of the
> +			 reduction however is not the same.  In the main exit
> +			 case the meaning is "get the last value" and in the
> +			 early exit case it means "get the first value".  As
> +			 such we should only update the value for the exit
> +			 attached to loop_exit.  To make this easier we always
> +			 call vect_create_epilog_for_reduction on the early
> +			 exit main block first.  As such for the main exit we
> +			 no longer have to perform the BB check.  */
> +		      gphi *stmt = as_a <gphi *> (USE_STMT (use_p));
> +		      int idx = phi_arg_index_from_use (use_p);
> +		      if (gimple_phi_arg_edge (stmt, idx) == merge_e)
> +			SET_USE (use_p, scalar_result);

Hmm, I guess I still don't understand.  This code tries, in the
reduction epilog

  # scalar_result_1 = PHI <..>
  # vector_result_2 = PHI <..>
  _3 = ... reduce vector_result_2 ...;

replace uses of scalar_result_1 with _3 of which there could be many,
including in debug stmts (there doesn't have to be an epilog loop
after all).

Now, for an early exit we know there _is_ an epilog loop and we
have a merge block merging early exits before merging with the
main exit.  We have forced(?) PHI nodes to merge the individual
early exit reduction results?

Either I can't see how we can end up with multiple uses or I can't
see how the main_exit_p case cannot also stomp on those?

Maybe it's related to the other question whether we are emitting
a reduction epilogue for each of the early exits or just once.

> +		    }
> +		}
>  	      update_stmt (use_stmt);
>  	    }
>          }
> @@ -10481,15 +10536,17 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>    return true;
>  }
>  
> -
>  /* Function vectorizable_live_operation_1.
> +
>     helper function for vectorizable_live_operation.  */
> +
>  tree
>  vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>  			       stmt_vec_info stmt_info, edge exit_e,
>  			       tree vectype, int ncopies, slp_tree slp_node,
>  			       tree bitsize, tree bitstart, tree vec_lhs,
> -			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
> +			       tree lhs_type, bool restart_loop,
> +			       gimple_stmt_iterator *exit_gsi)
>  {
>    basic_block exit_bb = exit_e->dest;
>    gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> @@ -10504,7 +10561,9 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>    if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>      {
>        /* Emit:
> +
>  	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
>  	 where VEC_LHS is the vectorized live-out result and MASK is
>  	 the loop mask for the final iteration.  */
>        gcc_assert (ncopies == 1 && !slp_node);
> @@ -10513,15 +10572,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree len = vect_get_loop_len (loop_vinfo, &gsi,
>  				    &LOOP_VINFO_LENS (loop_vinfo),
>  				    1, vectype, 0, 0);
> +
>        /* BIAS - 1.  */
>        signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        tree bias_minus_one
>  	= int_const_binop (MINUS_EXPR,
>  			   build_int_cst (TREE_TYPE (len), biasval),
>  			   build_one_cst (TREE_TYPE (len)));
> +
>        /* LAST_INDEX = LEN + (BIAS - 1).  */
>        tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>  				     len, bias_minus_one);
> +
>        /* This needs to implement extraction of the first index, but not sure
>  	 how the LEN stuff works.  At the moment we shouldn't get here since
>  	 there's no LEN support for early breaks.  But guard this so there's
> @@ -10532,13 +10594,16 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree scalar_res
>  	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
>  			vec_lhs_phi, last_index);
> +
>        /* Convert the extracted vector element to the scalar type.  */
>        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>      }
>    else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>      {
>        /* Emit:
> +
>  	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +
>  	 where VEC_LHS is the vectorized live-out result and MASK is
>  	 the loop mask for the final iteration.  */
>        gcc_assert (!slp_node);
> @@ -10548,10 +10613,38 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
>  				      &LOOP_VINFO_MASKS (loop_vinfo),
>  				      1, vectype, 0);
> +      tree scalar_res;
> +
> +      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> +	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  /* First create the permuted mask.  */
> +	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> +	  tree perm_dest = copy_ssa_name (mask);
> +	  gimple *perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> +				       mask, perm_mask);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  mask = perm_dest;
> +
> +	  /* Then permute the vector contents.  */
> +	  tree perm_elem = perm_mask_for_reverse (vectype);
> +	  perm_dest = copy_ssa_name (vec_lhs_phi);
> +	  perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> +				       vec_lhs_phi, perm_elem);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  vec_lhs_phi = perm_dest;
> +	}
>  
>        gimple_seq_add_seq (&stmts, tem);
> -       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -				       mask, vec_lhs_phi);
> +
> +      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +				 mask, vec_lhs_phi);
> +
>        /* Convert the extracted vector element to the scalar type.  */
>        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>      }
> @@ -10564,12 +10657,26 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
>  				       &stmts, true, NULL_TREE);
>      }
> +
>    *exit_gsi = gsi_after_labels (exit_bb);
>    if (stmts)
>      gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
> +
>    return new_tree;
>  }
>  
> +/* Find the edge that's the final one in the path from SRC to DEST and
> +   return it.  At most one forwarder block may lie between SRC and DEST.  */
> +
> +static edge
> +find_connected_edge (edge src, basic_block dest)
> +{
> +   if (src->dest == dest)
> +     return src;
> +
> +  return find_edge (src->dest, dest);
> +}
> +
>  /* Function vectorizable_live_operation.
>  
>     STMT_INFO computes a value that is used outside the loop.  Check if
> @@ -10594,7 +10701,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    int vec_entry = 0;
>    poly_uint64 vec_index = 0;
>  
> -  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
> +  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
> +	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
>  
>    /* If a stmt of a reduction is live, vectorize it via
>       vect_create_epilog_for_reduction.  vectorizable_reduction assessed
> @@ -10619,8 +10727,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
>  	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
>  	return true;
> +
> +      /* If early break we only have to materialize the reduction on the merge
> +	 block, but we have to find an alternate exit first.  */
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> +	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
> +	      {
> +		vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
> +						  slp_node, slp_node_instance,
> +						  exit);
> +		break;
> +	      }
> +	}
> +
>        vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
> -					slp_node_instance);
> +					slp_node_instance,
> +					LOOP_VINFO_IV_EXIT (loop_vinfo));
> +
>        return true;
>      }
>  
> @@ -10772,37 +10897,63 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	   lhs' = new_tree;  */
>  
>        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -      gcc_assert (single_pred_p (exit_bb));
> -
> -      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> -      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
> -
> -      gimple_stmt_iterator exit_gsi;
> -      tree new_tree
> -	= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> -					 LOOP_VINFO_IV_EXIT (loop_vinfo),
> -					 vectype, ncopies, slp_node, bitsize,
> -					 bitstart, vec_lhs, lhs_type,
> -					 &exit_gsi);
> -
> -      /* Remove existing phis that copy from lhs and create copies
> -	 from new_tree.  */
> -      gimple_stmt_iterator gsi;
> -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> -	{
> -	  gimple *phi = gsi_stmt (gsi);
> -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> -	    {
> -	      remove_phi_node (&gsi, false);
> -	      tree lhs_phi = gimple_phi_result (phi);
> -	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> -	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> -	    }
> -	  else
> -	    gsi_next (&gsi);
> -	}
> +      /* Check if we have a loop where the chosen exit is not the main
> +	 exit; in these cases for an early break we restart the iteration
> +	 the vector code was executing.  For the live values we want the
> +	 value at the start of the iteration rather than at the end.  */
> +      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> +	if (!is_gimple_debug (use_stmt)
> +	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +	  {
> +	    basic_block use_bb = gimple_bb (use_stmt);
> +	    if (!is_a <gphi *> (use_stmt))

should always be a PHI

> +	      continue;
> +	    for (auto exit_e : get_loop_exit_edges (loop))
> +	      {
> +		/* See if this exit leads to the value.  */
> +		edge dest_e = find_connected_edge (exit_e, use_bb);

When is this not exit_e->dest == use_bb?

> +		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
> +		  continue;

I'd change the above to

       FOR_EACH_IMM_USE_FAST (...)

then

   gimple_phi_arg_edge (USE_STMT (use_p), phi_arg_index_from_use (use_p))

is the exit edge you are looking for without iterating over all loop
exits.

> +		gimple *tmp_vec_stmt = vec_stmt;
> +		tree tmp_vec_lhs = vec_lhs;
> +		tree tmp_bitstart = bitstart;
> +		/* For an early exit where the exit is not in the BB that
> +		   leads to the latch we restart the iteration in the scalar
> +		   loop.  So get the first live value.  */
> +		restart_loop = restart_loop || exit_e != main_e;
> +		if (restart_loop)
> +		  {
> +		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> +		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));

Hmm, that gets you the value after the first iteration, not the one
before which would be the last value of the preceeding vector iteration?
(but we don't keep those, we'd need a PHI)

Why again do we need (non-induction) live values from the vector loop
to the epilogue loop again?

If we are dealing with an induction (or a reduction, you can check
the def type), there should be an associated PHI node to get that
vector.

That said, are you sure there's testsuite coverage for the induction
case?

> +		  }
> +
> +		gimple_stmt_iterator exit_gsi;
> +		tree new_tree
> +		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> +						   exit_e, vectype, ncopies,
> +						   slp_node, bitsize,
> +						   tmp_bitstart, tmp_vec_lhs,
> +						   lhs_type, restart_loop,
> +						   &exit_gsi);
> +
> +		/* Use the empty block on the exit to materialize the new stmts
> +		   so we can update the PHI here.  */
> +		if (gimple_phi_num_args (use_stmt) == 1)
> +		  {
> +		    auto gsi = gsi_for_stmt (use_stmt);
> +		    remove_phi_node (&gsi, false);
> +		    tree lhs_phi = gimple_phi_result (use_stmt);
> +		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> +		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> +		  }
> +		else
> +		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);

if the else case works, why not use it always?


The rest looks OK.

Richard.

> +	      }
> +	  }
>  
>        /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
>        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index fe38beb4fa1d9f8593445354f56ba52e10a040cd..27221c6e8e86034050b562ee5c15992827a8d2cb 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
>     - it has uses outside the loop.
>     - it has vdefs (it alters memory).
>     - control stmts in the loop (except for the exit condition).
> +   - it is an induction and we have multiple exits.
>  
>     CHECKME: what other side effects would the vectorizer allow?  */
>  
> @@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>  	}
>      }
>  
> +  /* Check if it's an induction and multiple exits.  In this case there will be
> +     a usage later on after peeling which is needed for the alternate exit.  */
> +  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +    {
> +      if (dump_enabled_p ())
> +	  dump_printf_loc (MSG_NOTE, vect_location,
> +			   "vec_stmt_relevant_p: induction forced for "
> +			   "early break.\n");
> +      *live_p = true;
> +
> +    }
> +
>    if (*live_p && *relevant == vect_unused_in_scope
>        && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
>      {
> @@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
>  /* If the target supports a permute mask that reverses the elements in
>     a vector of type VECTYPE, return that mask, otherwise return null.  */
>  
> -static tree
> +tree
>  perm_mask_for_reverse (tree vectype)
>  {
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> @@ -12720,20 +12734,27 @@ can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
>  			  bool vec_stmt_p,
>  			  stmt_vector_for_cost *cost_vec)
>  {
> +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    if (slp_node)
>      {
>        stmt_vec_info slp_stmt_info;
>        unsigned int i;
>        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
>  	{
> -	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
> +	  if ((STMT_VINFO_LIVE_P (slp_stmt_info)
> +	       || (loop_vinfo
> +		   && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		   && STMT_VINFO_DEF_TYPE (slp_stmt_info)
> +			== vect_induction_def))
>  	      && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
>  					       slp_node_instance, i,
>  					       vec_stmt_p, cost_vec))
>  	    return false;
>  	}
>      }
> -  else if (STMT_VINFO_LIVE_P (stmt_info)
> +  else if ((STMT_VINFO_LIVE_P (stmt_info)
> +	    || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
>  	   && !vectorizable_live_operation (vinfo, stmt_info,
>  					    slp_node, slp_node_instance, -1,
>  					    vec_stmt_p, cost_vec))
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index de60da31e2a3030a7fbc302d3f676af9683fd019..fd4b0a787e6128b43c5ca2b0612f55845e6b3cef 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2248,6 +2248,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
>  				enum vect_def_type *,
>  				tree *, stmt_vec_info * = NULL);
>  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> +extern tree perm_mask_for_reverse (tree);
>  extern bool supportable_widening_operation (vec_info*, code_helper,
>  					    stmt_vec_info, tree, tree,
>  					    code_helper*, code_helper*,
>
  
Tamar Christina Nov. 29, 2023, 9:22 p.m. UTC | #10
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, November 29, 2023 2:29 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH 8/21]middle-end: update vectorizable_live_reduction
> with support for multiple exits and different exits
> 
> On Mon, 27 Nov 2023, Tamar Christina wrote:
> 
> >  >
> > > > This is a respun patch with a fix for VLA.
> > > >
> > > > This adds support to vectorizable_live_reduction to handle
> > > > multiple exits by doing a search for which exit the live value should be
> materialized in.
> > > >
> > > > Additionally which value in the index we're after depends on
> > > > whether the exit it's materialized in is an early exit or whether
> > > > the loop's main exit is different from the loop's natural one
> > > > (i.e. the one with the same src block as the latch).
> > > >
> > > > In those two cases we want the first rather than the last value as
> > > > we're going to restart the iteration in the scalar loop.  For VLA
> > > > this means we need to reverse both the mask and vector since
> > > > there's only a way to get the last active element and not the first.
> > > >
> > > > For inductions and multiple exits:
> > > >   - we test if the target will support vectorizing the induction
> > > >   - mark all inductions in the loop as relevant
> > > >   - for codegen of non-live inductions during codegen
> > > >   - induction during an early exit gets the first element rather than last.
> > > >
> > > > For reductions and multiple exits:
> > > >   - Reductions for early exits reduces the reduction definition statement
> > > >     rather than the reduction step.  This allows us to get the value at the
> > > >     start of the iteration.
> > > >   - The peeling layout means that we just have to update one
> > > > block, the
> > > merge
> > > >     block.  We expect all the reductions to be the same but we leave it up to
> > > >     the value numbering to clean up any duplicate code as we iterate over
> all
> > > >     edges.
> > > >
> > > > These two changes fix the reduction codegen given before which has
> > > > been added to the testsuite for early vect.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
> > > > 	(vect_analyze_loop_operations): Check if target supports
> > > > vectorizing
> > > IV.
> > > > 	(vect_transform_loop): Call vectorizable_live_operation for non-live
> > > > 	inductions or reductions.
> > > > 	(find_connected_edge, vectorizable_live_operation_1): New.
> > > > 	(vect_create_epilog_for_reduction): Support reductions in early break.
> > > > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > > > 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> > > > 	relevant.
> > > > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> > > > 	(vect_iv_increment_position): New.
> > > > 	* tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> > > >
> > > > --- inline copy of patch ---
> > > >
> > > > diff --git a/gcc/tree-vect-loop-manip.cc
> > > > b/gcc/tree-vect-loop-manip.cc index
> > > >
> > >
> 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401
> > > aecd4e7
> > > > eaaaa9e2b8a0 100644
> > > > --- a/gcc/tree-vect-loop-manip.cc
> > > > +++ b/gcc/tree-vect-loop-manip.cc
> > > > @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type,
> > > gimple_seq *seq,
> > > >     INSERT_AFTER is set to true if the increment should be inserted after
> > > >     *BSI.  */
> > > >
> > > > -static void
> > > > +void
> > > >  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
> > > >  			    bool *insert_after)
> > > >  {
> > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> > > >
> > >
> 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e698
> > > 33821907
> > > > 92602cb25af1 100644
> > > > --- a/gcc/tree-vect-loop.cc
> > > > +++ b/gcc/tree-vect-loop.cc
> > > > @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations
> (loop_vec_info
> > > loop_vinfo)
> > > >  	    ok = vectorizable_live_operation (loop_vinfo, stmt_info,
> > > > NULL,
> > > NULL,
> > > >  					      -1, false, &cost_vec);
> > > >
> > > > +	  /* Check if we can perform the operation for early break if we force
> > > > +	     the live operation.  */
> > > > +	  if (ok
> > > > +	      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > > +	      && !STMT_VINFO_LIVE_P (stmt_info)
> > > > +	      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> > > > +	    ok = vectorizable_live_operation (loop_vinfo, stmt_info,
> > > > +NULL,
> > > NULL,
> > > > +					      -1, false, &cost_vec);
> > >
> > > can you add && !PURE_SLP_STMT?
> > >
> >
> > I've cleaned up the patch a bit more, so these hunks are now all gone.
> >
> > > > @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction
> > > (loop_vec_info loop_vinfo,
> > > >           Store them in NEW_PHIS.  */
> > > >    if (double_reduc)
> > > >      loop = outer_loop;
> > > > -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > > > +  /* We need to reduce values in all exits.  */  exit_bb =
> > > > + loop_exit->dest;
> > > >    exit_gsi = gsi_after_labels (exit_bb);
> > > >    reduc_inputs.create (slp_node ? vec_num : ncopies);
> > > > +  vec <gimple *> vec_stmts;
> > > > +  if (main_exit_p)
> > > > +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);  else
> > > > +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF
> > > > + (rdef_info));
> > >
> > > both would be wrong for SLP, also I think you need to look at
> > > STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))?  For SLP the PHI
> > > SLP node is reached via slp_node_instance->reduc_phis.
> > >
> > > I think an overall better structure would be to add a
> > >
> > > vect_get_vect_def (stmt_vec_info, slp_tree, unsigned);
> > >
> > > abstracting SLP and non-SLP and doing
> > >
> > >   for (unsigned i = 0; i < vec_num * ncopies; ++i)
> > >     {
> > >       def = vect_get_vect_def (stmt_info, slp_node, i); ...
> > >     }
> > >
> > > and then adjusting stmt_info/slp_node according to main_exit_p?
> >
> > Done.
> >
> > > (would be nice to transition stmt_info->vec_stmts to
> > > stmt_info->vec_defs)
> >
> > True. I guess since the plan is to remove non-SLP next year this'll just go
> away anyway.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-loop.cc (vectorizable_live_operation,
> > 	vectorizable_live_operation_1): Support early exits.
> > 	(can_vectorize_live_stmts): Call vectorizable_live_operation for non-
> live
> > 	inductions or reductions.
> > 	(find_connected_edge, vect_get_vect_def): New.
> > 	(vect_create_epilog_for_reduction): Support reductions in early break.
> > 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> > 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> > 	live.
> > 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index
> >
> f38cc47551488525b15c2be758cac8291dbefb3a..4e48217a31e59318c2ea8
> e5ab63b
> > 06ba19840cbd 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -3346,6 +3346,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree
> niters, tree nitersm1,
> >  	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)-
> >count ();
> >  	  bb_before_epilog = loop_preheader_edge (epilog)->src;
> >  	}
> > +
> >        /* If loop is peeled for non-zero constant times, now niters refers to
> >  	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
> >  	 overflows.  */
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index
> >
> df5e1d28fac2ce35e71decdec0d8e31fb75557f5..90041d1e138afb08c0116f
> 48f517
> > fe0fcc615557 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -5831,6 +5831,34 @@ vect_create_partial_epilog (tree vec_def, tree
> vectype, code_helper code,
> >    return new_temp;
> >  }
> >
> > +/* Retrieves the definining statement to be used for a reduction.
> > +   For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look
> at
> > +   the reduction definitions.  */
> > +
> > +tree
> > +vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
> > +		   slp_instance slp_node_instance, bool main_exit_p, unsigned
> i,
> > +		   vec <gimple *> &vec_stmts)
> > +{
> > +  tree def;
> > +
> > +  if (slp_node)
> > +    {
> > +      if (!main_exit_p)
> > +        slp_node = slp_node_instance->reduc_phis;
> > +      def = vect_get_slp_vect_def (slp_node, i);
> > +    }
> > +  else
> > +    {
> > +      if (!main_exit_p)
> > +	reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
> > +      vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
> > +      def = gimple_get_lhs (vec_stmts[0]);
> > +    }
> > +
> > +  return def;
> > +}
> > +
> >  /* Function vect_create_epilog_for_reduction
> >
> >     Create code at the loop-epilog to finalize the result of a
> > reduction @@ -5842,6 +5870,8 @@ vect_create_partial_epilog (tree
> vec_def, tree vectype, code_helper code,
> >     SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
> >     REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction
> phi
> >       (counting from 0)
> > +   LOOP_EXIT is the edge to update in the merge block.  In the case of a
> single
> > +     exit this edge is always the main loop exit.
> >
> >     This function:
> >     1. Completes the reduction def-use cycles.
> > @@ -5882,7 +5912,8 @@ static void
> >  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >  				  stmt_vec_info stmt_info,
> >  				  slp_tree slp_node,
> > -				  slp_instance slp_node_instance)
> > +				  slp_instance slp_node_instance,
> > +				  edge loop_exit)
> >  {
> >    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
> >    gcc_assert (reduc_info->is_reduc_info); @@ -5891,6 +5922,7 @@
> > vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
> >       loop-closed PHI of the inner loop which we remember as
> >       def for the reduction PHI generation.  */
> >    bool double_reduc = false;
> > +  bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
> >    stmt_vec_info rdef_info = stmt_info;
> >    if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
> >      {
> > @@ -6053,7 +6085,7 @@ vect_create_epilog_for_reduction (loop_vec_info
> loop_vinfo,
> >        /* Create an induction variable.  */
> >        gimple_stmt_iterator incr_gsi;
> >        bool insert_after;
> > -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> > +      vect_iv_increment_position (loop_exit, &incr_gsi,
> > + &insert_after);
> >        create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
> >  		 insert_after, &indx_before_incr, &indx_after_incr);
> >
> > @@ -6132,23 +6164,23 @@ vect_create_epilog_for_reduction
> (loop_vec_info loop_vinfo,
> >           Store them in NEW_PHIS.  */
> >    if (double_reduc)
> >      loop = outer_loop;
> > -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > +  /* We need to reduce values in all exits.  */  exit_bb =
> > + loop_exit->dest;
> >    exit_gsi = gsi_after_labels (exit_bb);
> >    reduc_inputs.create (slp_node ? vec_num : ncopies);
> > +  vec <gimple *> vec_stmts;
> >    for (unsigned i = 0; i < vec_num; i++)
> >      {
> >        gimple_seq stmts = NULL;
> > -      if (slp_node)
> > -	def = vect_get_slp_vect_def (slp_node, i);
> > -      else
> > -	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
> > +      def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
> > +			       main_exit_p, i, vec_stmts);
> >        for (j = 0; j < ncopies; j++)
> >  	{
> >  	  tree new_def = copy_ssa_name (def);
> >  	  phi = create_phi_node (new_def, exit_bb);
> >  	  if (j)
> > -	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> > -	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)-
> >dest_idx, def);
> > +	    def = gimple_get_lhs (vec_stmts[j]);
> > +	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
> >  	  new_def = gimple_convert (&stmts, vectype, new_def);
> >  	  reduc_inputs.quick_push (new_def);
> >  	}
> > @@ -6882,10 +6914,33 @@ vect_create_epilog_for_reduction
> (loop_vec_info loop_vinfo,
> >  	    }
> >
> >            scalar_result = scalar_results[k];
> > +	  edge merge_e = loop_exit;
> > +	  if (!main_exit_p)
> > +	    merge_e = single_succ_edge (loop_exit->dest);
> >            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
> >  	    {
> >  	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> > -		SET_USE (use_p, scalar_result);
> > +		{
> > +		  if (main_exit_p)
> > +		    SET_USE (use_p, scalar_result);
> > +		  else
> > +		    {
> > +		      /* When multiple exits the same SSA name can appear in
> > +			 both the main and the early exits.  The meaning of the
> > +			 reduction however is not the same.  In the main exit
> > +			 case the meaning is "get the last value" and in the
> > +			 early exit case it means "get the first value".  As
> > +			 such we should only update the value for the exit
> > +			 attached to loop_exit.  To make this easier we always
> > +			 call vect_create_epilog_for_reduction on the early
> > +			 exit main block first.  As such for the main exit we
> > +			 no longer have to perform the BB check.  */
> > +		      gphi *stmt = as_a <gphi *> (USE_STMT (use_p));
> > +		      int idx = phi_arg_index_from_use (use_p);
> > +		      if (gimple_phi_arg_edge (stmt, idx) == merge_e)
> > +			SET_USE (use_p, scalar_result);
> 
> Hmm, I guess I still don't understand.  This code tries, in the reduction epilog
> 
>   # scalar_result_1 = PHI <..>
>   # vector_result_2 = PHI <..>
>   _3 = ... reduce vector_result_2 ...;
> 
> replace uses of scalar_result_1 with _3 of which there could be many,
> including in debug stmts (there doesn't have to be an epilog loop after all).
> 
> Now, for an early exit we know there _is_ an epilog loop and we have a merge
> block merging early exits before merging with the main exit.  We have forced(?)
> PHI nodes to merge the individual early exit reduction results?

Sure, but you only always go to it for the early exit.  Not for the main one.
That one is still decided by the exit guard.

> 
> Either I can't see how we can end up with multiple uses or I can't see how the
> main_exit_p case cannot also stomp on those?
> 
> Maybe it's related to the other question whether we are emitting a reduction
> epilogue for each of the early exits or just once.

We aren't; we are only doing so once.  We loop over the exits to find the
alternative exit, and after the first one is found we break out.  This is because we have
no easy way to identify the merge block other than to iterate.

To explain the above lets look at an example with a reduction (testcase vect-early-break_16.c)

#define N 1024
unsigned vect_a[N];
unsigned vect_b[N];

unsigned test4(unsigned x)
{
 unsigned ret = 0;
 for (int i = 0; i < N; i++)
 {
   vect_b[i] = x + i;
   if (vect_a[i] > x)
     return vect_a[i];
   vect_a[i] = x;
   ret += vect_a[i] + vect_b[i];
 }
 return ret;
}

This will give you this graph after peeling and updating of IVs
https://gist.github.com/Mistuke/c2d632498ceeb10e24a9057bafd87412

This function does not need the epilogue.  So when the guard is added,
the condition is always false.  But it hasn't been folded away yet.

Because of this you have the same PHI on both the edge to the
function exit and the edge to the merge block, at least until CFG cleanup.

However, thinking about it some more, the possibility is that we remove
the main edge from the merge block, so if I just handle the main edge
first then the SSA chain can never be broken and the check isn't needed.

Fixed, will be in next update.

> 
> > +		    }
> > +		}
> >  	      update_stmt (use_stmt);
> >  	    }
> >          }
> > @@ -10481,15 +10536,17 @@ vectorizable_induction (loop_vec_info
> loop_vinfo,
> >    return true;
> >  }
> >
> > -
> >  /* Function vectorizable_live_operation_1.
> > +
> >     helper function for vectorizable_live_operation.  */
> > +
> >  tree
> >  vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> >  			       stmt_vec_info stmt_info, edge exit_e,
> >  			       tree vectype, int ncopies, slp_tree slp_node,
> >  			       tree bitsize, tree bitstart, tree vec_lhs,
> > -			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
> > +			       tree lhs_type, bool restart_loop,
> > +			       gimple_stmt_iterator *exit_gsi)
> >  {
> >    basic_block exit_bb = exit_e->dest;
> >    gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS
> > (loop_vinfo)); @@ -10504,7 +10561,9 @@ vectorizable_live_operation_1
> (loop_vec_info loop_vinfo,
> >    if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> >      {
> >        /* Emit:
> > +
> >  	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > +
> >  	 where VEC_LHS is the vectorized live-out result and MASK is
> >  	 the loop mask for the final iteration.  */
> >        gcc_assert (ncopies == 1 && !slp_node); @@ -10513,15 +10572,18
> > @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> >        tree len = vect_get_loop_len (loop_vinfo, &gsi,
> >  				    &LOOP_VINFO_LENS (loop_vinfo),
> >  				    1, vectype, 0, 0);
> > +
> >        /* BIAS - 1.  */
> >        signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
> >        tree bias_minus_one
> >  	= int_const_binop (MINUS_EXPR,
> >  			   build_int_cst (TREE_TYPE (len), biasval),
> >  			   build_one_cst (TREE_TYPE (len)));
> > +
> >        /* LAST_INDEX = LEN + (BIAS - 1).  */
> >        tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> >  				     len, bias_minus_one);
> > +
> >        /* This needs to implement extraction of the first index, but not sure
> >  	 how the LEN stuff works.  At the moment we shouldn't get here since
> >  	 there's no LEN support for early breaks.  But guard this so there's
> > @@ -10532,13 +10594,16 @@ vectorizable_live_operation_1
> (loop_vec_info loop_vinfo,
> >        tree scalar_res
> >  	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> >  			vec_lhs_phi, last_index);
> > +
> >        /* Convert the extracted vector element to the scalar type.  */
> >        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> >      }
> >    else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> >      {
> >        /* Emit:
> > +
> >  	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> > +
> >  	 where VEC_LHS is the vectorized live-out result and MASK is
> >  	 the loop mask for the final iteration.  */
> >        gcc_assert (!slp_node);
> > @@ -10548,10 +10613,38 @@ vectorizable_live_operation_1
> (loop_vec_info loop_vinfo,
> >        tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
> >  				      &LOOP_VINFO_MASKS (loop_vinfo),
> >  				      1, vectype, 0);
> > +      tree scalar_res;
> > +
> > +      /* For an inverted control flow with early breaks we want
> EXTRACT_FIRST
> > +	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask.
> */
> > +      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > +	{
> > +	  /* First create the permuted mask.  */
> > +	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> > +	  tree perm_dest = copy_ssa_name (mask);
> > +	  gimple *perm_stmt
> > +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> > +				       mask, perm_mask);
> > +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > +				       &gsi);
> > +	  mask = perm_dest;
> > +
> > +	  /* Then permute the vector contents.  */
> > +	  tree perm_elem = perm_mask_for_reverse (vectype);
> > +	  perm_dest = copy_ssa_name (vec_lhs_phi);
> > +	  perm_stmt
> > +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> vec_lhs_phi,
> > +				       vec_lhs_phi, perm_elem);
> > +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> > +				       &gsi);
> > +	  vec_lhs_phi = perm_dest;
> > +	}
> >
> >        gimple_seq_add_seq (&stmts, tem);
> > -       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
> scalar_type,
> > -				       mask, vec_lhs_phi);
> > +
> > +      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> > +				 mask, vec_lhs_phi);
> > +
> >        /* Convert the extracted vector element to the scalar type.  */
> >        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> >      }
> > @@ -10564,12 +10657,26 @@ vectorizable_live_operation_1
> (loop_vec_info loop_vinfo,
> >        new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> >  				       &stmts, true, NULL_TREE);
> >      }
> > +
> >    *exit_gsi = gsi_after_labels (exit_bb);
> >    if (stmts)
> >      gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
> > +
> >    return new_tree;
> >  }
> >
> > +/* Find the edge that's the final one in the path from SRC to DEST and
> > +   return it.  This edge must exist in at most one forwarder edge
> > +between.  */
> > +
> > +static edge
> > +find_connected_edge (edge src, basic_block dest) {
> > +   if (src->dest == dest)
> > +     return src;
> > +
> > +  return find_edge (src->dest, dest); }
> > +
> >  /* Function vectorizable_live_operation.
> >
> >     STMT_INFO computes a value that is used outside the loop.  Check
> > if @@ -10594,7 +10701,8 @@ vectorizable_live_operation (vec_info *vinfo,
> stmt_vec_info stmt_info,
> >    int vec_entry = 0;
> >    poly_uint64 vec_index = 0;
> >
> > -  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
> > +  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
> > +	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> >
> >    /* If a stmt of a reduction is live, vectorize it via
> >       vect_create_epilog_for_reduction.  vectorizable_reduction
> > assessed @@ -10619,8 +10727,25 @@ vectorizable_live_operation
> (vec_info *vinfo, stmt_vec_info stmt_info,
> >        if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
> >  	  || STMT_VINFO_REDUC_TYPE (reduc_info) ==
> EXTRACT_LAST_REDUCTION)
> >  	return true;
> > +
> > +      /* If early break we only have to materialize the reduction on the merge
> > +	 block, but we have to find an alternate exit first.  */
> > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> > +	{
> > +	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP
> (loop_vinfo)))
> > +	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
> > +	      {
> > +		vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
> > +						  slp_node,
> slp_node_instance,
> > +						  exit);
> > +		break;
> > +	      }
> > +	}
> > +
> >        vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
> > -					slp_node_instance);
> > +					slp_node_instance,
> > +					LOOP_VINFO_IV_EXIT (loop_vinfo));
> > +
> >        return true;
> >      }
> >
> > @@ -10772,37 +10897,63 @@ vectorizable_live_operation (vec_info
> *vinfo, stmt_vec_info stmt_info,
> >  	   lhs' = new_tree;  */
> >
> >        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> > -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> > -      gcc_assert (single_pred_p (exit_bb));
> > -
> > -      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> > -      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> > -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx,
> vec_lhs);
> > -
> > -      gimple_stmt_iterator exit_gsi;
> > -      tree new_tree
> > -	= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> > -					 LOOP_VINFO_IV_EXIT (loop_vinfo),
> > -					 vectype, ncopies, slp_node, bitsize,
> > -					 bitstart, vec_lhs, lhs_type,
> > -					 &exit_gsi);
> > -
> > -      /* Remove existing phis that copy from lhs and create copies
> > -	 from new_tree.  */
> > -      gimple_stmt_iterator gsi;
> > -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> > -	{
> > -	  gimple *phi = gsi_stmt (gsi);
> > -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> > -	    {
> > -	      remove_phi_node (&gsi, false);
> > -	      tree lhs_phi = gimple_phi_result (phi);
> > -	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > -	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > -	    }
> > -	  else
> > -	    gsi_next (&gsi);
> > -	}
> > +      /* Check if we have a loop where the chosen exit is not the main exit,
> > +	 in these cases for an early break we restart the iteration the vector
> code
> > +	 did.  For the live values we want the value at the start of the iteration
> > +	 rather than at the end.  */
> > +      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> > +      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED
> (loop_vinfo);
> > +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> > +	if (!is_gimple_debug (use_stmt)
> > +	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> > +	  {
> > +	    basic_block use_bb = gimple_bb (use_stmt);
> > +	    if (!is_a <gphi *> (use_stmt))
> 
> should always be a PHI
> 
> > +	      continue;
> > +	    for (auto exit_e : get_loop_exit_edges (loop))
> > +	      {
> > +		/* See if this exit leads to the value.  */
> > +		edge dest_e = find_connected_edge (exit_e, use_bb);
> 
> When is this not exit_e->dest == use_bb?

The main exit blocks have an intermediate block in between it and the merge block
that contains only the same PHI nodes as the original exit, and in an order to allow
it to be easily linked to epilogue's main exit.

That block won't contain induction values as they're only needed in the merge block.
With that said.. simplified the code.

> 
> > +		if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e)
> != lhs)
> > +		  continue;
> 
> I'd change the above to
> 
>        FOR_EACH_IMM_USE_FAST (...)
> 
> then
> 
>    gimple_phi_arg_edge (USE_STMT (use_p), phi_arg_index_from_use (use_p))
> 
> is the exit edge you are looking for without iterating over all loop exits.
> 
> > +		gimple *tmp_vec_stmt = vec_stmt;
> > +		tree tmp_vec_lhs = vec_lhs;
> > +		tree tmp_bitstart = bitstart;
> > +		/* For early exit where the exit is not in the BB that leads
> > +		   to the latch then we're restarting the iteration in the
> > +		   scalar loop.  So get the first live value.  */
> > +		restart_loop = restart_loop || exit_e != main_e;
> > +		if (restart_loop)
> > +		  {
> > +		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> > +		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> > +		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> 
> Hmm, that gets you the value after the first iteration, not the one before which
> would be the last value of the preceeding vector iteration?
> (but we don't keep those, we'd need a PHI)

I don't fully follow.  The comment on top of this hunk under if (loop_vinfo) states
that lhs should be pointing to a PHI.

When I inspect the statement I see

i_14 = PHI <i_11(6), 0(14)>

so i_14 is the value at the start of the current iteration.  If we're coming from the
header it's 0, otherwise it's i_11, which is the value of the previous iteration?

The peeling code explicitly leaves i_14 in the merge block and not i_11 for this exact reason.
So I'm confused, my understanding is that we're already *at* the right PHI.

Is it perhaps that you thought we put i_11 here for the early exits? In which case
yes, I'd agree that that would be wrong, and there we would have had to look at
the defs, but i_11 is the def.

I already kept this in mind and leveraged peeling to make this part easier.
i_11 is used in the main exit and i_14 in the early one.

> 
> Why again do we need (non-induction) live values from the vector loop to the
> epilogue loop again?

They can appear as the result value of the main exit.

e.g. in testcase (vect-early-break_17.c)

#define N 1024
unsigned vect_a[N];
unsigned vect_b[N];

unsigned test4(unsigned x)
{
 unsigned ret = 0;
 for (int i = 0; i < N; i++)
 {
   vect_b[i] = x + i;
   if (vect_a[i] > x)
     return vect_a[i];
   vect_a[i] = x;
   ret = vect_a[i] + vect_b[i];
 }
 return ret;
}

The only situation in which they can appear as an early-break is when
we have a case where the main exit != the latch-connected exit.

However in these cases they are unused, and only there because
normally you would have exited (i.e. there was a return) but the
vector loop needs to start over so we ignore it.

These happen in testcase vect-early-break_74.c and
vect-early-break_78.c

> 
> If we are dealing with an induction (or a reduction, you can check the def
> type), there should be an associated PHI node to get that vector.
> 
> That said, are you sure there's testsuite coverage for the induction case?

Well, we now require it for every IV-related variable between the two loops.
So there's not a single testcase that doesn't use it.

> 
> > +		  }
> > +
> > +		gimple_stmt_iterator exit_gsi;
> > +		tree new_tree
> > +		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> > +						   exit_e, vectype, ncopies,
> > +						   slp_node, bitsize,
> > +						   tmp_bitstart, tmp_vec_lhs,
> > +						   lhs_type, restart_loop,
> > +						   &exit_gsi);
> > +
> > +		/* Use the empty block on the exit to materialize the new
> stmts
> > +		   so we can use update the PHI here.  */
> > +		if (gimple_phi_num_args (use_stmt) == 1)
> > +		  {
> > +		    auto gsi = gsi_for_stmt (use_stmt);
> > +		    remove_phi_node (&gsi, false);
> > +		    tree lhs_phi = gimple_phi_result (use_stmt);
> > +		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > +		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > +		  }
> > +		else
> > +		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
> 
> if the else case works, why not use it always?

Because it doesn't work for the main exit.  The early exits have an intermediate block
that is used to generate the statements on, so for them we are fine updating the
use in place.

The main exits don't, and so the existing trick the vectorizer uses is to materialize
the statements in the same block and then dissolve the phi node.   However you
can't do that for the early exit because the phi node isn't singular.

Now I know what your next question is: well, why don't we just use the same method
for both?  When I create an extra intermediate block for this, reduction vectorization
for VLA seems to go off the rails.  The code is still correct, just highly inefficient.

That is because without having code to prevent this, peeling will create LCSSA PHI blocks
with singular entries.  These should be harmless, but VLA reductions generate some
unpacking and packing to deal with them.  I tried to figure out why but this is a large bit
of code.  So for now I went with the simpler approach of replacing the use only for the
early exit, where we never have the intermediate PHIs.

Thanks,
Tamar

> 
> 
> The rest looks OK.
> 
> Richard.
> 
> > +	      }
> > +	  }
> >
> >        /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
> >        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) diff --git
> > a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> >
> fe38beb4fa1d9f8593445354f56ba52e10a040cd..27221c6e8e86034050b56
> 2ee5c15
> > 992827a8d2cb 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info
> stmt_info,
> >     - it has uses outside the loop.
> >     - it has vdefs (it alters memory).
> >     - control stmts in the loop (except for the exit condition).
> > +   - it is an induction and we have multiple exits.
> >
> >     CHECKME: what other side effects would the vectorizer allow?  */
> >
> > @@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
> >  	}
> >      }
> >
> > +  /* Check if it's an induction and multiple exits.  In this case there will be
> > +     a usage later on after peeling which is needed for the alternate
> > +exit.  */
> > +  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> > +    {
> > +      if (dump_enabled_p ())
> > +	  dump_printf_loc (MSG_NOTE, vect_location,
> > +			   "vec_stmt_relevant_p: induction forced for "
> > +			   "early break.\n");
> > +      *live_p = true;
> > +
> > +    }
> > +
> >    if (*live_p && *relevant == vect_unused_in_scope
> >        && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
> >      {
> > @@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo,
> > stmt_vec_info stmt_info)
> >  /* If the target supports a permute mask that reverses the elements in
> >     a vector of type VECTYPE, return that mask, otherwise return null.
> > */
> >
> > -static tree
> > +tree
> >  perm_mask_for_reverse (tree vectype)
> >  {
> >    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); @@ -12720,20
> > +12734,27 @@ can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info
> stmt_info,
> >  			  bool vec_stmt_p,
> >  			  stmt_vector_for_cost *cost_vec)
> >  {
> > +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> >    if (slp_node)
> >      {
> >        stmt_vec_info slp_stmt_info;
> >        unsigned int i;
> >        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i,
> slp_stmt_info)
> >  	{
> > -	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
> > +	  if ((STMT_VINFO_LIVE_P (slp_stmt_info)
> > +	       || (loop_vinfo
> > +		   && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +		   && STMT_VINFO_DEF_TYPE (slp_stmt_info)
> > +			== vect_induction_def))
> >  	      && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
> >  					       slp_node_instance, i,
> >  					       vec_stmt_p, cost_vec))
> >  	    return false;
> >  	}
> >      }
> > -  else if (STMT_VINFO_LIVE_P (stmt_info)
> > +  else if ((STMT_VINFO_LIVE_P (stmt_info)
> > +	    || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +		&& STMT_VINFO_DEF_TYPE (stmt_info) ==
> vect_induction_def))
> >  	   && !vectorizable_live_operation (vinfo, stmt_info,
> >  					    slp_node, slp_node_instance, -1,
> >  					    vec_stmt_p, cost_vec))
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index
> >
> de60da31e2a3030a7fbc302d3f676af9683fd019..fd4b0a787e6128b43c5ca2
> b0612f
> > 55845e6b3cef 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -2248,6 +2248,7 @@ extern bool vect_is_simple_use (vec_info *,
> stmt_vec_info, slp_tree,
> >  				enum vect_def_type *,
> >  				tree *, stmt_vec_info * = NULL);
> >  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> > +extern tree perm_mask_for_reverse (tree);
> >  extern bool supportable_widening_operation (vec_info*, code_helper,
> >  					    stmt_vec_info, tree, tree,
> >  					    code_helper*, code_helper*,
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
  
Tamar Christina Dec. 6, 2023, 4:21 a.m. UTC | #11
> > > is the exit edge you are looking for without iterating over all loop exits.
> > >
> > > > +		gimple *tmp_vec_stmt = vec_stmt;
> > > > +		tree tmp_vec_lhs = vec_lhs;
> > > > +		tree tmp_bitstart = bitstart;
> > > > +		/* For early exit where the exit is not in the BB that leads
> > > > +		   to the latch then we're restarting the iteration in the
> > > > +		   scalar loop.  So get the first live value.  */
> > > > +		restart_loop = restart_loop || exit_e != main_e;
> > > > +		if (restart_loop)
> > > > +		  {
> > > > +		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> > > > +		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> > > > +		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> > >
> > > Hmm, that gets you the value after the first iteration, not the one before which
> > > would be the last value of the preceeding vector iteration?
> > > (but we don't keep those, we'd need a PHI)
> >
> > I don't fully follow.  The comment on top of this hunk under if (loop_vinfo) states
> > that lhs should be pointing to a PHI.
> >
> > When I inspect the statement I see
> >
> > i_14 = PHI <i_11(6), 0(14)>
> >
> > so i_14 is the value at the start of the current iteration.  If we're coming from the
> > header 0, otherwise i_11 which is the value of the previous iteration?
> >
> > The peeling code explicitly leaves i_14 in the merge block and not i_11 for this
> exact reason.
> > So I'm confused, my understanding is that we're already *at* the right PHI.
> >
> > Is it perhaps that you thought we put i_11 here for the early exits? In which case
> > Yes I'd agree that that would be wrong, and there we would have had to look at
> > The defs, but i_11 is the def.
> >
> > I already kept this in mind and leveraged peeling to make this part easier.
> > i_11 is used in the main exit and i_14 in the early one.
> 
> I think the important detail is that this code is only executed for
> vect_induction_defs which are indeed PHIs and so we're sure the
> value live is before any modification so fine to feed as initial
> value for the PHI in the epilog.
> 
> Maybe we can assert the def type here?

We can't assert because until cfg cleanup the dead value is still seen and still
vectorized.  That said I've added a guard here.  We vectorize the non-induction
value as normal now and if it's ever used it'll fail.

> 
> > >
> > > Why again do we need (non-induction) live values from the vector loop to the
> > > epilogue loop again?
> >
> > They can appear as the result value of the main exit.
> >
> > e.g. in testcase (vect-early-break_17.c)
> >
> > #define N 1024
> > unsigned vect_a[N];
> > unsigned vect_b[N];
> >
> > unsigned test4(unsigned x)
> > {
> >  unsigned ret = 0;
> >  for (int i = 0; i < N; i++)
> >  {
> >    vect_b[i] = x + i;
> >    if (vect_a[i] > x)
> >      return vect_a[i];
> >    vect_a[i] = x;
> >    ret = vect_a[i] + vect_b[i];
> >  }
> >  return ret;
> > }
> >
> > The only situation they can appear in as an early-break is when
> > we have a case where the main exit != the latch-connected exit.
> >
> > However in these cases they are unused, and only there because
> > normally you would have exited (i.e. there was a return) but the
> > vector loop needs to start over so we ignore it.
> >
> > These happen in testcase vect-early-break_74.c and
> > vect-early-break_78.c
> 
> Hmm, so in that case their value is incorrect (but doesn't matter,
> we ignore it)?
> 

Correct, they're placed there due to exit redirection, but in these inverted
testcases where we've peeled the vector iteration you can't ever skip the
epilogue.  So they are guaranteed not to be used.

> > > > +		gimple_stmt_iterator exit_gsi;
> > > > +		tree new_tree
> > > > +		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> > > > +						   exit_e, vectype, ncopies,
> > > > +						   slp_node, bitsize,
> > > > +						   tmp_bitstart, tmp_vec_lhs,
> > > > +						   lhs_type, restart_loop,
> > > > +						   &exit_gsi);
> > > > +
> > > > +		/* Use the empty block on the exit to materialize the new
> > > stmts
> > > > +		   so we can use update the PHI here.  */
> > > > +		if (gimple_phi_num_args (use_stmt) == 1)
> > > > +		  {
> > > > +		    auto gsi = gsi_for_stmt (use_stmt);
> > > > +		    remove_phi_node (&gsi, false);
> > > > +		    tree lhs_phi = gimple_phi_result (use_stmt);
> > > > +		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > > > +		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > > > +		  }
> > > > +		else
> > > > +		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
> > >
> > > if the else case works, why not use it always?
> >
> > Because it doesn't work for the main exit.  The early exits have an intermediate block
> > that is used to generate the statements on, so for them we are fine updating the
> > use in place.
> >
> > The main exits don't. and so the existing trick the vectorizer uses is to materialize
> > the statements in the same block and then dissolves the phi node.   However you
> > can't do that for the early exit because the phi node isn't singular.
> 
> But if the PHI has a single arg you can replace that?  By making a
> copy stmt from it don't you break LC SSA?
> 

Yeah, what the existing code is sneakily doing is this:

It has to vectorize

x = PHI <y>
y gets vectorized as z, but

x = PHI <z>
z = ...

would be invalid, so what it does, since it doesn't have a predecessor node to place stuff in,
it'll do

z = ...
x = z

and removes the PHI.  The PHI was only placed there for vectorization so it's not needed
after this point.  It's also for this reason that the code passes around a gimple_seq, since
it needs to make sure it gets the order right when inserting statements.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-loop.cc (vectorizable_live_operation,
	vectorizable_live_operation_1): Support early exits.
	(can_vectorize_live_stmts): Call vectorizable_live_operation for non-live
	inductions or reductions.
	(find_connected_edge, vect_get_vect_def): New.
	(vect_create_epilog_for_reduction): Support reductions in early break.
	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
	(vect_stmt_relevant_p): Mark all inductions when early break as being
	live.
	* tree-vectorizer.h (perm_mask_for_reverse): Expose.

--- inline copy of patch ---

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index f38cc47551488525b15c2be758cac8291dbefb3a..4e48217a31e59318c2ea8e5ab63b06ba19840cbd 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3346,6 +3346,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
 	  bb_before_epilog = loop_preheader_edge (epilog)->src;
 	}
+
       /* If loop is peeled for non-zero constant times, now niters refers to
 	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
 	 overflows.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index df5e1d28fac2ce35e71decdec0d8e31fb75557f5..2f922b42f6d567dfd5da9b276b1c9d37bc681876 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -5831,6 +5831,34 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
   return new_temp;
 }
 
+/* Retrieves the defining statement to be used for a reduction.
+   For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
+   the reduction definitions.  */
+
+tree
+vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
+		   slp_instance slp_node_instance, bool main_exit_p, unsigned i,
+		   vec <gimple *> &vec_stmts)
+{
+  tree def;
+
+  if (slp_node)
+    {
+      if (!main_exit_p)
+        slp_node = slp_node_instance->reduc_phis;
+      def = vect_get_slp_vect_def (slp_node, i);
+    }
+  else
+    {
+      if (!main_exit_p)
+	reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
+      vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
+      def = gimple_get_lhs (vec_stmts[0]);
+    }
+
+  return def;
+}
+
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -5842,6 +5870,8 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
      (counting from 0)
+   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
+     exit this edge is always the main loop exit.
 
    This function:
    1. Completes the reduction def-use cycles.
@@ -5882,7 +5912,8 @@ static void
 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 				  stmt_vec_info stmt_info,
 				  slp_tree slp_node,
-				  slp_instance slp_node_instance)
+				  slp_instance slp_node_instance,
+				  edge loop_exit)
 {
   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
@@ -5891,6 +5922,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
      loop-closed PHI of the inner loop which we remember as
      def for the reduction PHI generation.  */
   bool double_reduc = false;
+  bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
   stmt_vec_info rdef_info = stmt_info;
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
     {
@@ -6053,7 +6085,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       /* Create an induction variable.  */
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
 		 insert_after, &indx_before_incr, &indx_after_incr);
 
@@ -6132,23 +6164,23 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
          Store them in NEW_PHIS.  */
   if (double_reduc)
     loop = outer_loop;
-  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+  /* We need to reduce values in all exits.  */
+  exit_bb = loop_exit->dest;
   exit_gsi = gsi_after_labels (exit_bb);
   reduc_inputs.create (slp_node ? vec_num : ncopies);
+  vec <gimple *> vec_stmts;
   for (unsigned i = 0; i < vec_num; i++)
     {
       gimple_seq stmts = NULL;
-      if (slp_node)
-	def = vect_get_slp_vect_def (slp_node, i);
-      else
-	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
+      def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
+			       main_exit_p, i, vec_stmts);
       for (j = 0; j < ncopies; j++)
 	{
 	  tree new_def = copy_ssa_name (def);
 	  phi = create_phi_node (new_def, exit_bb);
 	  if (j)
-	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
-	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
+	    def = gimple_get_lhs (vec_stmts[j]);
+	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
 	  new_def = gimple_convert (&stmts, vectype, new_def);
 	  reduc_inputs.quick_push (new_def);
 	}
@@ -10481,17 +10513,18 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   return true;
 }
 
-
 /* Function vectorizable_live_operation_1.
+
    helper function for vectorizable_live_operation.  */
+
 tree
 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
-			       stmt_vec_info stmt_info, edge exit_e,
+			       stmt_vec_info stmt_info, basic_block exit_bb,
 			       tree vectype, int ncopies, slp_tree slp_node,
 			       tree bitsize, tree bitstart, tree vec_lhs,
-			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
+			       tree lhs_type, bool restart_loop,
+			       gimple_stmt_iterator *exit_gsi)
 {
-  basic_block exit_bb = exit_e->dest;
   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
 
   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
@@ -10504,7 +10537,9 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
   if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
       /* Emit:
+
 	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
 	 where VEC_LHS is the vectorized live-out result and MASK is
 	 the loop mask for the final iteration.  */
       gcc_assert (ncopies == 1 && !slp_node);
@@ -10513,15 +10548,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree len = vect_get_loop_len (loop_vinfo, &gsi,
 				    &LOOP_VINFO_LENS (loop_vinfo),
 				    1, vectype, 0, 0);
+
       /* BIAS - 1.  */
       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
       tree bias_minus_one
 	= int_const_binop (MINUS_EXPR,
 			   build_int_cst (TREE_TYPE (len), biasval),
 			   build_one_cst (TREE_TYPE (len)));
+
       /* LAST_INDEX = LEN + (BIAS - 1).  */
       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 				     len, bias_minus_one);
+
       /* This needs to implement extraction of the first index, but not sure
 	 how the LEN stuff works.  At the moment we shouldn't get here since
 	 there's no LEN support for early breaks.  But guard this so there's
@@ -10532,13 +10570,16 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree scalar_res
 	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
 			vec_lhs_phi, last_index);
+
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
     }
   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       /* Emit:
+
 	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+
 	 where VEC_LHS is the vectorized live-out result and MASK is
 	 the loop mask for the final iteration.  */
       gcc_assert (!slp_node);
@@ -10548,10 +10589,38 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
 				      &LOOP_VINFO_MASKS (loop_vinfo),
 				      1, vectype, 0);
+      tree scalar_res;
+
+      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  /* First create the permuted mask.  */
+	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	  tree perm_dest = copy_ssa_name (mask);
+	  gimple *perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+				       mask, perm_mask);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  mask = perm_dest;
+
+	  /* Then permute the vector contents.  */
+	  tree perm_elem = perm_mask_for_reverse (vectype);
+	  perm_dest = copy_ssa_name (vec_lhs_phi);
+	  perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+				       vec_lhs_phi, perm_elem);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
+				       &gsi);
+	  vec_lhs_phi = perm_dest;
+	}
 
       gimple_seq_add_seq (&stmts, tem);
-       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-				       mask, vec_lhs_phi);
+
+      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				 mask, vec_lhs_phi);
+
       /* Convert the extracted vector element to the scalar type.  */
       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
     }
@@ -10564,12 +10633,26 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
 				       &stmts, true, NULL_TREE);
     }
+
   *exit_gsi = gsi_after_labels (exit_bb);
   if (stmts)
     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
+
   return new_tree;
 }
 
+/* Find the edge that's the final one in the path from SRC to DEST and
+   return it.  There must be at most one forwarder edge between them.  */
+
+static edge
+find_connected_edge (edge src, basic_block dest)
+{
+   if (src->dest == dest)
+     return src;
+
+  return find_edge (src->dest, dest);
+}
+
 /* Function vectorizable_live_operation.
 
    STMT_INFO computes a value that is used outside the loop.  Check if
@@ -10590,11 +10673,13 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   gimple *use_stmt;
+  use_operand_p use_p;
   auto_vec<tree> vec_oprnds;
   int vec_entry = 0;
   poly_uint64 vec_index = 0;
 
-  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
+  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
+	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
 
   /* If a stmt of a reduction is live, vectorize it via
      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
@@ -10619,8 +10704,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
 	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
 	return true;
+
       vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
-					slp_node_instance);
+					slp_node_instance,
+					LOOP_VINFO_IV_EXIT (loop_vinfo));
+
+      /* If early break we only have to materialize the reduction on the merge
+	 block, but we have to find an alternate exit first.  */
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
+	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
+	      {
+		vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
+						  slp_node, slp_node_instance,
+						  exit);
+		break;
+	      }
+	}
+
       return true;
     }
 
@@ -10772,37 +10874,62 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
-      gcc_assert (single_pred_p (exit_bb));
-
-      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
-
-      gimple_stmt_iterator exit_gsi;
-      tree new_tree
-	= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
-					 LOOP_VINFO_IV_EXIT (loop_vinfo),
-					 vectype, ncopies, slp_node, bitsize,
-					 bitstart, vec_lhs, lhs_type,
-					 &exit_gsi);
-
-      /* Remove existing phis that copy from lhs and create copies
-	 from new_tree.  */
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
-	{
-	  gimple *phi = gsi_stmt (gsi);
-	  if ((gimple_phi_arg_def (phi, 0) == lhs))
+      /* Check if we have a loop where the chosen exit is not the main exit,
+	 in these cases for an early break we restart the iteration the vector code
+	 did.  For the live values we want the value at the start of the iteration
+	 rather than at the end.  */
+      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	if (!is_gimple_debug (use_stmt)
+	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 	    {
-	      remove_phi_node (&gsi, false);
-	      tree lhs_phi = gimple_phi_result (phi);
-	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-	    }
-	  else
-	    gsi_next (&gsi);
-	}
+	      edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
+					   phi_arg_index_from_use (use_p));
+	      bool main_exit_edge = e == main_e
+				    || find_connected_edge (main_e, e->src);
+
+	      /* Early exits have an merge block, we want the merge block itself
+		 so use ->src.  For main exit the merge block is the
+		 destination.  */
+	      basic_block dest = main_exit_edge ? main_e->dest : e->src;
+	      gimple *tmp_vec_stmt = vec_stmt;
+	      tree tmp_vec_lhs = vec_lhs;
+	      tree tmp_bitstart = bitstart;
+
+	      /* For early exit where the exit is not in the BB that leads
+		 to the latch then we're restarting the iteration in the
+		 scalar loop.  So get the first live value.  */
+	      restart_loop = restart_loop || !main_exit_edge;
+	      if (restart_loop
+		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+		{
+		  tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+		  tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
+		  tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
+		}
+
+	      gimple_stmt_iterator exit_gsi;
+	      tree new_tree
+		= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
+						 dest, vectype, ncopies,
+						 slp_node, bitsize,
+						 tmp_bitstart, tmp_vec_lhs,
+						 lhs_type, restart_loop,
+						 &exit_gsi);
+
+	      if (gimple_phi_num_args (use_stmt) == 1)
+		{
+		  auto gsi = gsi_for_stmt (use_stmt);
+		  remove_phi_node (&gsi, false);
+		  tree lhs_phi = gimple_phi_result (use_stmt);
+		  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+		  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+		}
+	      else
+		SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
+	  }
 
       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b3a09c0a804a38e17ef32b6ce13b98b077459fc7..582c5e678fad802d6e76300fe3c939b9f2978f17 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
    - it has uses outside the loop.
    - it has vdefs (it alters memory).
    - control stmts in the loop (except for the exit condition).
+   - it is an induction and we have multiple exits.
 
    CHECKME: what other side effects would the vectorizer allow?  */
 
@@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	}
     }
 
+  /* Check if it's an induction and multiple exits.  In this case there will be
+     a usage later on after peeling which is needed for the alternate exit.  */
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
+    {
+      if (dump_enabled_p ())
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "vec_stmt_relevant_p: induction forced for "
+			   "early break.\n");
+      *live_p = true;
+
+    }
+
   if (*live_p && *relevant == vect_unused_in_scope
       && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
     {
@@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -12720,20 +12734,27 @@ can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
 			  bool vec_stmt_p,
 			  stmt_vector_for_cost *cost_vec)
 {
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   if (slp_node)
     {
       stmt_vec_info slp_stmt_info;
       unsigned int i;
       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
 	{
-	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
+	  if ((STMT_VINFO_LIVE_P (slp_stmt_info)
+	       || (loop_vinfo
+		   && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		   && STMT_VINFO_DEF_TYPE (slp_stmt_info)
+			== vect_induction_def))
 	      && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
 					       slp_node_instance, i,
 					       vec_stmt_p, cost_vec))
 	    return false;
 	}
     }
-  else if (STMT_VINFO_LIVE_P (stmt_info)
+  else if ((STMT_VINFO_LIVE_P (stmt_info)
+	    || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
+		&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
 	   && !vectorizable_live_operation (vinfo, stmt_info,
 					    slp_node, slp_node_instance, -1,
 					    vec_stmt_p, cost_vec))
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 15c7f75b1f3c61ab469f1b1970dae9c6ac1a9f55..974f617d54a14c903894dd20d60098ca259c96f2 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2248,6 +2248,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,
  
Richard Biener Dec. 6, 2023, 9:33 a.m. UTC | #12
On Wed, 6 Dec 2023, Tamar Christina wrote:

> > > > is the exit edge you are looking for without iterating over all loop exits.
> > > >
> > > > > +		gimple *tmp_vec_stmt = vec_stmt;
> > > > > +		tree tmp_vec_lhs = vec_lhs;
> > > > > +		tree tmp_bitstart = bitstart;
> > > > > +		/* For early exit where the exit is not in the BB that leads
> > > > > +		   to the latch then we're restarting the iteration in the
> > > > > +		   scalar loop.  So get the first live value.  */
> > > > > +		restart_loop = restart_loop || exit_e != main_e;
> > > > > +		if (restart_loop)
> > > > > +		  {
> > > > > +		    tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> > > > > +		    tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> > > > > +		    tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> > > >
> > > > Hmm, that gets you the value after the first iteration, not the one before which
> > > > would be the last value of the preceeding vector iteration?
> > > > (but we don't keep those, we'd need a PHI)
> > >
> > > I don't fully follow.  The comment on top of this hunk under if (loop_vinfo) states
> > > that lhs should be pointing to a PHI.
> > >
> > > When I inspect the statement I see
> > >
> > > i_14 = PHI <i_11(6), 0(14)>
> > >
> > > so i_14 is the value at the start of the current iteration.  If we're coming from the
> > > header 0, otherwise i_11 which is the value of the previous iteration?
> > >
> > > The peeling code explicitly leaves i_14 in the merge block and not i_11 for this
> > exact reason.
> > > So I'm confused, my understanding is that we're already *at* the right PHI.
> > >
> > > Is it perhaps that you thought we put i_11 here for the early exits? In which case
> > > Yes I'd agree that that would be wrong, and there we would have had to look at
> > > The defs, but i_11 is the def.
> > >
> > > I already kept this in mind and leveraged peeling to make this part easier.
> > > i_11 is used in the main exit and i_14 in the early one.
> > 
> > I think the important detail is that this code is only executed for
> > vect_induction_defs which are indeed PHIs and so we're sure the
> > value live is before any modification so fine to feed as initial
> > value for the PHI in the epilog.
> > 
> > Maybe we can assert the def type here?
> 
> We can't assert because until cfg cleanup the dead value is still seen and still
> vectorized.  That said I've added a guard here.  We vectorize the non-induction
> value as normal now and if it's ever used it'll fail.
> 
> > 
> > > >
> > > > Why again do we need (non-induction) live values from the vector loop to the
> > > > epilogue loop again?
> > >
> > > They can appear as the result value of the main exit.
> > >
> > > e.g. in testcase (vect-early-break_17.c)
> > >
> > > #define N 1024
> > > unsigned vect_a[N];
> > > unsigned vect_b[N];
> > >
> > > unsigned test4(unsigned x)
> > > {
> > >  unsigned ret = 0;
> > >  for (int i = 0; i < N; i++)
> > >  {
> > >    vect_b[i] = x + i;
> > >    if (vect_a[i] > x)
> > >      return vect_a[i];
> > >    vect_a[i] = x;
> > >    ret = vect_a[i] + vect_b[i];
> > >  }
> > >  return ret;
> > > }
> > >
> > > The only situation they can appear in as an early-break is when
> > > we have a case where main exit != latch connected exit.
> > >
> > > However in these cases they are unused, and only there because
> > > normally you would have exited (i.e. there was a return) but the
> > > vector loop needs to start over so we ignore it.
> > >
> > > These happen in testcase vect-early-break_74.c and
> > > vect-early-break_78.c
> > 
> > Hmm, so in that case their value is incorrect (but doesn't matter,
> > we ignore it)?
> > 
> 
> Correct, they're placed there due to exit redirection, but in these inverted
> testcases where we've peeled the vector iteration you can't ever skip the
> epilogue.  So they are guaranteed not to be used.
> 
> > > > > +		gimple_stmt_iterator exit_gsi;
> > > > > +		tree new_tree
> > > > > +		  = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> > > > > +						   exit_e, vectype, ncopies,
> > > > > +						   slp_node, bitsize,
> > > > > +						   tmp_bitstart, tmp_vec_lhs,
> > > > > +						   lhs_type, restart_loop,
> > > > > +						   &exit_gsi);
> > > > > +
> > > > > +		/* Use the empty block on the exit to materialize the new
> > > > stmts
> > > > > +		   so we can use update the PHI here.  */
> > > > > +		if (gimple_phi_num_args (use_stmt) == 1)
> > > > > +		  {
> > > > > +		    auto gsi = gsi_for_stmt (use_stmt);
> > > > > +		    remove_phi_node (&gsi, false);
> > > > > +		    tree lhs_phi = gimple_phi_result (use_stmt);
> > > > > +		    gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> > > > > +		    gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> > > > > +		  }
> > > > > +		else
> > > > > +		  SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
> > > >
> > > > if the else case works, why not use it always?
> > >
> > > Because it doesn't work for main exit.  The early exits have an intermediate block
> > > that is used to generate the statements on, so for them we are fine updating the
> > > use in place.
> > >
> > > The main exits don't. and so the existing trick the vectorizer uses is to materialize
> > > the statements in the same block and then dissolves the phi node.   However you
> > > can't do that for the early exit because the phi node isn't singular.
> > 
> > But if the PHI has a single arg you can replace that?  By making a
> > copy stmt from it don't you break LC SSA?
> > 
> 
> Yeah, what the existing code is sneakily doing is this:
> 
> It has to vectorize
> 
> x = PHI <y>
> y gets vectorized a z but
> 
> x = PHI <z>
> z = ...
> 
> would be invalid,  so what it does, since it doesn't have a predecessor node to place stuff in,
> it'll do
> 
> z = ...
> x = z
> 
> and removes the PHI.  The PHI was only placed there for vectorization so it's not needed
> after this point.  It's also for this reason why the code passes around a gimple_seq since
> it needs to make sure it gets the order right when inserting statements.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

OK.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-loop.cc (vectorizable_live_operation,
> 	vectorizable_live_operation_1): Support early exits.
> 	(can_vectorize_live_stmts): Call vectorizable_live_operation for non-live
> 	inductions or reductions.
> 	(find_connected_edge, vect_get_vect_def): New.
> 	(vect_create_epilog_for_reduction): Support reductions in early break.
> 	* tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
> 	(vect_stmt_relevant_p): Mark all inductions when early break as being
> 	live.
> 	* tree-vectorizer.h (perm_mask_for_reverse): Expose.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f38cc47551488525b15c2be758cac8291dbefb3a..4e48217a31e59318c2ea8e5ab63b06ba19840cbd 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -3346,6 +3346,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
>  	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
>  	  bb_before_epilog = loop_preheader_edge (epilog)->src;
>  	}
> +
>        /* If loop is peeled for non-zero constant times, now niters refers to
>  	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
>  	 overflows.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index df5e1d28fac2ce35e71decdec0d8e31fb75557f5..2f922b42f6d567dfd5da9b276b1c9d37bc681876 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -5831,6 +5831,34 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>    return new_temp;
>  }
>  
> +/* Retrieves the definining statement to be used for a reduction.
> +   For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
> +   the reduction definitions.  */
> +
> +tree
> +vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
> +		   slp_instance slp_node_instance, bool main_exit_p, unsigned i,
> +		   vec <gimple *> &vec_stmts)
> +{
> +  tree def;
> +
> +  if (slp_node)
> +    {
> +      if (!main_exit_p)
> +        slp_node = slp_node_instance->reduc_phis;
> +      def = vect_get_slp_vect_def (slp_node, i);
> +    }
> +  else
> +    {
> +      if (!main_exit_p)
> +	reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
> +      vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
> +      def = gimple_get_lhs (vec_stmts[0]);
> +    }
> +
> +  return def;
> +}
> +
>  /* Function vect_create_epilog_for_reduction
>  
>     Create code at the loop-epilog to finalize the result of a reduction
> @@ -5842,6 +5870,8 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>     SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
>     REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
>       (counting from 0)
> +   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
> +     exit this edge is always the main loop exit.
>  
>     This function:
>     1. Completes the reduction def-use cycles.
> @@ -5882,7 +5912,8 @@ static void
>  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>  				  stmt_vec_info stmt_info,
>  				  slp_tree slp_node,
> -				  slp_instance slp_node_instance)
> +				  slp_instance slp_node_instance,
> +				  edge loop_exit)
>  {
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>    gcc_assert (reduc_info->is_reduc_info);
> @@ -5891,6 +5922,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>       loop-closed PHI of the inner loop which we remember as
>       def for the reduction PHI generation.  */
>    bool double_reduc = false;
> +  bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
>    stmt_vec_info rdef_info = stmt_info;
>    if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
>      {
> @@ -6053,7 +6085,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        /* Create an induction variable.  */
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
>        create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
>  		 insert_after, &indx_before_incr, &indx_after_incr);
>  
> @@ -6132,23 +6164,23 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           Store them in NEW_PHIS.  */
>    if (double_reduc)
>      loop = outer_loop;
> -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> +  /* We need to reduce values in all exits.  */
> +  exit_bb = loop_exit->dest;
>    exit_gsi = gsi_after_labels (exit_bb);
>    reduc_inputs.create (slp_node ? vec_num : ncopies);
> +  vec <gimple *> vec_stmts;
>    for (unsigned i = 0; i < vec_num; i++)
>      {
>        gimple_seq stmts = NULL;
> -      if (slp_node)
> -	def = vect_get_slp_vect_def (slp_node, i);
> -      else
> -	def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
> +      def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
> +			       main_exit_p, i, vec_stmts);
>        for (j = 0; j < ncopies; j++)
>  	{
>  	  tree new_def = copy_ssa_name (def);
>  	  phi = create_phi_node (new_def, exit_bb);
>  	  if (j)
> -	    def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> -	  SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
> +	    def = gimple_get_lhs (vec_stmts[j]);
> +	  SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
>  	  new_def = gimple_convert (&stmts, vectype, new_def);
>  	  reduc_inputs.quick_push (new_def);
>  	}
> @@ -10481,17 +10513,18 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>    return true;
>  }
>  
> -
>  /* Function vectorizable_live_operation_1.
> +
>     helper function for vectorizable_live_operation.  */
> +
>  tree
>  vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> -			       stmt_vec_info stmt_info, edge exit_e,
> +			       stmt_vec_info stmt_info, basic_block exit_bb,
>  			       tree vectype, int ncopies, slp_tree slp_node,
>  			       tree bitsize, tree bitstart, tree vec_lhs,
> -			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
> +			       tree lhs_type, bool restart_loop,
> +			       gimple_stmt_iterator *exit_gsi)
>  {
> -  basic_block exit_bb = exit_e->dest;
>    gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
>  
>    tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> @@ -10504,7 +10537,9 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>    if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
>      {
>        /* Emit:
> +
>  	 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
>  	 where VEC_LHS is the vectorized live-out result and MASK is
>  	 the loop mask for the final iteration.  */
>        gcc_assert (ncopies == 1 && !slp_node);
> @@ -10513,15 +10548,18 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree len = vect_get_loop_len (loop_vinfo, &gsi,
>  				    &LOOP_VINFO_LENS (loop_vinfo),
>  				    1, vectype, 0, 0);
> +
>        /* BIAS - 1.  */
>        signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>        tree bias_minus_one
>  	= int_const_binop (MINUS_EXPR,
>  			   build_int_cst (TREE_TYPE (len), biasval),
>  			   build_one_cst (TREE_TYPE (len)));
> +
>        /* LAST_INDEX = LEN + (BIAS - 1).  */
>        tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
>  				     len, bias_minus_one);
> +
>        /* This needs to implement extraction of the first index, but not sure
>  	 how the LEN stuff works.  At the moment we shouldn't get here since
>  	 there's no LEN support for early breaks.  But guard this so there's
> @@ -10532,13 +10570,16 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree scalar_res
>  	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
>  			vec_lhs_phi, last_index);
> +
>        /* Convert the extracted vector element to the scalar type.  */
>        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>      }
>    else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
>      {
>        /* Emit:
> +
>  	 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +
>  	 where VEC_LHS is the vectorized live-out result and MASK is
>  	 the loop mask for the final iteration.  */
>        gcc_assert (!slp_node);
> @@ -10548,10 +10589,38 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
>  				      &LOOP_VINFO_MASKS (loop_vinfo),
>  				      1, vectype, 0);
> +      tree scalar_res;
> +
> +      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> +	 instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  /* First create the permuted mask.  */
> +	  tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> +	  tree perm_dest = copy_ssa_name (mask);
> +	  gimple *perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> +				       mask, perm_mask);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  mask = perm_dest;
> +
> +	  /* Then permute the vector contents.  */
> +	  tree perm_elem = perm_mask_for_reverse (vectype);
> +	  perm_dest = copy_ssa_name (vec_lhs_phi);
> +	  perm_stmt
> +		= gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> +				       vec_lhs_phi, perm_elem);
> +	  vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +				       &gsi);
> +	  vec_lhs_phi = perm_dest;
> +	}
>  
>        gimple_seq_add_seq (&stmts, tem);
> -       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -				       mask, vec_lhs_phi);
> +
> +      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +				 mask, vec_lhs_phi);
> +
>        /* Convert the extracted vector element to the scalar type.  */
>        new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
>      }
> @@ -10564,12 +10633,26 @@ vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
>        new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
>  				       &stmts, true, NULL_TREE);
>      }
> +
>    *exit_gsi = gsi_after_labels (exit_bb);
>    if (stmts)
>      gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
> +
>    return new_tree;
>  }
>  
> +/* Find the edge that's the final one in the path from SRC to DEST and
> +   return it.  This edge must exist in at most one forwarder edge between.  */
> +
> +static edge
> +find_connected_edge (edge src, basic_block dest)
> +{
> +   if (src->dest == dest)
> +     return src;
> +
> +  return find_edge (src->dest, dest);
> +}
> +
>  /* Function vectorizable_live_operation.
>  
>     STMT_INFO computes a value that is used outside the loop.  Check if
> @@ -10590,11 +10673,13 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    int ncopies;
>    gimple *use_stmt;
> +  use_operand_p use_p;
>    auto_vec<tree> vec_oprnds;
>    int vec_entry = 0;
>    poly_uint64 vec_index = 0;
>  
> -  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
> +  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
> +	      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
>  
>    /* If a stmt of a reduction is live, vectorize it via
>       vect_create_epilog_for_reduction.  vectorizable_reduction assessed
> @@ -10619,8 +10704,25 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
>  	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
>  	return true;
> +
>        vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
> -					slp_node_instance);
> +					slp_node_instance,
> +					LOOP_VINFO_IV_EXIT (loop_vinfo));
> +
> +      /* If early break we only have to materialize the reduction on the merge
> +	 block, but we have to find an alternate exit first.  */
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +	{
> +	  for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> +	    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
> +	      {
> +		vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
> +						  slp_node, slp_node_instance,
> +						  exit);
> +		break;
> +	      }
> +	}
> +
>        return true;
>      }
>  
> @@ -10772,37 +10874,62 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>  	   lhs' = new_tree;  */
>  
>        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -      gcc_assert (single_pred_p (exit_bb));
> -
> -      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> -      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
> -
> -      gimple_stmt_iterator exit_gsi;
> -      tree new_tree
> -	= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> -					 LOOP_VINFO_IV_EXIT (loop_vinfo),
> -					 vectype, ncopies, slp_node, bitsize,
> -					 bitstart, vec_lhs, lhs_type,
> -					 &exit_gsi);
> -
> -      /* Remove existing phis that copy from lhs and create copies
> -	 from new_tree.  */
> -      gimple_stmt_iterator gsi;
> -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> -	{
> -	  gimple *phi = gsi_stmt (gsi);
> -	  if ((gimple_phi_arg_def (phi, 0) == lhs))
> +      /* Check if we have a loop where the chosen exit is not the main exit,
> +	 in these cases for an early break we restart the iteration the vector code
> +	 did.  For the live values we want the value at the start of the iteration
> +	 rather than at the end.  */
> +      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> +	if (!is_gimple_debug (use_stmt)
> +	    && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
>  	    {
> -	      remove_phi_node (&gsi, false);
> -	      tree lhs_phi = gimple_phi_result (phi);
> -	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> -	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> -	    }
> -	  else
> -	    gsi_next (&gsi);
> -	}
> +	      edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
> +					   phi_arg_index_from_use (use_p));
> +	      bool main_exit_edge = e == main_e
> +				    || find_connected_edge (main_e, e->src);
> +
> +	      /* Early exits have an merge block, we want the merge block itself
> +		 so use ->src.  For main exit the merge block is the
> +		 destination.  */
> +	      basic_block dest = main_exit_edge ? main_e->dest : e->src;
> +	      gimple *tmp_vec_stmt = vec_stmt;
> +	      tree tmp_vec_lhs = vec_lhs;
> +	      tree tmp_bitstart = bitstart;
> +
> +	      /* For early exit where the exit is not in the BB that leads
> +		 to the latch then we're restarting the iteration in the
> +		 scalar loop.  So get the first live value.  */
> +	      restart_loop = restart_loop || !main_exit_edge;
> +	      if (restart_loop
> +		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +		{
> +		  tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +		  tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> +		  tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> +		}
> +
> +	      gimple_stmt_iterator exit_gsi;
> +	      tree new_tree
> +		= vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> +						 dest, vectype, ncopies,
> +						 slp_node, bitsize,
> +						 tmp_bitstart, tmp_vec_lhs,
> +						 lhs_type, restart_loop,
> +						 &exit_gsi);
> +
> +	      if (gimple_phi_num_args (use_stmt) == 1)
> +		{
> +		  auto gsi = gsi_for_stmt (use_stmt);
> +		  remove_phi_node (&gsi, false);
> +		  tree lhs_phi = gimple_phi_result (use_stmt);
> +		  gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> +		  gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> +		}
> +	      else
> +		SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
> +	  }
>  
>        /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
>        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index b3a09c0a804a38e17ef32b6ce13b98b077459fc7..582c5e678fad802d6e76300fe3c939b9f2978f17 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
>     - it has uses outside the loop.
>     - it has vdefs (it alters memory).
>     - control stmts in the loop (except for the exit condition).
> +   - it is an induction and we have multiple exits.
>  
>     CHECKME: what other side effects would the vectorizer allow?  */
>  
> @@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>  	}
>      }
>  
> +  /* Check if it's an induction and multiple exits.  In this case there will be
> +     a usage later on after peeling which is needed for the alternate exit.  */
> +  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +    {
> +      if (dump_enabled_p ())
> +	  dump_printf_loc (MSG_NOTE, vect_location,
> +			   "vec_stmt_relevant_p: induction forced for "
> +			   "early break.\n");
> +      *live_p = true;
> +
> +    }
> +
>    if (*live_p && *relevant == vect_unused_in_scope
>        && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
>      {
> @@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
>  /* If the target supports a permute mask that reverses the elements in
>     a vector of type VECTYPE, return that mask, otherwise return null.  */
>  
> -static tree
> +tree
>  perm_mask_for_reverse (tree vectype)
>  {
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> @@ -12720,20 +12734,27 @@ can_vectorize_live_stmts (vec_info *vinfo, stmt_vec_info stmt_info,
>  			  bool vec_stmt_p,
>  			  stmt_vector_for_cost *cost_vec)
>  {
> +  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    if (slp_node)
>      {
>        stmt_vec_info slp_stmt_info;
>        unsigned int i;
>        FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, slp_stmt_info)
>  	{
> -	  if (STMT_VINFO_LIVE_P (slp_stmt_info)
> +	  if ((STMT_VINFO_LIVE_P (slp_stmt_info)
> +	       || (loop_vinfo
> +		   && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		   && STMT_VINFO_DEF_TYPE (slp_stmt_info)
> +			== vect_induction_def))
>  	      && !vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
>  					       slp_node_instance, i,
>  					       vec_stmt_p, cost_vec))
>  	    return false;
>  	}
>      }
> -  else if (STMT_VINFO_LIVE_P (stmt_info)
> +  else if ((STMT_VINFO_LIVE_P (stmt_info)
> +	    || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +		&& STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
>  	   && !vectorizable_live_operation (vinfo, stmt_info,
>  					    slp_node, slp_node_instance, -1,
>  					    vec_stmt_p, cost_vec))
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 15c7f75b1f3c61ab469f1b1970dae9c6ac1a9f55..974f617d54a14c903894dd20d60098ca259c96f2 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2248,6 +2248,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
>  				enum vect_def_type *,
>  				tree *, stmt_vec_info * = NULL);
>  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> +extern tree perm_mask_for_reverse (tree);
>  extern bool supportable_widening_operation (vec_info*, code_helper,
>  					    stmt_vec_info, tree, tree,
>  					    code_helper*, code_helper*,
>
  

Patch

--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10503,12 +10503,56 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	   lhs' = new_tree;  */
 
       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
+      /* A value can only be live in one exit.  So figure out which one.  */
+      edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
+      /* Check if we have a loop where the chosen exit is not the main exit,
+	 in these cases for an early break we restart the iteration the vector code
+	 did.  For the live values we want the value at the start of the iteration
+	 rather than at the end.  */
+      bool inverted_ctrl_p = false;
+      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	{
+	  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+	    if (!is_gimple_debug (use_stmt)
+		&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+	      {
+		basic_block use_bb = gimple_bb (use_stmt);
+		for (auto edge : get_loop_exit_edges (loop))
+		  {
+		    /* Alternative exits can have an intermediate BB in
+		       between to update the IV.  In those cases we need to
+		       look one block further.  */
+		    if (use_bb == edge->dest
+			|| (single_succ_p (edge->dest)
+			    && use_bb == single_succ (edge->dest)))
+		      {
+			exit_e = edge;
+			goto found;
+		      }
+		  }
+	      }
+found:
+	  /* If the edge isn't a single pred then split the edge so we have a
+	     location to place the live operations.  Perhaps we should always
+	     split during IV updating.  But this way the CFG is cleaner to
+	     follow.  */
+	  inverted_ctrl_p = !vect_is_loop_exit_latch_pred (exit_e, loop);
+	  if (!single_pred_p (exit_e->dest))
+	    exit_e = single_pred_edge (split_edge (exit_e));
+
+	  /* For early exit where the exit is not in the BB that leads to the
+	     latch then we're restarting the iteration in the scalar loop. So
+	     get the first live value.  */
+	  if (inverted_ctrl_p)
+	    bitstart = build_zero_cst (TREE_TYPE (bitstart));
+	}
+
+      basic_block exit_bb = exit_e->dest;
       gcc_assert (single_pred_p (exit_bb));
 
       tree vec_lhs_phi = copy_ssa_name (vec_lhs);
       gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
+      SET_PHI_ARG_DEF (phi, exit_e->dest_idx, vec_lhs);
 
       gimple_seq stmts = NULL;
       tree new_tree;
@@ -10539,6 +10583,12 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 	  tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
 					  len, bias_minus_one);
 
+	  /* This needs to implement extraction of the first index, but not sure
+	     how the LEN stuff works.  At the moment we shouldn't get here since
+	     there's no LEN support for early breaks.  But guard this so there's
+	     no incorrect codegen.  */
+	  gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
+
 	  /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
 	  tree scalar_res
 	    = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
@@ -10563,8 +10613,37 @@  vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
 					  &LOOP_VINFO_MASKS (loop_vinfo),
 					  1, vectype, 0);
 	  gimple_seq_add_seq (&stmts, tem);
-	  tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-					  mask, vec_lhs_phi);
+	  tree scalar_res;
+
+	  /* For an inverted control flow with early breaks we want EXTRACT_FIRST
+	     instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
+	  if (inverted_ctrl_p && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+	    {
+	      auto gsi_stmt = gsi_last (stmts);
+
+	       /* First create the permuted mask.  */
+	      tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
+	      tree perm_dest = copy_ssa_name (mask);
+	      gimple *perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
+					   mask, perm_mask);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      mask = perm_dest;
+
+	       /* Then permute the vector contents.  */
+	      tree perm_elem = perm_mask_for_reverse (vectype);
+	      perm_dest = copy_ssa_name (vec_lhs_phi);
+	      perm_stmt
+		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
+					   vec_lhs_phi, perm_elem);
+	      vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
+					   &gsi_stmt);
+	      vec_lhs_phi = perm_dest;
+	    }
+
+	  scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+				     mask, vec_lhs_phi);
 
 	  /* Convert the extracted vector element to the scalar type.  */
 	  new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 42cebb92789247434a91cb8e74c0557e75d1ea2c..36aeca60a22cfaea8d3b43348000d75de1d525c7 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1765,7 +1765,7 @@  compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
 /* If the target supports a permute mask that reverses the elements in
    a vector of type VECTYPE, return that mask, otherwise return null.  */
 
-static tree
+tree
 perm_mask_for_reverse (tree vectype)
 {
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d8b532c4b8ca92a856368a686598859fab9d40e9..a570cf113adb8e11e5383d4ba7600bddaddbd8c4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2245,6 +2245,7 @@  extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
 				enum vect_def_type *,
 				tree *, stmt_vec_info * = NULL);
 extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
+extern tree perm_mask_for_reverse (tree);
 extern bool supportable_widening_operation (vec_info*, code_helper,
 					    stmt_vec_info, tree, tree,
 					    code_helper*, code_helper*,