@@ -15,7 +15,7 @@ int foo ()
a[1] = tem1;
a[2] = tem2;
a[3] = tem3;
- return temx + temy;
+ return temx / temy;
}
/* We should extract the live lane from the vectorized add rather than
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+
+double
+not_vectorizable (double *x)
+{
+ double r = x[0];
+ return r + x[2] + x[1];
+}
+
+double
+vectorizable (double *x)
+{
+ double r = x[0];
+ return r + x[1] + x[2];
+}
+
+/* We can vectorize the in-order reduction in vectorizable but not the one
+ in not_vectorizable since we cannot handle the gap after x[2]. */
+/* { dg-final { scan-tree-dump-times "optimized: basic block part vectorized" 1 "slp2" { target vect_hw_misalign } } } */
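
For reference, a rough sketch of the shape the SLP pass should produce for
vectorizable above, written with GNU C vector extensions and hypothetical
names; the real GIMPLE uses a vector load, BIT_FIELD_REF lane extracts and
scalar adds.  The two-lane vector covers x[0] and x[1] in order while x[2]
is left as the single remaining scalar statement:

typedef double v2df __attribute__ ((vector_size (16)));

double
vectorizable_expected (double *x)
{
  v2df v = *(v2df *) x;	/* vector load of x[0] and x[1] */
  double r = v[0];	/* extract the first lane */
  r = r + v[1];		/* fold left, keeping the IEEE evaluation order */
  return r + x[2];	/* remaining statement combined in the epilogue */
}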
@@ -1,7 +1,7 @@
/* { dg-xfail-run-if "" { { i?86-*-* x86_64-*-* } && ia32 } } */
/* { dg-require-effective-target vect_double } */
/* { dg-add-options ieee } */
-/* { dg-additional-options "-fno-fast-math" } */
+/* { dg-additional-options "-fno-fast-math -fno-tree-slp-vectorize" } */
#include "tree-vect.h"
@@ -6769,11 +6769,11 @@ merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
}
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
- order, starting with LHS. Insert the extraction statements before GSI and
- associate the new scalar SSA names with variable SCALAR_DEST.
+   order, starting with LHS if that is not NULL, otherwise seeding the
+   result with the first extracted element.  Insert the extraction
+   statements before GSI and associate the new scalar SSA names with
+   variable SCALAR_DEST.
Return the SSA name for the result. */
-static tree
+tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
tree_code code, tree lhs, tree vector_rhs)
{
@@ -6796,11 +6796,16 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
gimple_assign_set_lhs (stmt, rhs);
gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
- stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
- tree new_name = make_ssa_name (scalar_dest, stmt);
- gimple_assign_set_lhs (stmt, new_name);
- gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
- lhs = new_name;
+ if (lhs)
+ {
+ stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
+ tree new_name = make_ssa_name (scalar_dest, stmt);
+ gimple_assign_set_lhs (stmt, new_name);
+ gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+ lhs = new_name;
+ }
+ else
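+	/* Without an initial LHS the first extracted lane seeds the
+	   result.  */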
+ lhs = rhs;
}
return lhs;
}
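
As a rough model, when called with a NULL LHS the function now expands a
four-lane vector add reduction to the equivalent of the following (GNU C
vector extensions, hypothetical names; the real code emits BIT_FIELD_REF
extracts and scalar assigns):

typedef double v4df __attribute__ ((vector_size (32)));

double
fold_left_plus (v4df v)
{
  double r = v[0];	/* else branch: the first lane becomes LHS */
  r = r + v[1];		/* following lanes: lhs = lhs + rhs, in order */
  r = r + v[2];
  r = r + v[3];
  return r;
}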
@@ -1480,9 +1480,10 @@ dt_sort_cmp (const void *op1_, const void *op2_, void *)
return (int)op1->code - (int)op2->code;
}
-/* Linearize the associatable expression chain at START with the
- associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
- filling CHAIN with the result and using WORKLIST as intermediate storage.
+/* Linearize the expression chain at START with the operation CODE (where
+   PLUS_EXPR also allows MINUS_EXPR), filling CHAIN with the result and
+   using WORKLIST as intermediate storage.  If ASSOCIATABLE is false only
+   the first operand chain is traversed, keeping the chain in evaluation
+   order.
CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
stmts, starting with START. */
@@ -1493,7 +1494,7 @@ vect_slp_linearize_chain (vec_info *vinfo,
vec<chain_op_t> &chain,
enum tree_code code, gimple *start,
gimple *&code_stmt, gimple *&alt_code_stmt,
- vec<gimple *> *chain_stmts)
+ vec<gimple *> *chain_stmts, bool associatable)
{
/* For each lane linearize the addition/subtraction (or other
uniform associatable operation) expression tree. */
@@ -1526,6 +1527,7 @@ vect_slp_linearize_chain (vec_info *vinfo,
gimple *use_stmt;
use_operand_p use_p;
if (dt == vect_internal_def
+ && (opnum == 1 || associatable)
&& single_imm_use (op, &use_p, &use_stmt)
&& is_gimple_assign (def_stmt_info->stmt)
&& (gimple_assign_rhs_code (def_stmt_info->stmt) == code
@@ -1924,7 +1926,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
gimple *op_stmt = NULL, *other_op_stmt = NULL;
vect_slp_linearize_chain (vinfo, worklist, chain, code,
stmts[lane]->stmt, op_stmt, other_op_stmt,
- NULL);
+ NULL, true);
if (!op_stmt_info && op_stmt)
op_stmt_info = vinfo->lookup_stmt (op_stmt);
if (!other_op_stmt_info && other_op_stmt)
@@ -6399,17 +6401,31 @@ vectorizable_bb_reduc_epilogue (slp_instance instance,
reduc_code = PLUS_EXPR;
internal_fn reduc_fn;
tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
+ bool fl;
if (!vectype
- || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
- || reduc_fn == IFN_LAST
- || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
|| !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
- TREE_TYPE (vectype)))
- return false;
+ TREE_TYPE (vectype))
+ || (!(fl = needs_fold_left_reduction_p (TREE_TYPE (vectype), reduc_code))
+ && (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
+ || reduc_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (reduc_fn,
+ vectype, OPTIMIZE_FOR_BOTH)))
+ || (fl
+ && (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype),
+ SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)))
+ || SLP_INSTANCE_REMAIN_STMTS (instance).length () > 1)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorized: basic block reduction epilogue "
+ "operation unsupported.\n");
+ return false;
+ }
/* There's no way to cost a horizontal vector reduction via REDUC_FN so
-   cost log2 vector operations plus shuffles and one extraction.  */
+   cost log2 vector operations plus shuffles and one extraction.  For
+   a fold-left reduction cost one operation and one extract per lane.  */
-  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
+  unsigned steps = (fl ? vect_nunits_for_cost (vectype)
+		    : floor_log2 (vect_nunits_for_cost (vectype)));
record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
vectype, 0, vect_body);
record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
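
For a four-lane vector this means the associatable path records
floor_log2 (4) == 2 vector_stmt and vec_perm costs while the fold-left path
records four of each, approximating the per-lane extract-and-accumulate
sequence vect_expand_fold_left emits.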
@@ -7220,13 +7236,6 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
}
else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
&& (associative_tree_code (code) || code == MINUS_EXPR)
- /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
- checks pessimize a two-element reduction. PR54400.
- ??? In-order reduction could be handled if we only
- traverse one operand chain in vect_slp_linearize_chain. */
- && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
- || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
- && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
/* Ops with constants at the tail can be stripped here. */
&& TREE_CODE (rhs) == SSA_NAME
&& TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
@@ -7247,17 +7256,27 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
gimple *code_stmt = NULL, *alt_code_stmt = NULL;
if (code == MINUS_EXPR)
code = PLUS_EXPR;
+ /* ??? For TYPE_OVERFLOW_UNDEFINED we could resort to
+ unsigned arithmetic. */
+ bool associatable
+ = !needs_fold_left_reduction_p (TREE_TYPE (rhs), code);
internal_fn reduc_fn;
- if (!reduction_fn_for_scalar_code (code, &reduc_fn)
- || reduc_fn == IFN_LAST)
+ if (associatable
+ && (!reduction_fn_for_scalar_code (code, &reduc_fn)
+ || reduc_fn == IFN_LAST))
continue;
vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
- /* ??? */
- code_stmt, alt_code_stmt, &chain_stmts);
+ code_stmt, alt_code_stmt, &chain_stmts,
+ associatable);
if (chain.length () > 1)
{
- /* Sort the chain according to def_type and operation. */
- chain.sort (dt_sort_cmp, bb_vinfo);
+ /* Sort the chain according to def_type and operation if
+ allowed. Otherwise reverse so an in-order reduction
+ of the final vector(s) is in the correct order. */
+ if (associatable)
+ chain.sort (dt_sort_cmp, bb_vinfo);
+ else
+ chain.reverse ();
/* ??? Now we'd want to strip externals and constants
but record those to be handled in the epilogue. */
/* ??? For now do not allow mixing ops or externs/constants. */
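
Reversing is what restores evaluation order here: since the worklist-based
linearization records the leaves starting from the root of the expression,
for x[0] + x[1] + x[2] the chain comes out as {x[2], x[1], x[0]}, and the
reverse yields the lane order {x[0], x[1], x[2]} that the open-coded
fold-left epilogue then consumes left-to-right.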
@@ -9137,22 +9156,40 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
if (reduc_code == MINUS_EXPR)
reduc_code = PLUS_EXPR;
gimple_seq epilogue = NULL;
- /* We may end up with more than one vector result, reduce them
- to one vector. */
tree vec_def = vec_defs[0];
- for (unsigned i = 1; i < vec_defs.length (); ++i)
- vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
- vec_def, vec_defs[i]);
- vec_defs.release ();
- /* ??? Support other schemes than direct internal fn. */
- internal_fn reduc_fn;
- if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
- || reduc_fn == IFN_LAST)
- gcc_unreachable ();
- tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
- TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+      tree scalar_type = TREE_TYPE (TREE_TYPE (vec_def));
+      bool fl_reduction
+	= needs_fold_left_reduction_p (scalar_type, reduc_code);
+      /* ??? Support other schemes than direct internal fn and open-coded
+	 fold-left reduction.  */
+      tree scalar_def;
+ if (fl_reduction)
+ {
+	  gcc_assert (vec_defs.length () == 1);
+	  vec_defs.release ();
+ gimple_seq fl_stmts = NULL;
+ gimple_stmt_iterator gsi = gsi_last (fl_stmts);
+ scalar_def = vect_expand_fold_left (&gsi, scalar_type, reduc_code,
+ NULL_TREE, vec_def);
+ gimple_seq_add_seq (&epilogue, fl_stmts);
+ }
+ else
+ {
+ /* We may end up with more than one vector result, reduce them
+ to one vector. */
+ for (unsigned i = 1; i < vec_defs.length (); ++i)
+ vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
+ vec_def, vec_defs[i]);
+ vec_defs.release ();
+ internal_fn reduc_fn;
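+	  /* vectorizable_bb_reduc_epilogue verified the reduction fn
+	     exists for the associatable case, so this cannot fail.  */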
+ reduction_fn_for_scalar_code (reduc_code, &reduc_fn);
+ scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
+ scalar_type, vec_def);
+ }
if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
{
+ if (fl_reduction)
+ gcc_assert (SLP_INSTANCE_REMAIN_STMTS (instance).length () == 1);
tree rem_def = NULL_TREE;
for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
if (!rem_def)
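
The new assert mirrors the guard in vectorizable_bb_reduc_epilogue: for a
fold-left reduction at most one scalar statement may remain, since it has
to be combined after the vector part without changing the evaluation order.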
@@ -2347,6 +2347,8 @@ extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
+extern tree vect_expand_fold_left (gimple_stmt_iterator *, tree,
+				   tree_code, tree, tree);
/* Drive for loop transformation stage. */
extern class loop *vect_transform_loop (loop_vec_info, gimple *);