new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+#include "tree-vect.h"
+
+int x[1024];
+
+#pragma omp declare simd simdlen(4) notinbranch
+__attribute__((noinline)) int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noipa))
+bar (void)
+{
+#pragma omp simd
+ for (int i = 0; i < 512; i++)
+ {
+ x[2*i+0] = foo (x[2*i+0], x[2*i+0]);
+ x[2*i+1] = foo (x[2*i+1], x[2*i+1]);
+ }
+}
+
+int
+main ()
+{
+ int i;
+ check_vect ();
+
+#pragma GCC novector
+ for (i = 0; i < 1024; i++)
+ x[i] = i;
+
+ bar ();
+
+#pragma GCC novector
+ for (i = 0; i < 1024; i++)
+ if (x[i] != i + i)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
new file mode 100644
@@ -0,0 +1,57 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx2" { target avx2_runtime } } */
+
+#include "tree-vect.h"
+
+int x[1024];
+
+#pragma omp declare simd simdlen(4) inbranch
+__attribute__((noinline)) int
+foo (int a, int b)
+{
+ return a + b;
+}
+
+void __attribute__((noipa))
+bar (void)
+{
+#pragma omp simd
+ for (int i = 0; i < 512; i++)
+ {
+ if (x[2*i+0] < 10)
+ x[2*i+0] = foo (x[2*i+0], x[2*i+0]);
+ if (x[2*i+1] < 20)
+ x[2*i+1] = foo (x[2*i+1], x[2*i+1]);
+ }
+}
+
+int
+main ()
+{
+ int i;
+ check_vect ();
+
+#pragma GCC novector
+ for (i = 0; i < 1024; i++)
+ x[i] = i;
+
+ bar ();
+
+#pragma GCC novector
+ for (i = 0; i < 1024; i++)
+ {
+ if (((i & 1) && i < 20)
+ || (!(i & 1) && i < 10))
+ {
+ if (x[i] != i + i)
+ abort ();
+ }
+ else if (x[i] != i)
+ abort ();
+ }
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" { target avx2_runtime } } } */
@@ -505,6 +505,14 @@ static const int arg2_map[] = { 1, 2 };
static const int arg1_arg4_map[] = { 2, 1, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
+static const int mask_call_maps[6][7] = {
+ { 1, 1, },
+ { 2, 1, 2, },
+ { 3, 1, 2, 3, },
+ { 4, 1, 2, 3, 4, },
+ { 5, 1, 2, 3, 4, 5, },
+ { 6, 1, 2, 3, 4, 5, 6 },
+};
/* For most SLP statements, there is a one-to-one mapping between
gimple arguments and child nodes. If that is not true for STMT,
@@ -547,6 +555,15 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
case IFN_MASK_STORE:
return arg3_arg2_map;
+ case IFN_MASK_CALL:
+ {
+ unsigned nargs = gimple_call_num_args (call);
+ if (nargs >= 2 && nargs <= 7)
+ return mask_call_maps[nargs-2];
+ else
+ return nullptr;
+ }
+
default:
break;
}
@@ -1070,7 +1087,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
if (call_stmt)
{
combined_fn cfn = gimple_call_combined_fn (call_stmt);
- if (cfn != CFN_LAST)
+ if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
rhs_code = cfn;
else
rhs_code = CALL_EXPR;
@@ -1085,6 +1102,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
rhs_code = CFN_MASK_STORE;
}
else if ((cfn != CFN_LAST
+ && cfn != CFN_MASK_CALL
&& internal_fn_p (cfn)
&& !vectorizable_internal_fn_p (as_internal_fn (cfn)))
|| gimple_call_tail_p (call_stmt)
@@ -4315,10 +4315,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if (loop_vinfo && nested_in_vect_loop_p (loop, stmt_info))
return false;
- /* FORNOW */
- if (slp_node)
- return false;
-
/* Process function arguments. */
nargs = gimple_call_num_args (stmt) - arg_offset;
@@ -4327,6 +4323,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
return false;
arginfo.reserve (nargs, true);
+ auto_vec<slp_tree> slp_op;
+ slp_op.safe_grow_cleared (nargs);
for (i = 0; i < nargs; i++)
{
@@ -4338,9 +4336,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
thisarginfo.op = NULL_TREE;
thisarginfo.simd_lane_linear = false;
- op = gimple_call_arg (stmt, i + arg_offset);
- if (!vect_is_simple_use (op, vinfo, &thisarginfo.dt,
- &thisarginfo.vectype)
+ int op_no = i + arg_offset;
+ if (slp_node)
+ op_no = vect_slp_child_index_for_operand (stmt, op_no);
+ if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
+ op_no, &op, &slp_op[i],
+ &thisarginfo.dt, &thisarginfo.vectype)
|| thisarginfo.dt == vect_uninitialized_def)
{
if (dump_enabled_p ())
@@ -4351,7 +4352,13 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if (thisarginfo.dt == vect_constant_def
|| thisarginfo.dt == vect_external_def)
- gcc_assert (thisarginfo.vectype == NULL_TREE);
+ {
+ gcc_assert (vec_stmt || thisarginfo.vectype == NULL_TREE);
+ if (!vec_stmt)
+ thisarginfo.vectype = get_vectype_for_scalar_type (vinfo,
+ TREE_TYPE (op),
+ slp_node);
+ }
else
gcc_assert (thisarginfo.vectype != NULL_TREE);
@@ -4408,15 +4415,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
&& thisarginfo.dt != vect_constant_def
&& thisarginfo.dt != vect_external_def
&& loop_vinfo
- && !slp_node
&& TREE_CODE (op) == SSA_NAME)
vect_simd_lane_linear (op, loop, &thisarginfo);
arginfo.quick_push (thisarginfo);
}
- poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- if (!vf.is_constant ())
+ if (loop_vinfo
+ && !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -4425,6 +4431,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
return false;
}
+ poly_uint64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+ unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 1;
unsigned int badness = 0;
struct cgraph_node *bestn = NULL;
if (STMT_VINFO_SIMD_CLONE_INFO (stmt_info).exists ())
@@ -4435,7 +4443,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
{
unsigned int this_badness = 0;
unsigned int num_calls;
- if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls)
+ if (!constant_multiple_p (vf * group_size,
+ n->simdclone->simdlen, &num_calls)
|| n->simdclone->nargs != nargs)
continue;
if (num_calls != 1)
@@ -4561,7 +4570,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
fndecl = bestn->decl;
nunits = bestn->simdclone->simdlen;
- ncopies = vector_unroll_factor (vf, nunits);
+ if (slp_node)
+ ncopies = vector_unroll_factor (vf * group_size, nunits);
+ else
+ ncopies = vector_unroll_factor (vf, nunits);
/* If the function isn't const, only allow it in simd loops where user
has asserted that at least nunits consecutive iterations can be
@@ -4576,6 +4588,15 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if (!vec_stmt) /* transformation not required. */
{
+ if (slp_node)
+ for (unsigned i = 0; i < nargs; ++i)
+ if (!vect_maybe_update_slp_op_vectype (slp_op[i], arginfo[i].vectype))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
/* When the original call is pure or const but the SIMD ABI dictates
an aggregate return we will have to use a virtual definition and
in a loop eventually even need to add a virtual PHI. That's
@@ -4584,6 +4605,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
&& !gimple_vdef (stmt)
&& TREE_CODE (TREE_TYPE (TREE_TYPE (bestn->decl))) == ARRAY_TYPE)
vinfo->any_known_not_updated_vssa = true;
+ /* ??? For SLP code-gen we end up inserting after the last
+ vector argument def rather than at the original call position
+ so automagic virtual operand updating doesn't work. */
+ if (gimple_vuse (stmt) && slp_node)
+ vinfo->any_known_not_updated_vssa = true;
STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (bestn->decl);
for (i = 0; i < nargs; i++)
if ((bestn->simdclone->args[i].arg_type
@@ -4633,8 +4659,14 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
auto_vec<vec<tree> > vec_oprnds;
auto_vec<unsigned> vec_oprnds_i;
- vec_oprnds.safe_grow_cleared (nargs, true);
vec_oprnds_i.safe_grow_cleared (nargs, true);
+ if (slp_node)
+ {
+ vec_oprnds.reserve_exact (nargs);
+ vect_get_slp_defs (vinfo, slp_node, &vec_oprnds);
+ }
+ else
+ vec_oprnds.safe_grow_cleared (nargs, true);
for (j = 0; j < ncopies; ++j)
{
/* Build argument list for the vectorized call. */
@@ -4665,9 +4697,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
gcc_assert ((k & (k - 1)) == 0);
if (m == 0)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info,
- ncopies * o / k, op,
- &vec_oprnds[i]);
+ if (!slp_node)
+ vect_get_vec_defs_for_operand (vinfo, stmt_info,
+ ncopies * o / k, op,
+ &vec_oprnds[i]);
vec_oprnds_i[i] = 0;
vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
}
@@ -4703,10 +4736,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
{
if (m == 0 && l == 0)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info,
- k * o * ncopies,
- op,
- &vec_oprnds[i]);
+ if (!slp_node)
+ vect_get_vec_defs_for_operand (vinfo, stmt_info,
+ k * o * ncopies,
+ op,
+ &vec_oprnds[i]);
vec_oprnds_i[i] = 0;
vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
}
@@ -4777,10 +4811,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
elements as the current function. */
if (m == 0)
{
- vect_get_vec_defs_for_operand (vinfo, stmt_info,
- o * ncopies,
- op,
- &vec_oprnds[i]);
+ if (!slp_node)
+ vect_get_vec_defs_for_operand (vinfo, stmt_info,
+ o * ncopies,
+ op,
+ &vec_oprnds[i]);
vec_oprnds_i[i] = 0;
}
vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++];
@@ -4924,7 +4959,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if (j == 0 && l == 0)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (slp_node)
+ SLP_TREE_VEC_DEFS (slp_node)
+ .quick_push (gimple_assign_lhs (new_stmt));
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
if (ratype)
@@ -4967,7 +5006,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if ((unsigned) j == k - 1)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (slp_node)
+ SLP_TREE_VEC_DEFS (slp_node)
+ .quick_push (gimple_assign_lhs (new_stmt));
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
continue;
}
else if (ratype)
@@ -4990,7 +5033,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
if (j == 0)
*vec_stmt = new_stmt;
- STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ if (slp_node)
+ SLP_TREE_VEC_DEFS (slp_node).quick_push (gimple_get_lhs (new_stmt));
+ else
+ STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
}
for (i = 0; i < nargs; ++i)