@@ -50,6 +50,9 @@ along with GCC; see the file COPYING3. If not see
#include "insn-config.h"
#include "rtl.h"
#include "recog.h"
+#include "langhooks.h"
+#include "tree-vector-builder.h"
+#include "optabs-tree.h"
/*************************************************************************
Simple Loop Peeling Utilities
@@ -845,7 +848,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
rgroup_controls *iv_rgc = nullptr;
unsigned int i;
auto_vec<rgroup_controls> *controls = use_masks_p
- ? &LOOP_VINFO_MASKS (loop_vinfo)
+ ? &LOOP_VINFO_MASKS (loop_vinfo).rgc_vec
: &LOOP_VINFO_LENS (loop_vinfo);
FOR_EACH_VEC_ELT (*controls, i, rgc)
if (!rgc->controls.is_empty ())
@@ -936,6 +939,246 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
return cond_stmt;
}
+/* Set up the iteration condition and rgroup controls for LOOP in AVX512
+ style, given that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the
+ vectorized loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
+ the number of iterations of the original scalar loop that should be
+ handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are as
+ for vect_set_loop_condition.
+
+ Insert the branch-back condition before LOOP_COND_GSI and return the
+ final gcond. */
+
+static gcond *
+vect_set_loop_condition_partial_vectors_avx512 (class loop *loop,
+ loop_vec_info loop_vinfo, tree niters,
+ tree final_iv,
+ bool niters_maybe_zero,
+ gimple_stmt_iterator loop_cond_gsi)
+{
+ tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ tree orig_niters = niters;
+ gimple_seq preheader_seq = NULL;
+
+ /* Create an IV that counts down from niters and whose step
+ is the number of iterations processed in the current iteration.
+ Produce the controls with compares like the following.
+
+ # iv_2 = PHI <niters, iv_3>
+ rem_4 = MIN <iv_2, VF>;
+ remv_6 = { rem_4, rem_4, rem_4, ... }
+ mask_5 = { 0, 0, 1, 1, 2, 2, ... } < remv_6;
+ iv_3 = iv_2 - VF;
+ if (iv_2 > VF)
+ continue;
+
+ Where the constant is built with elements at most VF - 1 and
+ repetitions according to max_nscalars_per_iter, which is guaranteed
+ to be the same within a group. */
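+
+ /* As a purely illustrative example (numbers assumed): with VF = 8 and
+ max_nscalars_per_iter = 2 the compare constant covering all lanes is
+ { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 }. When 5 scalar
+ iterations remain, rem_4 = MIN <5, 8> = 5 and the compare enables only
+ the first ten lanes; iv_2 = 5 > VF is false, so the loop exits after
+ this final masked iteration. */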
+
+ /* Convert NITERS to the determined IV type. */
+ if (TYPE_PRECISION (iv_type) > TYPE_PRECISION (TREE_TYPE (niters))
+ && niters_maybe_zero)
+ {
+ /* We know that there is always at least one iteration, so if the
+ count is zero then it must have wrapped. Cope with this by
+ subtracting 1 before the conversion and adding 1 to the result. */
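+ /* For example (hypothetical values): a 32-bit unsigned NITERS that
+ wrapped to 0 for a true count of 2^32 becomes 0xffffffff after the
+ subtraction, converts losslessly to the wider IV type and yields
+ 2^32 again after the final addition. */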
+ gcc_assert (TYPE_UNSIGNED (TREE_TYPE (niters)));
+ niters = gimple_build (&preheader_seq, PLUS_EXPR, TREE_TYPE (niters),
+ niters, build_minus_one_cst (TREE_TYPE (niters)));
+ niters = gimple_convert (&preheader_seq, iv_type, niters);
+ niters = gimple_build (&preheader_seq, PLUS_EXPR, iv_type,
+ niters, build_one_cst (iv_type));
+ }
+ else
+ niters = gimple_convert (&preheader_seq, iv_type, niters);
+
+ /* Bias the initial value of the IV in case we need to skip iterations
+ at the beginning. */
+ tree niters_adj = niters;
+ if (niters_skip)
+ {
+ tree skip = gimple_convert (&preheader_seq, iv_type, niters_skip);
+ niters_adj = gimple_build (&preheader_seq, PLUS_EXPR,
+ iv_type, niters, skip);
+ }
+
+ /* The iteration step is the vectorization factor. */
+ tree iv_step = build_int_cst (iv_type, vf);
+
+ /* Create the decrement IV. */
+ tree index_before_incr, index_after_incr;
+ gimple_stmt_iterator incr_gsi;
+ bool insert_after;
+ standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+ create_iv (niters_adj, MINUS_EXPR, iv_step, NULL_TREE, loop,
+ &incr_gsi, insert_after, &index_before_incr,
+ &index_after_incr);
+
+ /* Iterate over all the rgroups and fill in their controls. */
+ for (auto rgcm : LOOP_VINFO_MASKS (loop_vinfo).rgc_map)
+ {
+ rgroup_controls *rgc = rgcm.second;
+ if (rgc->controls.is_empty ())
+ continue;
+
+ tree ctrl_type = rgc->type;
+ poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type);
+
+ tree vectype = rgc->compare_type;
+
+ /* index_after_incr is the IV specifying the remaining iterations in
+ the next iteration. */
+ tree rem = index_after_incr;
+ /* When the data type for the compare to produce the mask is
+ smaller than the IV type we need to saturate. Saturate to
+ the smallest possible value (IV_TYPE) so we only have to
+ saturate once (CSE will catch redundant ones we add). */
+ if (TYPE_PRECISION (TREE_TYPE (vectype)) < TYPE_PRECISION (iv_type))
+ rem = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
+ UNKNOWN_LOCATION,
+ MIN_EXPR, TREE_TYPE (rem), rem, iv_step);
+ rem = gimple_convert (&incr_gsi, false, GSI_CONTINUE_LINKING,
+ UNKNOWN_LOCATION, TREE_TYPE (vectype), rem);
+
+ /* Build a data vector composed of the remaining iterations. */
+ rem = gimple_build_vector_from_val (&incr_gsi, false, GSI_CONTINUE_LINKING,
+ UNKNOWN_LOCATION, vectype, rem);
+
+ /* Provide a definition of each vector in the control group. */
+ tree next_ctrl = NULL_TREE;
+ tree first_rem = NULL_TREE;
+ tree ctrl;
+ unsigned int i;
+ FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+ {
+ /* Previous controls will cover BIAS items. This control covers the
+ next batch. */
+ poly_uint64 bias = nitems_per_ctrl * i;
+
+ /* Build the constant to compare the remaining iters against,
+ this is something like { 0, 0, 1, 1, 2, 2, 3, 3, ... } appropriately
+ split into pieces. */
+ unsigned n = TYPE_VECTOR_SUBPARTS (ctrl_type).to_constant ();
+ tree_vector_builder builder (vectype, n, 1);
+ for (unsigned i = 0; i < n; ++i)
+ {
+ unsigned HOST_WIDE_INT val
+ = (i + bias.to_constant ()) / rgc->max_nscalars_per_iter;
+ gcc_assert (val < vf.to_constant ());
+ builder.quick_push (build_int_cst (TREE_TYPE (vectype), val));
+ }
+ tree cmp_series = builder.build ();
+
+ /* Create the initial control. First include all items that
+ are within the loop limit. */
+ tree init_ctrl = NULL_TREE;
+ poly_uint64 const_limit;
+ /* See whether the first iteration of the vector loop is known
+ to have a full control. */
+ if (poly_int_tree_p (niters, &const_limit)
+ && known_ge (const_limit, (i + 1) * nitems_per_ctrl))
+ init_ctrl = build_minus_one_cst (ctrl_type);
+ else
+ {
+ /* The remaining work items initially are niters. Saturate,
+ splat and compare. */
+ if (!first_rem)
+ {
+ first_rem = niters;
+ if (TYPE_PRECISION (TREE_TYPE (vectype))
+ < TYPE_PRECISION (iv_type))
+ first_rem = gimple_build (&preheader_seq,
+ MIN_EXPR, TREE_TYPE (first_rem),
+ first_rem, iv_step);
+ first_rem = gimple_convert (&preheader_seq, TREE_TYPE (vectype),
+ first_rem);
+ first_rem = gimple_build_vector_from_val (&preheader_seq,
+ vectype, first_rem);
+ }
+ init_ctrl = gimple_build (&preheader_seq, LT_EXPR, ctrl_type,
+ cmp_series, first_rem);
+ }
+
+ /* Now AND out the bits that are within the number of skipped
+ items. */
+ poly_uint64 const_skip;
+ if (niters_skip
+ && !(poly_int_tree_p (niters_skip, &const_skip)
+ && known_le (const_skip, bias)))
+ {
+ /* For integer mode masks it's cheaper to shift out the bits
+ since that avoids loading a constant. */
+ gcc_assert (GET_MODE_CLASS (TYPE_MODE (ctrl_type)) == MODE_INT);
+ init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
+ lang_hooks.types.type_for_mode
+ (TYPE_MODE (ctrl_type), 1),
+ init_ctrl);
+ /* ??? But when the shift amount isn't constant this requires
+ a round-trip to GPRs. We could apply the bias to either
+ side of the compare instead. */
+ tree shift = gimple_build (&preheader_seq, MULT_EXPR,
+ TREE_TYPE (niters_skip),
+ niters_skip,
+ build_int_cst (TREE_TYPE (niters_skip),
+ rgc->max_nscalars_per_iter));
+ init_ctrl = gimple_build (&preheader_seq, LSHIFT_EXPR,
+ TREE_TYPE (init_ctrl),
+ init_ctrl, shift);
+ init_ctrl = gimple_build (&preheader_seq, VIEW_CONVERT_EXPR,
+ ctrl_type, init_ctrl);
+ }
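+
+ /* As an illustration (assumed numbers): with an 8-lane QImode mask,
+ max_nscalars_per_iter = 1, at least VF iterations remaining (so
+ init_ctrl starts as all-ones, 0xff) and niters_skip = 3, the shift
+ left by 3 yields 0xf8, i.e. the three skipped low lanes are masked
+ off while lanes 3..7 stay enabled. */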
+
+ /* Get the control value for the next iteration of the loop. */
+ next_ctrl = gimple_build (&incr_gsi, false, GSI_CONTINUE_LINKING,
+ UNKNOWN_LOCATION,
+ LT_EXPR, ctrl_type, cmp_series, rem);
+
+ vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
+ }
+ }
+
+ /* Emit all accumulated statements. */
+ add_preheader_seq (loop, preheader_seq);
+
+ /* Adjust the exit test using the decrementing IV. */
+ edge exit_edge = single_exit (loop);
+ tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? LE_EXPR : GT_EXPR;
+ /* When we peel for alignment with niter_skip != 0 this can
+ cause niter + niter_skip to wrap and since we are comparing the
+ value before the decrement here we get a false early exit.
+ We can't compare the value after decrement either because that
+ decrement could wrap as well, since we're not doing a saturating
+ decrement. To avoid this situation we force a larger
+ iv_type. */
+ gcond *cond_stmt = gimple_build_cond (code, index_before_incr, iv_step,
+ NULL_TREE, NULL_TREE);
+ gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
+
+ /* The loop iterates (NITERS - 1 + NITERS_SKIP) / VF + 1 times.
+ Subtract one from this to get the latch count. */
+ tree niters_minus_one
+ = fold_build2 (PLUS_EXPR, TREE_TYPE (orig_niters), orig_niters,
+ build_minus_one_cst (TREE_TYPE (orig_niters)));
+ tree niters_adj2 = fold_convert (iv_type, niters_minus_one);
+ if (niters_skip)
+ niters_adj2 = fold_build2 (PLUS_EXPR, iv_type, niters_adj2,
+ fold_convert (iv_type, niters_skip));
+ loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, iv_type,
+ niters_adj2, iv_step);
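+ /* For instance (illustrative numbers): NITERS = 10, NITERS_SKIP = 2 and
+ VF = 4 give nb_iterations = (10 - 1 + 2) / 4 = 2 latch executions,
+ i.e. three vector iterations covering the 12 biased scalar items. */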
+
+ if (final_iv)
+ {
+ gassign *assign = gimple_build_assign (final_iv, orig_niters);
+ gsi_insert_on_edge_immediate (single_exit (loop), assign);
+ }
+
+ return cond_stmt;
+}
+
+
/* Like vect_set_loop_condition, but handle the case in which the vector
loop handles exactly VF scalars per iteration. */
@@ -1114,10 +1357,18 @@ vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
if (loop_vinfo && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
- cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
- niters, final_iv,
- niters_maybe_zero,
- loop_cond_gsi);
+ {
+ if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_avx512)
+ cond_stmt = vect_set_loop_condition_partial_vectors_avx512 (loop, loop_vinfo,
+ niters, final_iv,
+ niters_maybe_zero,
+ loop_cond_gsi);
+ else
+ cond_stmt = vect_set_loop_condition_partial_vectors (loop, loop_vinfo,
+ niters, final_iv,
+ niters_maybe_zero,
+ loop_cond_gsi);
+ }
else
cond_stmt = vect_set_loop_condition_normal (loop, niters, step, final_iv,
niters_maybe_zero,
@@ -2030,7 +2281,8 @@ void
vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
{
tree misalign_in_elems;
- tree type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ /* ??? With AVX512 we want LOOP_VINFO_RGROUP_IV_TYPE in the end. */
+ tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
@@ -55,6 +55,7 @@ along with GCC; see the file COPYING3. If not see
#include "vec-perm-indices.h"
#include "tree-eh.h"
#include "case-cfn-macros.h"
+#include "langhooks.h"
/* Loop Vectorization Pass.
@@ -963,6 +964,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
mask_skip_niters (NULL_TREE),
rgroup_compare_type (NULL_TREE),
simd_if_cond (NULL_TREE),
+ partial_vector_style (vect_partial_vectors_none),
unaligned_dr (NULL),
peeling_for_alignment (0),
ptr_mask (0),
@@ -1058,7 +1060,12 @@ _loop_vec_info::~_loop_vec_info ()
{
free (bbs);
- release_vec_loop_controls (&masks);
+ for (auto m : masks.rgc_map)
+ {
+ m.second->controls.release ();
+ delete m.second;
+ }
+ release_vec_loop_controls (&masks.rgc_vec);
release_vec_loop_controls (&lens);
delete ivexpr_map;
delete scan_map;
@@ -1108,7 +1115,7 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
rgroup_controls *rgm;
unsigned int i;
- FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
if (rgm->type != NULL_TREE
&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
cmp_type, rgm->type,
@@ -1203,9 +1210,33 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
return false;
+ /* Produce the rgroup controls. */
+ for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
+ {
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ tree vectype = mask.first;
+ unsigned nvectors = mask.second;
+
+ if (masks->rgc_vec.length () < nvectors)
+ masks->rgc_vec.safe_grow_cleared (nvectors, true);
+ rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
+ /* The number of scalars per iteration and the number of vectors are
+ both compile-time constants. */
+ unsigned int nscalars_per_iter
+ = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+ if (rgm->max_nscalars_per_iter < nscalars_per_iter)
+ {
+ rgm->max_nscalars_per_iter = nscalars_per_iter;
+ rgm->type = truth_type_for (vectype);
+ rgm->factor = 1;
+ }
+ }
+
/* Calculate the maximum number of scalars per iteration for every rgroup. */
unsigned int max_nscalars_per_iter = 1;
- for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo))
+ for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
max_nscalars_per_iter
= MAX (max_nscalars_per_iter, rgm.max_nscalars_per_iter);
@@ -1268,10 +1299,159 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
}
if (!cmp_type)
- return false;
+ {
+ LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
+ return false;
+ }
LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
+ return true;
+}
+
+/* Each statement in LOOP_VINFO can be masked where necessary. Check
+ whether we can actually generate AVX512 style masks. Return true if so,
+ storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
+
+static bool
+vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
+{
+ /* Produce the rgroup controls organized in a hash-map (rgc_map), keyed
+ off the number of scalars per iteration and the number of mask lanes,
+ and verify in a different way that we can produce the required masks. */
+
+ /* Use a normal loop if there are no statements that need masking.
+ This only happens in rare degenerate cases: it means that the loop
+ has no loads, no stores, and no live-out values. */
+ if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
+ return false;
+
+ /* For the decrementing IV we need to represent all values in
+ [0, niter + niter_skip] where niter_skip is the number of elements we
+ skip in the first iteration for prologue peeling. */
+ tree iv_type = NULL_TREE;
+ widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+ unsigned int iv_precision = UINT_MAX;
+ if (iv_limit != -1)
+ iv_precision = wi::min_precision (iv_limit, UNSIGNED);
+
+ /* First compute the type for the IV we use to track the remaining
+ scalar iterations. */
+ opt_scalar_int_mode cmp_mode_iter;
+ FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
+ {
+ unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+ if (cmp_bits >= iv_precision
+ && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
+ {
+ iv_type = build_nonstandard_integer_type (cmp_bits, true);
+ if (iv_type)
+ break;
+ }
+ }
+ if (!iv_type)
+ return false;
+
+ /* Produce the rgroup controls. */
+ for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
+ {
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ tree vectype = mask.first;
+ unsigned nvectors = mask.second;
+
+ /* The number of scalars per iteration and the number of vectors are
+ both compile-time constants. */
+ unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ unsigned int nscalars_per_iter
+ = nvectors * nunits / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
+
+ /* We key the hash-map off nscalars_per_iter and the total number of
+ lanes in the mask vector and remember the mask type with the largest
+ number of lanes (and thus the fewest control vectors nV). */
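+ /* For example (hypothetical types): with VF = 16 a single V16QI mask
+ (nvectors = 1, 16 lanes) and two V8HI masks (nvectors = 2, 16 lanes)
+ both have nscalars_per_iter = 1 and map to the key (1, 16); the
+ remembered mask type is the 16-lane one, so the fewest control
+ vectors are needed. */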
+ bool existed;
+ rgroup_controls *& rgc
+ = masks->rgc_map.get_or_insert (std::make_pair (nscalars_per_iter,
+ nvectors * nunits),
+ &existed);
+ if (!existed)
+ {
+ rgc = new rgroup_controls ();
+ rgc->type = truth_type_for (vectype);
+ rgc->compare_type = NULL_TREE;
+ rgc->max_nscalars_per_iter = nscalars_per_iter;
+ rgc->factor = 1;
+ rgc->bias_adjusted_ctrl = NULL_TREE;
+ }
+ else
+ {
+ gcc_assert (rgc->max_nscalars_per_iter == nscalars_per_iter);
+ if (known_lt (TYPE_VECTOR_SUBPARTS (rgc->type),
+ TYPE_VECTOR_SUBPARTS (vectype)))
+ rgc->type = truth_type_for (vectype);
+ }
+ }
+
+ /* There is no fixed compare type we are going to use but we have to
+ be able to get at one for each mask group. */
+ unsigned int min_ni_width
+ = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
+
+ bool ok = true;
+ for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).rgc_map)
+ {
+ rgroup_controls *rgc = mask.second;
+ tree mask_type = rgc->type;
+ if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
+ {
+ ok = false;
+ break;
+ }
+
+ /* If iv_type is usable as compare type use that - we can elide the
+ saturation in that case. */
+ if (TYPE_PRECISION (iv_type) >= min_ni_width)
+ {
+ tree cmp_vectype
+ = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
+ if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
+ rgc->compare_type = cmp_vectype;
+ }
+ if (!rgc->compare_type)
+ FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
+ {
+ unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
+ if (cmp_bits >= min_ni_width
+ && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
+ {
+ tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
+ if (!cmp_type)
+ continue;
+
+ /* Check whether we can produce the mask with cmp_type. */
+ tree cmp_vectype
+ = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
+ if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
+ {
+ rgc->compare_type = cmp_vectype;
+ break;
+ }
+ }
+ }
+ if (!rgc->compare_type)
+ {
+ ok = false;
+ break;
+ }
+ }
+ if (!ok)
+ {
+ /* Release the heap-allocated rgroup controls before dropping the
+ map so they are not leaked. */
+ for (auto rgcm : LOOP_VINFO_MASKS (loop_vinfo).rgc_map)
+ delete rgcm.second;
+ LOOP_VINFO_MASKS (loop_vinfo).rgc_map.empty ();
+ return false;
+ }
+
+ LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
+ LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
return true;
}
@@ -1371,6 +1551,7 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+ LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
return true;
}
@@ -2712,16 +2893,24 @@ start_over:
/* If we still have the option of using partial vectors,
check whether we can generate the necessary loop controls. */
- if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && !vect_verify_full_masking (loop_vinfo)
- && !vect_verify_loop_lens (loop_vinfo))
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
+ {
+ if (!vect_verify_full_masking (loop_vinfo)
+ && !vect_verify_full_masking_avx512 (loop_vinfo))
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
+ if (!vect_verify_loop_lens (loop_vinfo))
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
/* If we're vectorizing a loop that uses length "controls" and
can iterate more than once, we apply decrementing IV approach
in loop control. */
if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+ && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
&& LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
&& !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
&& known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
@@ -3022,7 +3211,7 @@ again:
delete loop_vinfo->vector_costs;
loop_vinfo->vector_costs = nullptr;
/* Reset accumulated rgroup information. */
- release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+ release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
/* Reset assorted flags. */
LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
@@ -4362,13 +4551,67 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
cond_branch_not_taken, vect_epilogue);
/* Take care of special costs for rgroup controls of partial vectors. */
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
+ == vect_partial_vectors_avx512))
+ {
+ /* Calculate how many masks we need to generate. */
+ unsigned int num_masks = 0;
+ bool need_saturation = false;
+ for (auto rgcm : LOOP_VINFO_MASKS (loop_vinfo).rgc_map)
+ {
+ rgroup_controls *rgm = rgcm.second;
+ unsigned nvectors
+ = (rgcm.first.second
+ / TYPE_VECTOR_SUBPARTS (rgm->type).to_constant ());
+ num_masks += nvectors;
+ if (TYPE_PRECISION (TREE_TYPE (rgm->compare_type))
+ < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
+ need_saturation = true;
+ }
+
+ /* ??? The target isn't able to identify the costs below as
+ producing masks so it cannot penalize cases where we'd run
+ out of mask registers, for example. */
+
+ /* In the worst case, we need to generate each mask in the prologue
+ and in the loop body. We need one splat per group and one
+ compare per mask.
+
+ Sometimes we can use unpacks instead of generating prologue
+ masks and sometimes the prologue mask will fold to a constant,
+ so the actual prologue cost might be smaller. However, it's
+ simpler and safer to use the worst-case cost; if this ends up
+ being the tie-breaker between vectorizing or not, then it's
+ probably better not to vectorize. */
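+ /* E.g. (made-up numbers): two rgc_map entries each needing two masks
+ give num_masks = 4 compares plus rgc_map.elements () = 2 splats,
+ i.e. six vector_stmt costs added to the prologue and six to the
+ body. */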
+ (void) add_stmt_cost (target_cost_data,
+ num_masks
+ + LOOP_VINFO_MASKS (loop_vinfo).rgc_map.elements (),
+ vector_stmt, NULL, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (target_cost_data,
+ num_masks
+ + LOOP_VINFO_MASKS (loop_vinfo).rgc_map.elements (),
+ vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
+
+ /* When we need saturation we need it both in the prologue and
+ the loop body. */
+ if (need_saturation)
+ {
+ (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
+ NULL, NULL, NULL_TREE, 0, vect_prologue);
+ (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
+ NULL, NULL, NULL_TREE, 0, vect_body);
+ }
+ }
+ else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
+ == vect_partial_vectors_while_ult))
{
/* Calculate how many masks we need to generate. */
unsigned int num_masks = 0;
rgroup_controls *rgm;
unsigned int num_vectors_m1;
- FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
+ FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, num_vectors_m1, rgm)
if (rgm->type)
num_masks += num_vectors_m1 + 1;
gcc_assert (num_masks > 0);
@@ -10329,14 +10572,6 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
unsigned int nvectors, tree vectype, tree scalar_mask)
{
gcc_assert (nvectors != 0);
- if (masks->length () < nvectors)
- masks->safe_grow_cleared (nvectors, true);
- rgroup_controls *rgm = &(*masks)[nvectors - 1];
- /* The number of scalars per iteration and the number of vectors are
- both compile-time constants. */
- unsigned int nscalars_per_iter
- = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
- LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
if (scalar_mask)
{
@@ -10344,12 +10579,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
loop_vinfo->scalar_cond_masked_set.add (cond);
}
- if (rgm->max_nscalars_per_iter < nscalars_per_iter)
- {
- rgm->max_nscalars_per_iter = nscalars_per_iter;
- rgm->type = truth_type_for (vectype);
- rgm->factor = 1;
- }
+ masks->mask_set.add (std::make_pair (vectype, nvectors));
}
/* Given a complete set of masks MASKS, extract mask number INDEX
@@ -10360,46 +10590,121 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
arrangement. */
tree
-vect_get_loop_mask (loop_vec_info,
+vect_get_loop_mask (loop_vec_info loop_vinfo,
gimple_stmt_iterator *gsi, vec_loop_masks *masks,
unsigned int nvectors, tree vectype, unsigned int index)
{
- rgroup_controls *rgm = &(*masks)[nvectors - 1];
- tree mask_type = rgm->type;
-
- /* Populate the rgroup's mask array, if this is the first time we've
- used it. */
- if (rgm->controls.is_empty ())
+ if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
+ == vect_partial_vectors_while_ult)
{
- rgm->controls.safe_grow_cleared (nvectors, true);
- for (unsigned int i = 0; i < nvectors; ++i)
+ rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
+ tree mask_type = rgm->type;
+
+ /* Populate the rgroup's mask array, if this is the first time we've
+ used it. */
+ if (rgm->controls.is_empty ())
{
- tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
- /* Provide a dummy definition until the real one is available. */
- SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
- rgm->controls[i] = mask;
+ rgm->controls.safe_grow_cleared (nvectors, true);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+ rgm->controls[i] = mask;
+ }
}
- }
- tree mask = rgm->controls[index];
- if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
- TYPE_VECTOR_SUBPARTS (vectype)))
+ tree mask = rgm->controls[index];
+ if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
+ TYPE_VECTOR_SUBPARTS (vectype)))
+ {
+ /* A loop mask for data type X can be reused for data type Y
+ if X has N times more elements than Y and if Y's elements
+ are N times bigger than X's. In this case each sequence
+ of N elements in the loop mask will be all-zero or all-one.
+ We can then view-convert the mask so that each sequence of
+ N elements is replaced by a single element. */
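+ /* E.g. (illustrative): an 8-element mask created for 4-byte data
+ can serve as a 4-element mask for 8-byte data; every adjacent
+ pair of mask elements is known to be identical and collapses
+ into one wider element under the VIEW_CONVERT below. */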
+ gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
+ TYPE_VECTOR_SUBPARTS (vectype)));
+ gimple_seq seq = NULL;
+ mask_type = truth_type_for (vectype);
+ /* We can only re-use the mask by reinterpreting it if it
+ occupies the same space, that is, the mask with fewer elements
+ uses multiple bits for each masked element. */
+ gcc_assert (known_eq (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (mask)))
+ * TYPE_VECTOR_SUBPARTS (TREE_TYPE (mask)),
+ TYPE_PRECISION (TREE_TYPE (mask_type))
+ * TYPE_VECTOR_SUBPARTS (mask_type)));
+ mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ return mask;
+ }
+ else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
+ == vect_partial_vectors_avx512)
{
- /* A loop mask for data type X can be reused for data type Y
- if X has N times more elements than Y and if Y's elements
- are N times bigger than X's. In this case each sequence
- of N elements in the loop mask will be all-zero or all-one.
- We can then view-convert the mask so that each sequence of
- N elements is replaced by a single element. */
- gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
- TYPE_VECTOR_SUBPARTS (vectype)));
+ /* The number of scalars per iteration and the number of vectors are
+ both compile-time constants. */
+ unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ unsigned int nscalars_per_iter
+ = nvectors * nunits / LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
+
+ rgroup_controls *rgm
+ = *masks->rgc_map.get (std::make_pair (nscalars_per_iter,
+ nvectors * nunits));
+
+ /* The stored nV is dependent on the mask type produced. */
+ nvectors = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+ TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ();
+
+ /* Populate the rgroup's mask array, if this is the first time we've
+ used it. */
+ if (rgm->controls.is_empty ())
+ {
+ rgm->controls.safe_grow_cleared (nvectors, true);
+ for (unsigned int i = 0; i < nvectors; ++i)
+ {
+ tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
+ /* Provide a dummy definition until the real one is available. */
+ SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
+ rgm->controls[i] = mask;
+ }
+ }
+ if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
+ TYPE_VECTOR_SUBPARTS (vectype)))
+ return rgm->controls[index];
+
+ /* Split the vector if needed. Since we are dealing with integer mode
+ masks with AVX512 we can operate on the integer representation and
+ extract the sub-mask by shifting the whole vector. */
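+ /* For instance (assumed sizes): if the stored control is a 16-lane
+ mask but a 4-lane mask is requested with index 2, then factor = 4,
+ vi = 0 and vpart = 2; the integer representation is shifted right by
+ vpart * nunits = 2 * 4 = 8 bits and the low bits are reinterpreted
+ as the requested 4-lane mask. */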
+ unsigned HOST_WIDE_INT factor;
+ bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
+ TYPE_VECTOR_SUBPARTS (vectype), &factor);
+ gcc_assert (ok);
+ gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
+ tree mask_type = truth_type_for (vectype);
+ gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
+ unsigned vi = index / factor;
+ unsigned vpart = index % factor;
+ tree vec = rgm->controls[vi];
gimple_seq seq = NULL;
- mask_type = truth_type_for (vectype);
- mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
+ vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
+ lang_hooks.types.type_for_mode
+ (TYPE_MODE (rgm->type), 1), vec);
+ /* For integer mode masks simply shift the right bits into position. */
+ if (vpart != 0)
+ vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
+ build_int_cst (integer_type_node, vpart * nunits));
+ vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
+ (TYPE_MODE (mask_type), 1), vec);
+ vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
if (seq)
gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ return vec;
}
- return mask;
+ else
+ gcc_unreachable ();
}
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
@@ -300,6 +300,13 @@ public:
#define SLP_TREE_LANES(S) (S)->lanes
#define SLP_TREE_CODE(S) (S)->code
+enum vect_partial_vector_style {
+ vect_partial_vectors_none,
+ vect_partial_vectors_while_ult,
+ vect_partial_vectors_avx512,
+ vect_partial_vectors_len
+};
+
/* Key for map that records association between
scalar conditions and corresponding loop mask, and
is populated by vect_record_loop_mask. */
@@ -605,6 +612,10 @@ struct rgroup_controls {
specified number of elements; the type of the elements doesn't matter. */
tree type;
+ /* When there is no uniformly used LOOP_VINFO_RGROUP_COMPARE_TYPE this
+ is the rgroup-specific compare type used. */
+ tree compare_type;
+
/* A vector of nV controls, in iteration order. */
vec<tree> controls;
@@ -613,7 +624,24 @@ struct rgroup_controls {
tree bias_adjusted_ctrl;
};
-typedef auto_vec<rgroup_controls> vec_loop_masks;
+struct vec_loop_masks
+{
+ bool is_empty () const { return mask_set.is_empty (); }
+
+ typedef pair_hash <nofree_ptr_hash <tree_node>,
+ int_hash<unsigned, 0>> mp_hash;
+ hash_set<mp_hash> mask_set;
+
+ /* Default storage for rgroup_controls. */
+ auto_vec<rgroup_controls> rgc_vec;
+
+ /* The vect_partial_vectors_avx512 style uses a hash-map. */
+ hash_map<std::pair<unsigned /* nscalars_per_iter */,
+ unsigned /* nlanes */>, rgroup_controls *,
+ simple_hashmap_traits<pair_hash <int_hash<unsigned, 0>,
+ int_hash<unsigned, 0>>,
+ rgroup_controls *>> rgc_map;
+};
typedef auto_vec<rgroup_controls> vec_loop_lens;
@@ -741,6 +769,10 @@ public:
LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */
tree rgroup_iv_type;
+ /* The style used for implementing partial vectors when
+ LOOP_VINFO_USING_PARTIAL_VECTORS_P is true. */
+ vect_partial_vector_style partial_vector_style;
+
/* Unknown DRs according to which loop was peeled. */
class dr_vec_info *unaligned_dr;
@@ -914,6 +946,7 @@ public:
#define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters
#define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type
#define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type
+#define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
#define LOOP_VINFO_PTR_MASK(L) (L)->ptr_mask
#define LOOP_VINFO_N_STMTS(L) (L)->shared->n_stmts
#define LOOP_VINFO_LOOP_NEST(L) (L)->shared->loop_nest