@@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @samp{select_vl@var{m}}
+Set operand 0 to the number of active elements of a vector to be updated
+in a loop iteration, based on the total number of elements to be updated,
+the vectorization factor, and the vector properties of the target.
+Operand 1 is the total number of elements to be updated.
+Operand 2 is the vectorization factor.
+The value of operand 0 is target dependent and may vary from iteration
+to iteration.
+The operation of this pattern can be one of the following:
+
+@smallexample
+Case 1:
+operand0 = MIN (operand1, operand2);
+operand2 can be a const_poly_int or a poly_int related to the vector
+mode size.  Some targets, such as RISC-V, have a standalone instruction
+that computes MIN (n, MODE SIZE), which saves the use of a
+general-purpose register.
+
+In this case, only the last iteration of the loop is a partial iteration.
+@end smallexample
+
+@smallexample
+Case 2:
+if (operand1 <= operand2)
+  operand0 = operand1;
+else if (operand1 < 2 * operand2)
+  operand0 = ceil (operand1 / 2);
+else
+  operand0 = operand2;
+
+This case evenly distributes the work over the last 2 iterations of a
+stripmine loop.  For example, with operand1 = 10 and operand2 = 8, the
+last two iterations process 5 elements each instead of 8 and 2.
+@end smallexample
+
+The output of this pattern is used not only as the IV of the loop
+control counter, but also as the IV of address calculation (via a
+multiply or shift).  This allows dynamic adjustment of the number of
+elements processed in each loop iteration.
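+
+As an illustration, a strip-mined loop built around this pattern could
+look as follows (a C-like sketch; the names are illustrative only):
+
+@smallexample
+avl = n;
+while (avl > 0)
+  @{
+    vl = SELECT_VL (avl, VF);  /* 0 < vl <= VF.  */
+    /* ... process elements [n - avl, n - avl + vl) ...  */
+    avl -= vl;
+  @}
+@end smallexample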
+
@cindex @code{check_raw_ptrs@var{m}} instruction pattern
@item @samp{check_raw_ptrs@var{m}}
Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
check_raw_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
OPTAB_D (len_load_optab, "len_load_$a")
OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
@@ -385,6 +385,353 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
return false;
}
+/* Adjust the loop lens for non-SLP multiple rgroups.  In the example
+   below, VF is POLY_INT_CST [8, 8] and the destination rgroup has N = 4
+   controls, so each control covers VF/N = POLY_INT_CST [2, 2] items.
+   (X denotes _36, the total number of items for this iteration.)
+
+   _36 = MIN_EXPR <ivtmp_34, POLY_INT_CST [8, 8]>;
+
+   First length (MIN (X, VF/N)):
+     loop_len_15 = MIN_EXPR <_36, POLY_INT_CST [2, 2]>;
+
+   Second length (X - MIN (X, 1 * VF/N)):
+     loop_len_16 = _36 - loop_len_15;
+
+   Third length (X - MIN (X, 2 * VF/N)):
+     _38 = MIN_EXPR <_36, POLY_INT_CST [4, 4]>;
+     loop_len_17 = _36 - _38;
+
+   Fourth length (X - MIN (X, 3 * VF/N)):
+     _39 = MIN_EXPR <_36, POLY_INT_CST [6, 6]>;
+     loop_len_18 = _36 - _39;  */
+
+static void
+vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
+ rgroup_controls *src_rgm)
+{
+ tree ctrl_type = dest_rgm->type;
+ poly_uint64 nitems_per_ctrl
+ = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+
+ for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+ {
+ tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
+ tree dest = dest_rgm->controls[i];
+ gassign *stmt;
+ if (i == 0)
+ {
+	  /* First length: MIN (X, VF/N).  */
+ tree factor = build_int_cst (iv_type, nitems_per_ctrl);
+ stmt = gimple_build_assign (dest, MIN_EXPR, src, factor);
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ else
+ {
+	  /* Remaining length: X - MIN (X, VF*I/N).  */
+ tree factor = build_int_cst (iv_type, nitems_per_ctrl * i);
+ tree temp = make_ssa_name (iv_type);
+ stmt = gimple_build_assign (temp, MIN_EXPR, src, factor);
+ gimple_seq_add_stmt (seq, stmt);
+ stmt = gimple_build_assign (dest, MINUS_EXPR, src, temp);
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ }
+}
+
+/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
+ for all the rgroup controls in RGC and return a control that is nonzero
+ when the loop needs to iterate. Add any new preheader statements to
+ PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
+
+ RGC belongs to loop LOOP. The loop originally iterated NITERS
+ times and has been vectorized according to LOOP_VINFO.
+
+   Unlike vect_set_loop_controls_directly, which iterates a 0-based IV
+   up to TEST_LIMIT - bias, vect_set_loop_controls_by_select_vl starts
+   at IV = TEST_LIMIT - bias and keeps subtracting from the IV the
+   length calculated by the IFN_SELECT_VL pattern.
+
+   1. Single rgroup.  For a source loop such as the following (a sketch;
+      the int element type is an assumption inferred from the IR below):
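+
+	for (int i = 0; i < n; i++)
+	  A[i] = B[i];
+
+      the Gimple IR should be: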
+
+ # vectp_B.6_8 = PHI <vectp_B.6_13(6), &B(5)>
+ # vectp_B.8_16 = PHI <vectp_B.8_17(6), &B(5)>
+ # vectp_A.11_19 = PHI <vectp_A.11_20(6), &A(5)>
+ # vectp_A.13_22 = PHI <vectp_A.13_23(6), &A(5)>
+ # ivtmp_26 = PHI <ivtmp_27(6), _25(5)>
+ _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
+ ivtmp_15 = _28 * 4;
+ vect__1.10_18 = .LEN_LOAD (vectp_B.8_16, 128B, _28, 0);
+ _1 = B[i_10];
+ .LEN_STORE (vectp_A.13_22, 128B, _28, vect__1.10_18, 0);
+ i_7 = i_10 + 1;
+ vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
+ vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
+ ivtmp_27 = ivtmp_26 - _28;
+ if (ivtmp_27 != 0)
+ goto <bb 6>; [83.33%]
+ else
+ goto <bb 7>; [16.67%]
+
+   Note: We use the outcome of .SELECT_VL to adjust both the loop
+   control IV and the data reference pointer IVs.
+
+   1). The result of .SELECT_VL:
+       _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
+       _28 need not be VF in every iteration; instead, we allow _28 to be
+       any value as long as _28 <= VF.  Such a flexible SELECT_VL pattern
+       allows the target various optimizations across vector loop
+       iterations.  A target like RISC-V has a dedicated vector length
+       calculation instruction that distributes the workload evenly over
+       the last 2 iterations.
+
+       Another example: we can even allow generating _28 <= VF / 2 so
+       that some machines can run vector code in a low-power mode.
+
+ 2). Loop control IV:
+ ivtmp_27 = ivtmp_26 - _28;
+ if (ivtmp_27 != 0)
+ goto <bb 6>; [83.33%]
+ else
+ goto <bb 7>; [16.67%]
+
+       This is a saturating subtraction towards zero: the outcome of
+       .SELECT_VL ensures that ivtmp_27 never underflows zero.
+
+ 3). Data reference pointer IV:
+ ivtmp_15 = _28 * 4;
+ vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
+ vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
+
+       The pointer IVs are adjusted precisely according to the result
+       of .SELECT_VL.
+
+   2. Multiple rgroups (SLP).  For an interleaved source loop such as
+      the following (a sketch; the element types are assumptions
+      inferred from the IR below):
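+
+	void f (short *f, int *d, int n)
+	{
+	  for (int i = 0; i < n; ++i)
+	    {
+	      f[i * 2 + 0] = 1;
+	      f[i * 2 + 1] = 2;
+	      d[i] = 3;
+	    }
+	}
+
+      the Gimple IR should be: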
+
+ # i_23 = PHI <i_20(6), 0(11)>
+ # vectp_f.8_51 = PHI <vectp_f.8_52(6), f_15(D)(11)>
+ # vectp_d.10_59 = PHI <vectp_d.10_60(6), d_18(D)(11)>
+ # ivtmp_70 = PHI <ivtmp_71(6), _69(11)>
+ # ivtmp_73 = PHI <ivtmp_74(6), _67(11)>
+ _72 = MIN_EXPR <ivtmp_70, 16>;
+ _75 = MIN_EXPR <ivtmp_73, 16>;
+ _1 = i_23 * 2;
+ _2 = (long unsigned int) _1;
+ _3 = _2 * 2;
+ _4 = f_15(D) + _3;
+ _5 = _2 + 1;
+ _6 = _5 * 2;
+ _7 = f_15(D) + _6;
+ .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+ vectp_f.8_56 = vectp_f.8_51 + 16;
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+ _8 = (long unsigned int) i_23;
+ _9 = _8 * 4;
+ _10 = d_18(D) + _9;
+ _61 = _75 / 2;
+ .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+ vectp_d.10_63 = vectp_d.10_59 + 16;
+ _64 = _72 / 2;
+ .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+ i_20 = i_23 + 1;
+ vectp_f.8_52 = vectp_f.8_56 + 16;
+ vectp_d.10_60 = vectp_d.10_63 + 16;
+ ivtmp_74 = ivtmp_73 - _75;
+ ivtmp_71 = ivtmp_70 - _72;
+ if (ivtmp_74 != 0)
+ goto <bb 6>; [83.33%]
+ else
+ goto <bb 13>; [16.67%]
+
+   Note: We DO NOT use .SELECT_VL in SLP auto-vectorization for multiple
+   rgroups.  Instead, we use MIN_EXPR to guarantee that we always use VF
+   as the iteration amount for multiple rgroups.
+
+ The analysis of the flow of multiple rgroups:
+ _72 = MIN_EXPR <ivtmp_70, 16>;
+ _75 = MIN_EXPR <ivtmp_73, 16>;
+ ...
+ .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+ vectp_f.8_56 = vectp_f.8_51 + 16;
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+ ...
+ _61 = _75 / 2;
+ .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+ vectp_d.10_63 = vectp_d.10_59 + 16;
+ _64 = _72 / 2;
+ .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+
+   We use _72 = MIN_EXPR <ivtmp_70, 16>; to generate the number of
+   elements to be processed in each iteration.
+
+ The related STOREs:
+ _72 = MIN_EXPR <ivtmp_70, 16>;
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+ _64 = _72 / 2;
+ .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+   These 2 STOREs store 2 vectors, where the second vector has half as
+   many elements as the first, so the length of the second STORE is
+   _64 = _72 / 2.  This is similar to the VIEW_CONVERT handling of
+   masks in SLP.
+
+   3. Multiple rgroups for non-SLP auto-vectorization.  For a narrowing
+      source loop such as the following (a sketch; the types and names
+      are assumptions inferred from the VEC_PACK_TRUNC_EXPR below):
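+
+	for (int i = 0; i < n; i++)
+	  b[i] = (short) a[i];
+
+      the Gimple IR should be: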
+
+ # ivtmp_26 = PHI <ivtmp_27(4), _25(3)>
+ # ivtmp.35_10 = PHI <ivtmp.35_11(4), ivtmp.35_1(3)>
+ # ivtmp.36_2 = PHI <ivtmp.36_8(4), ivtmp.36_23(3)>
+ _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+ loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+ loop_len_16 = _28 - loop_len_15;
+ _29 = (void *) ivtmp.35_10;
+ _7 = &MEM <vector([4,4]) int> [(int *)_29];
+ vect__1.25_17 = .LEN_LOAD (_7, 128B, loop_len_15, 0);
+ _33 = _29 + POLY_INT_CST [16, 16];
+ _34 = &MEM <vector([4,4]) int> [(int *)_33];
+ vect__1.26_19 = .LEN_LOAD (_34, 128B, loop_len_16, 0);
+ vect__2.27_20 = VEC_PACK_TRUNC_EXPR <vect__1.25_17, vect__1.26_19>;
+ _30 = (void *) ivtmp.36_2;
+ _31 = &MEM <vector([8,8]) short int> [(short int *)_30];
+ .LEN_STORE (_31, 128B, _28, vect__2.27_20, 0);
+ ivtmp_27 = ivtmp_26 - _28;
+ ivtmp.35_11 = ivtmp.35_10 + POLY_INT_CST [32, 32];
+ ivtmp.36_8 = ivtmp.36_2 + POLY_INT_CST [16, 16];
+ if (ivtmp_27 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
+
+ The total length: _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+
+   The length of the first half vector:
+    loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+
+   The length of the second half vector:
+    loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+    loop_len_16 = _28 - loop_len_15;
+
+   1). _28 is always <= POLY_INT_CST [8, 8].
+   2). When _28 <= POLY_INT_CST [4, 4], the second half vector is not
+       processed.
+   3). When _28 > POLY_INT_CST [4, 4], the second half vector is
+       processed.
+*/
+
+static tree
+vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info loop_vinfo,
+ gimple_seq *preheader_seq,
+ gimple_seq *header_seq,
+ rgroup_controls *rgc, tree niters)
+{
+ tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* We do not allow the masked approach together with SELECT_VL.  */
+ gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+ tree ctrl_type = rgc->type;
+ unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+ poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total.  */
+ tree nitems_total = niters;
+ if (nitems_per_iter != 1)
+ {
+ /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+ these multiplications don't overflow. */
+ tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+ nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nitems_total, compare_factor);
+ }
+
+ /* Convert the comparison value to the IV type (either a no-op or
+ a promotion). */
+ nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+ /* Create an induction variable that counts the number of items
+ processed. */
+ tree index_before_incr, index_after_incr;
+ gimple_stmt_iterator incr_gsi;
+ bool insert_after;
+ standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_SELECT_VL to guarantee that.  */
+ tree test_limit = nitems_total;
+
+ /* Provide a definition of each control in the group. */
+ tree ctrl;
+ unsigned int i;
+ FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+ {
+ /* Previous controls will cover BIAS items. This control covers the
+ next batch. */
+ poly_uint64 bias = nitems_per_ctrl * i;
+ tree bias_tree = build_int_cst (iv_type, bias);
+
+ /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+ BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+ control and adjust the bound down by BIAS. */
+ tree this_test_limit = test_limit;
+ if (i != 0)
+ {
+ this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+ this_test_limit, bias_tree);
+ this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+ this_test_limit, bias_tree);
+ }
+
+ /* Create decrement IV. */
+ create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr);
+
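+      /* The maximum number of items this rgroup processes per vector
+	 iteration.  */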
+ poly_uint64 final_vf = vf * nitems_per_iter;
+ tree vf_step = build_int_cst (iv_type, final_vf);
+ tree res_len;
+ if (LOOP_VINFO_LENS (loop_vinfo).length () == 1)
+ {
+ res_len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
+ index_before_incr, vf_step);
+ LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
+ }
+ else
+ {
+	  /* For SLP, we can't allow a non-VF number of elements to be
+	     processed in a non-final iteration.  We force the number of
+	     elements processed in each non-final iteration to be VF.
+	     Allowing a non-VF number of elements in a non-final
+	     iteration would make SLP too complicated and produce
+	     inferior codegen.
+
+ For example:
+
+	     If each non-final iteration processes VF elements:
+
+ ...
+ .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+ ...
+
+	     If a non-final iteration processes a non-VF number of elements:
+
+ ...
+ .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+ if (_71 % 2 == 0)
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+ else
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
+ ...
+
+	     This is the simple case of 2-element interleaved vector SLP.
+	     With other interleaving factors, the situation becomes even
+	     more complicated.  */
+ res_len = gimple_build (header_seq, MIN_EXPR, iv_type,
+ index_before_incr, vf_step);
+ if (rgc->max_nscalars_per_iter != 1)
+ LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P (loop_vinfo) = true;
+ }
+ gassign *assign = gimple_build_assign (ctrl, res_len);
+ gimple_seq_add_stmt (header_seq, assign);
+ }
+
+ return index_after_incr;
+}
+
/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
for all the rgroup controls in RGC and return a control that is nonzero
when the loop needs to iterate. Add any new preheader statements to
@@ -704,6 +1051,10 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  bool use_vl_p = (!use_masks_p
+		   && direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
+						      OPTIMIZE_FOR_SPEED));
unsigned int compare_precision = TYPE_PRECISION (compare_type);
tree orig_niters = niters;
@@ -753,17 +1104,34 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
continue;
}
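+      /* With SELECT_VL, derive the lengths of this rgroup from the
+	 control of an earlier rgroup instead of setting them up
+	 directly (see vect_adjust_loop_lens).  */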
+ if (use_vl_p && rgc->max_nscalars_per_iter == 1
+ && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+ {
+ rgroup_controls *sub_rgc
+ = &(*controls)[nmasks / rgc->controls.length () - 1];
+ if (!sub_rgc->controls.is_empty ())
+ {
+ vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
+ continue;
+ }
+ }
+
/* See whether zero-based IV would ever generate all-false masks
or zero length before wrapping around. */
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
/* Set up all controls for this group. */
- test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
- &preheader_seq,
- &header_seq,
- loop_cond_gsi, rgc,
- niters, niters_skip,
- might_wrap_p);
+ if (use_vl_p)
+ test_ctrl
+ = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
+ &preheader_seq, &header_seq,
+ rgc, niters);
+ else
+ test_ctrl
+ = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+ &header_seq, loop_cond_gsi, rgc,
+ niters, niters_skip,
+ might_wrap_p);
}
/* Emit all accumulated statements. */
@@ -10361,15 +10361,18 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
}
/* Given a complete set of length LENS, extract length number INDEX for an
- rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
+ rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.
+ Insert any set-up statements before GSI. */
tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
- unsigned int nvectors, unsigned int index)
+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+ vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+ unsigned int index)
{
rgroup_controls *rgl = &(*lens)[nvectors - 1];
bool use_bias_adjusted_len =
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
/* Populate the rgroup's len array, if this is the first time we've
used it. */
@@ -10400,6 +10403,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
if (use_bias_adjusted_len)
return rgl->bias_adjusted_ctrl;
+ else if (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P (loop_vinfo))
+ {
+ tree loop_len = rgl->controls[index];
+ poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+ poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+ if (maybe_ne (nunits1, nunits2))
+ {
+ /* A loop len for data type X can be reused for data type Y
+ if X has N times more elements than Y and if Y's elements
+ are N times bigger than X's. */
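+	  /* E.g. a length computed for a vector of 8 shorts is divided
+	     by 2 when reused for a vector of 4 ints.  */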
+ gcc_assert (multiple_p (nunits1, nunits2));
+ unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+ gimple_seq seq = NULL;
+ loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+ build_int_cst (iv_type, factor));
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ return loop_len;
+ }
else
return rgl->controls[index];
}
@@ -3147,6 +3147,61 @@ vect_get_data_ptr_increment (vec_info *vinfo,
return iv_step;
}
+/* Prepare a pointer IV which needs to be updated by a variable amount.
+   The variable amount is the outcome of .SELECT_VL.  In this case, each
+   iteration may process a flexible number of elements, as long as that
+   number is <= VF elements.
+
+   Return the data reference pointer according to SELECT_VL.
+   If new statements are needed, insert them before GSI.  */
+
+static tree
+get_select_vl_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
+ tree aggr_type, class loop *at_loop, tree offset,
+ tree *dummy, gimple_stmt_iterator *gsi,
+ bool simd_lane_access_p, vec_loop_lens *loop_lens,
+ dr_vec_info *dr_info,
+ vect_memory_access_type memory_access_type)
+{
+ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+ tree step = vect_dr_behavior (vinfo, dr_info)->step;
+
+  /* TODO: We don't yet support gather/scatter or load_lanes/store_lanes
+     when the pointer IVs are updated by a variable amount, but we may
+     support them in the future.  */
+ gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+ && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+  /* When the SELECT_VL pattern is in use, we dynamically adjust the
+     memory address by the result of .SELECT_VL.
+
+     The result of .SELECT_VL is the number of elements to be processed
+     in each iteration, so the memory address adjustment operation is:
+
+       bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+       addr = addr + .SELECT_VL (ARG..) * bytesize;  */
+ gimple *ptr_incr;
+ tree loop_len
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0);
+ tree len_type = TREE_TYPE (loop_len);
+ poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+  /* Since the outcome of .SELECT_VL is a number of elements, scale it by
+     the element size in bytes so that it can be used to bump the pointer
+     IV by a variable amount.  */
+ tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+ build_int_cst (len_type, bytesize));
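+  /* For a negative step the pointer is bumped downwards.  */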
+ if (tree_int_cst_sgn (step) == -1)
+ tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);
+ tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+ gassign *assign = gimple_build_assign (bump, tmp);
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+ return vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop, offset,
+ dummy, gsi, &ptr_incr, simd_lane_access_p,
+ bump);
+}
+
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}. */
static bool
@@ -8547,6 +8602,14 @@ vectorizable_store (vec_info *vinfo,
vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
slp_node, &gs_info, &dataref_ptr,
&vec_offsets);
+ else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+ && memory_access_type != VMAT_INVARIANT)
+ dataref_ptr
+ = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+ simd_lane_access_p ? loop : NULL,
+ offset, &dummy, gsi,
+ simd_lane_access_p, loop_lens,
+ dr_info, memory_access_type);
else
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -8795,8 +8858,9 @@ vectorizable_store (vec_info *vinfo,
else if (loop_lens)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies, vec_num * j + i);
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
machine_mode vmode = TYPE_MODE (vectype);
opt_machine_mode new_ovmode
@@ -9935,6 +9999,13 @@ vectorizable_load (vec_info *vinfo,
slp_node, &gs_info, &dataref_ptr,
&vec_offsets);
}
+ else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+ && memory_access_type != VMAT_INVARIANT)
+ dataref_ptr
+ = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+ at_loop, offset, &dummy, gsi,
+ simd_lane_access_p, loop_lens,
+ dr_info, memory_access_type);
else
dataref_ptr
= vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -10151,8 +10222,8 @@ vectorizable_load (vec_info *vinfo,
else if (loop_lens && memory_access_type != VMAT_INVARIANT)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies,
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
vec_num * j + i);
tree ptr = build_int_cst (ref_type,
align * BITS_PER_UNIT);
@@ -818,6 +818,13 @@ public:
the vector loop can handle fewer than VF scalars. */
bool using_partial_vectors_p;
+  /* True if we've decided to use SELECT_VL to compute the number of
+     active elements processed in each iteration of the vector loop.  */
+ bool using_select_vl_p;
+
+  /* True if we use an adjusted loop length for SLP.  */
+ bool using_slp_adjusted_len_p;
+
/* True if we've decided to use partially-populated vectors for the
epilogue of loop. */
bool epil_using_partial_vectors_p;
@@ -890,6 +897,8 @@ public:
#define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
+#define LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P(L) (L)->using_slp_adjusted_len_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
(L)->epil_using_partial_vectors_p
#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
@@ -2293,7 +2302,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
+ vec_loop_lens *, unsigned int, tree,
unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);