[V7] VECT: Add decrement IV support in Loop Vectorizer
Checks
Commit Message
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
This patch implement decrement IV for length approach in loop control.
Address comment from kewen that incorporate the implementation inside
"vect_set_loop_controls_directly" instead of a standalone function.
Address comment from Richard using MIN_EXPR to handle these 3 following
cases
1. single rgroup.
2. multiple rgroup for SLP.
3. multiple rgroup for non-SLP (tested on vec_pack_trunc).
Bootstraped && Regression on x86.
Ok for trunk ?
gcc/ChangeLog:
* tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
(vect_set_loop_controls_directly): Add decrement IV support.
(vect_set_loop_condition_partial_vectors): Ditto.
* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new variable.
(vect_get_loop_len): Add decrement IV support.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.
* tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
(vect_get_loop_len): Add decrement IV support.
---
gcc/tree-vect-loop-manip.cc | 177 +++++++++++++++++++++++++++++++++++-
gcc/tree-vect-loop.cc | 38 +++++++-
gcc/tree-vect-stmts.cc | 9 +-
gcc/tree-vectorizer.h | 13 ++-
4 files changed, 224 insertions(+), 13 deletions(-)
Comments
juzhe.zhong@rivai.ai writes:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch implement decrement IV for length approach in loop control.
>
> Address comment from kewen that incorporate the implementation inside
> "vect_set_loop_controls_directly" instead of a standalone function.
>
> Address comment from Richard using MIN_EXPR to handle these 3 following
> cases
> 1. single rgroup.
> 2. multiple rgroup for SLP.
> 3. multiple rgroup for non-SLP (tested on vec_pack_trunc).
Thanks, this looks pretty reasonable to me FWIW, but some comments below:
> Bootstraped && Regression on x86.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
> (vect_set_loop_controls_directly): Add decrement IV support.
> (vect_set_loop_condition_partial_vectors): Ditto.
> * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new variable.
> (vect_get_loop_len): Add decrement IV support.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
> (vect_get_loop_len): Add decrement IV support.
>
> ---
> gcc/tree-vect-loop-manip.cc | 177 +++++++++++++++++++++++++++++++++++-
> gcc/tree-vect-loop.cc | 38 +++++++-
> gcc/tree-vect-stmts.cc | 9 +-
> gcc/tree-vectorizer.h | 13 ++-
> 4 files changed, 224 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..1baac7b1b52 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,58 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
> return false;
> }
>
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> + _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> + First length (MIN (X, VF/N)):
> + loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> + Second length:
> + tmp = _36 - loop_len_15;
> + loop_len_16 = MIN (tmp, VF/N);
> +
> + Third length:
> + tmp2 = tmp - loop_len_16;
> + loop_len_17 = MIN (tmp2, VF/N);
> +
> + Forth length:
> + tmp3 = tmp2 - loop_len_17;
> + loop_len_18 = MIN (tmp3, VF/N); */
> +
> +static void
> +vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
> + rgroup_controls *src_rgm)
> +{
> + tree ctrl_type = dest_rgm->type;
> + poly_uint64 nitems_per_ctrl
> + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +
> + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> + {
> + tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
> + tree dest = dest_rgm->controls[i];
> + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> + gassign *stmt;
> + if (i == 0)
> + {
> + /* MIN (X, VF*I/N) capped to the range [0, VF/N]. */
> + stmt = gimple_build_assign (dest, MIN_EXPR, src, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + else
> + {
> + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
> + tree temp = make_ssa_name (iv_type);
> + stmt = gimple_build_assign (temp, MINUS_EXPR, src,
> + dest_rgm->controls[i - 1]);
> + gimple_seq_add_stmt (seq, stmt);
> + stmt = gimple_build_assign (dest, MIN_EXPR, temp, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + }
> +}
> +
> /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
> for all the rgroup controls in RGC and return a control that is nonzero
> when the loop needs to iterate. Add any new preheader statements to
> @@ -468,9 +520,10 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> - loop, &incr_gsi, insert_after, &index_before_incr,
> - &index_after_incr);
> + if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> + loop, &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
>
> tree zero_index = build_int_cst (compare_type, 0);
> tree test_index, test_limit, first_limit;
> @@ -552,8 +605,13 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> /* Convert the IV value to the comparison type (either a no-op or
> a demotion). */
> gimple_seq test_seq = NULL;
> - test_index = gimple_convert (&test_seq, compare_type, test_index);
> - gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + test_limit = gimple_convert (preheader_seq, iv_type, nitems_total);
> + else
> + {
> + test_index = gimple_convert (&test_seq, compare_type, test_index);
> + gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + }
>
> /* Provide a definition of each control in the group. */
> tree next_ctrl = NULL_TREE;
> @@ -587,6 +645,101 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> bias_tree);
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + /* Create decrement IV:
> +
> + Case 1 (single rgroup):
> + ...
> + _10 = (unsigned long) count_12(D);
> + ...
> + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> + ...
> + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> + ...
> + ivtmp_35 = ivtmp_9 - _36;
> + ...
> + if (ivtmp_35 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
> +
> + Case 2 (SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + _39 = _38 * 2;
> + _40 = MAX_EXPR <_39, 16>;
> + _41 = _40 + 18446744073709551600;
Easier to read as:
_41 = _40 - 16
(which might not be valid gimple, but pseudocode is good enough).
> + ...
> + # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
> + # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
> + ...
> + _44 = MIN_EXPR <ivtmp_42, 32>;
> + _47 = MIN_EXPR <ivtmp_45, 32>;
> + ...
> + .LEN_STORE (_6, 8B, _47, ...);
> + ...
> + .LEN_STORE (_25, 8B, _44, ...);
> + _33 = _47 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = _44 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_46 = ivtmp_45 - _47;
> + ivtmp_43 = ivtmp_42 - _44;
> + ...
> + if (ivtmp_46 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
The examples are good, but this one made me wonder: why is the
adjustment made to the limit (namely 16, the gap between _39 and _41)
different from the limits imposed by the MIN_EXPR (32)? And I think
the answer is that:
- _47 counts the number of elements processed by the loop in total,
including the vectors under the control of _44
- _44 counts the number of elements controlled by _47 in the next
iteration of the vector loop (if there is one)
And that's needed to allow the IVs to be updated independently.
The difficulty with this is that the len_load* and len_store*
optabs currently say that the behaviour is undefined if the
length argument is greater than the length of a vector.
So I think using these values of _47 and _44 in the .LEN_STOREs
is relying on undefined behaviour.
Haven't had time to think about the consequences of that yet,
but wanted to send something out sooner rather than later.
> +
> + Case 3 (non-SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + ...
> + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> + ...
> + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> + _41 = _40 - loop_len_21;
> + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> + _42 = _40 - loop_len_20;
> + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> + _43 = _40 - loop_len_19;
> + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> + ...
> + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> + ...
> + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> + ...
> + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> + ...
> + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <vect__4.10_28, vect__4.11_30>;
> + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <vect__7.13_31, vect__7.13_32>;
> + ...
> + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> + ivtmp_39 = ivtmp_38 - _40;
> + ...
> + if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> + else
> + goto <bb 4>; [7.69%]
> + */
> + create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop,
> + &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
> + tree step = gimple_build (header_seq, MIN_EXPR, iv_type,
> + index_before_incr, nitems_step);
> + gassign *assign = gimple_build_assign (ctrl, step);
> + gimple_seq_add_stmt (header_seq, assign);
> + next_ctrl = index_after_incr;
> + continue;
> + }
> +
> /* Create the initial control. First include all items that
> are within the loop limit. */
> tree init_ctrl = NULL_TREE;
> @@ -704,6 +857,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>
> bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> unsigned int compare_precision = TYPE_PRECISION (compare_type);
> tree orig_niters = niters;
>
> @@ -753,6 +907,19 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> continue;
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> + && rgc->max_nscalars_per_iter == 1
> + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> + {
> + rgroup_controls *sub_rgc
> + = &(*controls)[nmasks / rgc->controls.length () - 1];
> + if (!sub_rgc->controls.is_empty ())
> + {
> + vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
> + continue;
> + }
> + }
> +
> /* See whether zero-based IV would ever generate all-false masks
> or zero length before wrapping around. */
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ed0166fedab..44c0829a823 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
> vectorizable (false),
> can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> using_partial_vectors_p (false),
> + using_decrementing_iv_p (false),
> epil_using_partial_vectors_p (false),
> partial_load_store_bias (0),
> peeling_for_gaps (false),
> @@ -2725,6 +2726,17 @@ start_over:
> && !vect_verify_loop_lens (loop_vinfo))
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>
> + /* If we're vectorizing an loop that uses length "controls" and
> + can iterate more than once, we apply decrementing IV approach
> + in loop control. */
> + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
> + && LOOP_VINFO_INT_NITERS (loop_vinfo)
> + <= LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
It would be better to use known_le here, without checking whether the
VF is constant.
Thanks,
Richard
> + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
> /* If we're vectorizing an epilogue loop, the vectorized loop either needs
> to be able to handle fewer than VF scalars, or needs to have a lower VF
> than the main loop. */
> @@ -10364,12 +10376,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
>
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> - unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> + vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> + unsigned int index)
> {
> rgroup_controls *rgl = &(*lens)[nvectors - 1];
> bool use_bias_adjusted_len =
> LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>
> /* Populate the rgroup's len array, if this is the first time we've
> used it. */
> @@ -10400,6 +10414,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>
> if (use_bias_adjusted_len)
> return rgl->bias_adjusted_ctrl;
> + else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + tree loop_len = rgl->controls[index];
> + poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> + poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> + if (maybe_ne (nunits1, nunits2))
> + {
> + /* A loop len for data type X can be reused for data type Y
> + if X has N times more elements than Y and if Y's elements
> + are N times bigger than X's. */
> + gcc_assert (multiple_p (nunits1, nunits2));
> + unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> + gimple_seq seq = NULL;
> + loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> + build_int_cst (iv_type, factor));
> + if (seq)
> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> + return loop_len;
> + }
> else
> return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7313191b0db..b5e4bc59355 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8795,8 +8795,9 @@ vectorizable_store (vec_info *vinfo,
> else if (loop_lens)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
> tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> machine_mode vmode = TYPE_MODE (vectype);
> opt_machine_mode new_ovmode
> @@ -10151,8 +10152,8 @@ vectorizable_load (vec_info *vinfo,
> else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies,
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
> align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..8af3b35324e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,13 @@ public:
> the vector loop can handle fewer than VF scalars. */
> bool using_partial_vectors_p;
>
> + /* True if we've decided to use a decrementing loop control IV that counts
> + scalars. This can be done for any loop that:
> +
> + (a) uses length "controls"; and
> + (b) can iterate more than once. */
> + bool using_decrementing_iv_p;
> +
> /* True if we've decided to use partially-populated vectors for the
> epilogue of loop. */
> bool epil_using_partial_vectors_p;
> @@ -890,6 +897,7 @@ public:
> #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
> #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
> #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
> (L)->epil_using_partial_vectors_p
> #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
> @@ -2293,8 +2301,9 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> - unsigned int);
> +extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> + vec_loop_lens *, unsigned int, tree,
> + unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
Hi, Richard.
>> Easier to read as:
>> _41 = _40 - 16
>> (which might not be valid gimple, but pseudocode is good enough).
OK.
>> The difficulty with this is that the len_load* and len_store*
>>optabs currently say that the behaviour is undefined if the
>>length argument is greater than the length of a vector.
>>So I think using these values of _47 and _44 in the .LEN_STOREs
>>is relying on undefined behaviour.
>>Haven't had time to think about the consequences of that yet,
>>but wanted to send something out sooner rather than later.
Yes, we have tail agnostic (TA) in vsevli which is make tail element
undefined value. The current optabs behavior matches the RVV specification.
I think maybe we can leave it to be carefully solved in the future. Currently,
I don't see the issue yet so far.
>>It would be better to use known_le here, without checking whether the
>>VF is constant.
Ok
Thank you so much for your patience helping this patch.
I have sent V8 patch with fixes as you suggested:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618638.html
Can I merge this patch?
I am gonna post the next patch with select_vl included.
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 03:44
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
juzhe.zhong@rivai.ai writes:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch implement decrement IV for length approach in loop control.
>
> Address comment from kewen that incorporate the implementation inside
> "vect_set_loop_controls_directly" instead of a standalone function.
>
> Address comment from Richard using MIN_EXPR to handle these 3 following
> cases
> 1. single rgroup.
> 2. multiple rgroup for SLP.
> 3. multiple rgroup for non-SLP (tested on vec_pack_trunc).
Thanks, this looks pretty reasonable to me FWIW, but some comments below:
> Bootstraped && Regression on x86.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
> (vect_set_loop_controls_directly): Add decrement IV support.
> (vect_set_loop_condition_partial_vectors): Ditto.
> * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new variable.
> (vect_get_loop_len): Add decrement IV support.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
> (vect_get_loop_len): Add decrement IV support.
>
> ---
> gcc/tree-vect-loop-manip.cc | 177 +++++++++++++++++++++++++++++++++++-
> gcc/tree-vect-loop.cc | 38 +++++++-
> gcc/tree-vect-stmts.cc | 9 +-
> gcc/tree-vectorizer.h | 13 ++-
> 4 files changed, 224 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..1baac7b1b52 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,58 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
> return false;
> }
>
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> + _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> + First length (MIN (X, VF/N)):
> + loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> + Second length:
> + tmp = _36 - loop_len_15;
> + loop_len_16 = MIN (tmp, VF/N);
> +
> + Third length:
> + tmp2 = tmp - loop_len_16;
> + loop_len_17 = MIN (tmp2, VF/N);
> +
> + Forth length:
> + tmp3 = tmp2 - loop_len_17;
> + loop_len_18 = MIN (tmp3, VF/N); */
> +
> +static void
> +vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
> + rgroup_controls *src_rgm)
> +{
> + tree ctrl_type = dest_rgm->type;
> + poly_uint64 nitems_per_ctrl
> + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +
> + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> + {
> + tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
> + tree dest = dest_rgm->controls[i];
> + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> + gassign *stmt;
> + if (i == 0)
> + {
> + /* MIN (X, VF*I/N) capped to the range [0, VF/N]. */
> + stmt = gimple_build_assign (dest, MIN_EXPR, src, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + else
> + {
> + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
> + tree temp = make_ssa_name (iv_type);
> + stmt = gimple_build_assign (temp, MINUS_EXPR, src,
> + dest_rgm->controls[i - 1]);
> + gimple_seq_add_stmt (seq, stmt);
> + stmt = gimple_build_assign (dest, MIN_EXPR, temp, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + }
> +}
> +
> /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
> for all the rgroup controls in RGC and return a control that is nonzero
> when the loop needs to iterate. Add any new preheader statements to
> @@ -468,9 +520,10 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> - loop, &incr_gsi, insert_after, &index_before_incr,
> - &index_after_incr);
> + if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> + loop, &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
>
> tree zero_index = build_int_cst (compare_type, 0);
> tree test_index, test_limit, first_limit;
> @@ -552,8 +605,13 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> /* Convert the IV value to the comparison type (either a no-op or
> a demotion). */
> gimple_seq test_seq = NULL;
> - test_index = gimple_convert (&test_seq, compare_type, test_index);
> - gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + test_limit = gimple_convert (preheader_seq, iv_type, nitems_total);
> + else
> + {
> + test_index = gimple_convert (&test_seq, compare_type, test_index);
> + gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + }
>
> /* Provide a definition of each control in the group. */
> tree next_ctrl = NULL_TREE;
> @@ -587,6 +645,101 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> bias_tree);
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + /* Create decrement IV:
> +
> + Case 1 (single rgroup):
> + ...
> + _10 = (unsigned long) count_12(D);
> + ...
> + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> + ...
> + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> + ...
> + ivtmp_35 = ivtmp_9 - _36;
> + ...
> + if (ivtmp_35 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
> +
> + Case 2 (SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + _39 = _38 * 2;
> + _40 = MAX_EXPR <_39, 16>;
> + _41 = _40 + 18446744073709551600;
Easier to read as:
_41 = _40 - 16
(which might not be valid gimple, but pseudocode is good enough).
> + ...
> + # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
> + # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
> + ...
> + _44 = MIN_EXPR <ivtmp_42, 32>;
> + _47 = MIN_EXPR <ivtmp_45, 32>;
> + ...
> + .LEN_STORE (_6, 8B, _47, ...);
> + ...
> + .LEN_STORE (_25, 8B, _44, ...);
> + _33 = _47 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = _44 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_46 = ivtmp_45 - _47;
> + ivtmp_43 = ivtmp_42 - _44;
> + ...
> + if (ivtmp_46 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
The examples are good, but this one made me wonder: why is the
adjustment made to the limit (namely 16, the gap between _39 and _41)
different from the limits imposed by the MIN_EXPR (32)? And I think
the answer is that:
- _47 counts the number of elements processed by the loop in total,
including the vectors under the control of _44
- _44 counts the number of elements controlled by _47 in the next
iteration of the vector loop (if there is one)
And that's needed to allow the IVs to be updated independently.
The difficulty with this is that the len_load* and len_store*
optabs currently say that the behaviour is undefined if the
length argument is greater than the length of a vector.
So I think using these values of _47 and _44 in the .LEN_STOREs
is relying on undefined behaviour.
Haven't had time to think about the consequences of that yet,
but wanted to send something out sooner rather than later.
> +
> + Case 3 (non-SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + ...
> + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> + ...
> + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> + _41 = _40 - loop_len_21;
> + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> + _42 = _40 - loop_len_20;
> + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> + _43 = _40 - loop_len_19;
> + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> + ...
> + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> + ...
> + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> + ...
> + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> + ...
> + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <vect__4.10_28, vect__4.11_30>;
> + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <vect__7.13_31, vect__7.13_32>;
> + ...
> + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> + ivtmp_39 = ivtmp_38 - _40;
> + ...
> + if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> + else
> + goto <bb 4>; [7.69%]
> + */
> + create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop,
> + &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
> + tree step = gimple_build (header_seq, MIN_EXPR, iv_type,
> + index_before_incr, nitems_step);
> + gassign *assign = gimple_build_assign (ctrl, step);
> + gimple_seq_add_stmt (header_seq, assign);
> + next_ctrl = index_after_incr;
> + continue;
> + }
> +
> /* Create the initial control. First include all items that
> are within the loop limit. */
> tree init_ctrl = NULL_TREE;
> @@ -704,6 +857,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>
> bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> unsigned int compare_precision = TYPE_PRECISION (compare_type);
> tree orig_niters = niters;
>
> @@ -753,6 +907,19 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> continue;
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> + && rgc->max_nscalars_per_iter == 1
> + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> + {
> + rgroup_controls *sub_rgc
> + = &(*controls)[nmasks / rgc->controls.length () - 1];
> + if (!sub_rgc->controls.is_empty ())
> + {
> + vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
> + continue;
> + }
> + }
> +
> /* See whether zero-based IV would ever generate all-false masks
> or zero length before wrapping around. */
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ed0166fedab..44c0829a823 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
> vectorizable (false),
> can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> using_partial_vectors_p (false),
> + using_decrementing_iv_p (false),
> epil_using_partial_vectors_p (false),
> partial_load_store_bias (0),
> peeling_for_gaps (false),
> @@ -2725,6 +2726,17 @@ start_over:
> && !vect_verify_loop_lens (loop_vinfo))
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>
> + /* If we're vectorizing an loop that uses length "controls" and
> + can iterate more than once, we apply decrementing IV approach
> + in loop control. */
> + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
> + && LOOP_VINFO_INT_NITERS (loop_vinfo)
> + <= LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
It would be better to use known_le here, without checking whether the
VF is constant.
Thanks,
Richard
> + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
> /* If we're vectorizing an epilogue loop, the vectorized loop either needs
> to be able to handle fewer than VF scalars, or needs to have a lower VF
> than the main loop. */
> @@ -10364,12 +10376,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
>
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> - unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> + vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> + unsigned int index)
> {
> rgroup_controls *rgl = &(*lens)[nvectors - 1];
> bool use_bias_adjusted_len =
> LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>
> /* Populate the rgroup's len array, if this is the first time we've
> used it. */
> @@ -10400,6 +10414,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>
> if (use_bias_adjusted_len)
> return rgl->bias_adjusted_ctrl;
> + else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + tree loop_len = rgl->controls[index];
> + poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> + poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> + if (maybe_ne (nunits1, nunits2))
> + {
> + /* A loop len for data type X can be reused for data type Y
> + if X has N times more elements than Y and if Y's elements
> + are N times bigger than X's. */
> + gcc_assert (multiple_p (nunits1, nunits2));
> + unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> + gimple_seq seq = NULL;
> + loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> + build_int_cst (iv_type, factor));
> + if (seq)
> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> + return loop_len;
> + }
> else
> return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7313191b0db..b5e4bc59355 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8795,8 +8795,9 @@ vectorizable_store (vec_info *vinfo,
> else if (loop_lens)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
> tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> machine_mode vmode = TYPE_MODE (vectype);
> opt_machine_mode new_ovmode
> @@ -10151,8 +10152,8 @@ vectorizable_load (vec_info *vinfo,
> else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies,
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
> align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..8af3b35324e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,13 @@ public:
> the vector loop can handle fewer than VF scalars. */
> bool using_partial_vectors_p;
>
> + /* True if we've decided to use a decrementing loop control IV that counts
> + scalars. This can be done for any loop that:
> +
> + (a) uses length "controls"; and
> + (b) can iterate more than once. */
> + bool using_decrementing_iv_p;
> +
> /* True if we've decided to use partially-populated vectors for the
> epilogue of loop. */
> bool epil_using_partial_vectors_p;
> @@ -890,6 +897,7 @@ public:
> #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
> #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
> #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
> (L)->epil_using_partial_vectors_p
> #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
> @@ -2293,8 +2301,9 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> - unsigned int);
> +extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> + vec_loop_lens *, unsigned int, tree,
> + unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
>> The examples are good, but this one made me wonder: why is the
>> adjustment made to the limit (namely 16, the gap between _39 and _41)
>> different from the limits imposed by the MIN_EXPR (32)? And I think
>> the answer is that:
>> - _47 counts the number of elements processed by the loop in total,
>> including the vectors under the control of _44
>> - _44 counts the number of elements controlled by _47 in the next
>> iteration of the vector loop (if there is one)
>> And that's needed to allow the IVs to be updated independently.
>> The difficulty with this is that the len_load* and len_store*
>> optabs currently say that the behaviour is undefined if the
>> length argument is greater than the length of a vector.
>> So I think using these values of _47 and _44 in the .LEN_STOREs
>> is relying on undefined behaviour.
>> Haven't had time to think about the consequences of that yet,
>> but wanted to send something out sooner rather than later.
Hi, Richard. I totally understand your concern now. I think the undefine behavior is more
appropriate for RVV since we have vsetvli instruction that gurantee this will cause potential
issues. However, for some other target, we may need to use additional MIN_EXPR to guard
the length never over VF. I think it can be addressed in the future when it is needed.
For now, is it OK for trunk the V9 patch?
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618638.html
which has fixed the comments as you suggested.
Besides, we will going to add more patterns has length included:
len_mask_load/len_mask_stores, len_mask_gather_load/ len_cond...etc
They are all undefine behavior for length larger than the vector length.
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 03:44
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
juzhe.zhong@rivai.ai writes:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch implement decrement IV for length approach in loop control.
>
> Address comment from kewen that incorporate the implementation inside
> "vect_set_loop_controls_directly" instead of a standalone function.
>
> Address comment from Richard using MIN_EXPR to handle these 3 following
> cases
> 1. single rgroup.
> 2. multiple rgroup for SLP.
> 3. multiple rgroup for non-SLP (tested on vec_pack_trunc).
Thanks, this looks pretty reasonable to me FWIW, but some comments below:
> Bootstraped && Regression on x86.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
> (vect_set_loop_controls_directly): Add decrement IV support.
> (vect_set_loop_condition_partial_vectors): Ditto.
> * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Add a new variable.
> (vect_get_loop_len): Add decrement IV support.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (LOOP_VINFO_USING_DECREMENTING_IV_P): New macro.
> (vect_get_loop_len): Add decrement IV support.
>
> ---
> gcc/tree-vect-loop-manip.cc | 177 +++++++++++++++++++++++++++++++++++-
> gcc/tree-vect-loop.cc | 38 +++++++-
> gcc/tree-vect-stmts.cc | 9 +-
> gcc/tree-vectorizer.h | 13 ++-
> 4 files changed, 224 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..1baac7b1b52 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -385,6 +385,58 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
> return false;
> }
>
> +/* Try to use adjust loop lens for non-SLP multiple-rgroups.
> +
> + _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> + First length (MIN (X, VF/N)):
> + loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> + Second length:
> + tmp = _36 - loop_len_15;
> + loop_len_16 = MIN (tmp, VF/N);
> +
> + Third length:
> + tmp2 = tmp - loop_len_16;
> + loop_len_17 = MIN (tmp2, VF/N);
> +
> + Forth length:
> + tmp3 = tmp2 - loop_len_17;
> + loop_len_18 = MIN (tmp3, VF/N); */
> +
> +static void
> +vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
> + rgroup_controls *src_rgm)
> +{
> + tree ctrl_type = dest_rgm->type;
> + poly_uint64 nitems_per_ctrl
> + = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +
> + for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> + {
> + tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
> + tree dest = dest_rgm->controls[i];
> + tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> + gassign *stmt;
> + if (i == 0)
> + {
> + /* MIN (X, VF*I/N) capped to the range [0, VF/N]. */
> + stmt = gimple_build_assign (dest, MIN_EXPR, src, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + else
> + {
> + /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
> + tree temp = make_ssa_name (iv_type);
> + stmt = gimple_build_assign (temp, MINUS_EXPR, src,
> + dest_rgm->controls[i - 1]);
> + gimple_seq_add_stmt (seq, stmt);
> + stmt = gimple_build_assign (dest, MIN_EXPR, temp, length_limit);
> + gimple_seq_add_stmt (seq, stmt);
> + }
> + }
> +}
> +
> /* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
> for all the rgroup controls in RGC and return a control that is nonzero
> when the loop needs to iterate. Add any new preheader statements to
> @@ -468,9 +520,10 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> - create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> - loop, &incr_gsi, insert_after, &index_before_incr,
> - &index_after_incr);
> + if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
> + loop, &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
>
> tree zero_index = build_int_cst (compare_type, 0);
> tree test_index, test_limit, first_limit;
> @@ -552,8 +605,13 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> /* Convert the IV value to the comparison type (either a no-op or
> a demotion). */
> gimple_seq test_seq = NULL;
> - test_index = gimple_convert (&test_seq, compare_type, test_index);
> - gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + test_limit = gimple_convert (preheader_seq, iv_type, nitems_total);
> + else
> + {
> + test_index = gimple_convert (&test_seq, compare_type, test_index);
> + gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
> + }
>
> /* Provide a definition of each control in the group. */
> tree next_ctrl = NULL_TREE;
> @@ -587,6 +645,101 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> bias_tree);
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + /* Create decrement IV:
> +
> + Case 1 (single rgroup):
> + ...
> + _10 = (unsigned long) count_12(D);
> + ...
> + # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> + _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> + ...
> + vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> + ...
> + ivtmp_35 = ivtmp_9 - _36;
> + ...
> + if (ivtmp_35 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
> +
> + Case 2 (SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + _39 = _38 * 2;
> + _40 = MAX_EXPR <_39, 16>;
> + _41 = _40 + 18446744073709551600;
Easier to read as:
_41 = _40 - 16
(which might not be valid gimple, but pseudocode is good enough).
> + ...
> + # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
> + # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
> + ...
> + _44 = MIN_EXPR <ivtmp_42, 32>;
> + _47 = MIN_EXPR <ivtmp_45, 32>;
> + ...
> + .LEN_STORE (_6, 8B, _47, ...);
> + ...
> + .LEN_STORE (_25, 8B, _44, ...);
> + _33 = _47 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = _44 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_46 = ivtmp_45 - _47;
> + ivtmp_43 = ivtmp_42 - _44;
> + ...
> + if (ivtmp_46 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
The examples are good, but this one made me wonder: why is the
adjustment made to the limit (namely 16, the gap between _39 and _41)
different from the limits imposed by the MIN_EXPR (32)? And I think
the answer is that:
- _47 counts the number of elements processed by the loop in total,
including the vectors under the control of _44
- _44 counts the number of elements controlled by _47 in the next
iteration of the vector loop (if there is one)
And that's needed to allow the IVs to be updated independently.
The difficulty with this is that the len_load* and len_store*
optabs currently say that the behaviour is undefined if the
length argument is greater than the length of a vector.
So I think using these values of _47 and _44 in the .LEN_STOREs
is relying on undefined behaviour.
Haven't had time to think about the consequences of that yet,
but wanted to send something out sooner rather than later.
> +
> + Case 3 (non-SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + ...
> + # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> + ...
> + _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> + loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> + _41 = _40 - loop_len_21;
> + loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> + _42 = _40 - loop_len_20;
> + loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> + _43 = _40 - loop_len_19;
> + loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> + ...
> + vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> + ...
> + vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> + ...
> + vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> + ...
> + vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> + vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
> + vect__7.13_32 = VEC_PACK_TRUNC_EXPR <vect__4.10_28, vect__4.11_30>;
> + vect__7.12_33 = VEC_PACK_TRUNC_EXPR <vect__7.13_31, vect__7.13_32>;
> + ...
> + .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> + ivtmp_39 = ivtmp_38 - _40;
> + ...
> + if (ivtmp_39 != 0)
> + goto <bb 3>; [92.31%]
> + else
> + goto <bb 4>; [7.69%]
> + */
> + create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop,
> + &incr_gsi, insert_after, &index_before_incr,
> + &index_after_incr);
> + tree step = gimple_build (header_seq, MIN_EXPR, iv_type,
> + index_before_incr, nitems_step);
> + gassign *assign = gimple_build_assign (ctrl, step);
> + gimple_seq_add_stmt (header_seq, assign);
> + next_ctrl = index_after_incr;
> + continue;
> + }
> +
> /* Create the initial control. First include all items that
> are within the loop limit. */
> tree init_ctrl = NULL_TREE;
> @@ -704,6 +857,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
>
> bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> unsigned int compare_precision = TYPE_PRECISION (compare_type);
> tree orig_niters = niters;
>
> @@ -753,6 +907,19 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
> continue;
> }
>
> + if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> + && rgc->max_nscalars_per_iter == 1
> + && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
> + {
> + rgroup_controls *sub_rgc
> + = &(*controls)[nmasks / rgc->controls.length () - 1];
> + if (!sub_rgc->controls.is_empty ())
> + {
> + vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
> + continue;
> + }
> + }
> +
> /* See whether zero-based IV would ever generate all-false masks
> or zero length before wrapping around. */
> bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index ed0166fedab..44c0829a823 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
> vectorizable (false),
> can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
> using_partial_vectors_p (false),
> + using_decrementing_iv_p (false),
> epil_using_partial_vectors_p (false),
> partial_load_store_bias (0),
> peeling_for_gaps (false),
> @@ -2725,6 +2726,17 @@ start_over:
> && !vect_verify_loop_lens (loop_vinfo))
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>
> + /* If we're vectorizing an loop that uses length "controls" and
> + can iterate more than once, we apply decrementing IV approach
> + in loop control. */
> + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> + && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> + && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> + && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
> + && LOOP_VINFO_INT_NITERS (loop_vinfo)
> + <= LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
It would be better to use known_le here, without checking whether the
VF is constant.
Thanks,
Richard
> + LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
> /* If we're vectorizing an epilogue loop, the vectorized loop either needs
> to be able to handle fewer than VF scalars, or needs to have a lower VF
> than the main loop. */
> @@ -10364,12 +10376,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
>
> tree
> -vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
> - unsigned int nvectors, unsigned int index)
> +vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> + vec_loop_lens *lens, unsigned int nvectors, tree vectype,
> + unsigned int index)
> {
> rgroup_controls *rgl = &(*lens)[nvectors - 1];
> bool use_bias_adjusted_len =
> LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
>
> /* Populate the rgroup's len array, if this is the first time we've
> used it. */
> @@ -10400,6 +10414,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
>
> if (use_bias_adjusted_len)
> return rgl->bias_adjusted_ctrl;
> + else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> + {
> + tree loop_len = rgl->controls[index];
> + poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> + poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> + if (maybe_ne (nunits1, nunits2))
> + {
> + /* A loop len for data type X can be reused for data type Y
> + if X has N times more elements than Y and if Y's elements
> + are N times bigger than X's. */
> + gcc_assert (multiple_p (nunits1, nunits2));
> + unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> + gimple_seq seq = NULL;
> + loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> + build_int_cst (iv_type, factor));
> + if (seq)
> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> + return loop_len;
> + }
> else
> return rgl->controls[index];
> }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 7313191b0db..b5e4bc59355 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8795,8 +8795,9 @@ vectorizable_store (vec_info *vinfo,
> else if (loop_lens)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies, vec_num * j + i);
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i);
> tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> machine_mode vmode = TYPE_MODE (vectype);
> opt_machine_mode new_ovmode
> @@ -10151,8 +10152,8 @@ vectorizable_load (vec_info *vinfo,
> else if (loop_lens && memory_access_type != VMAT_INVARIANT)
> {
> tree final_len
> - = vect_get_loop_len (loop_vinfo, loop_lens,
> - vec_num * ncopies,
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> vec_num * j + i);
> tree ptr = build_int_cst (ref_type,
> align * BITS_PER_UNIT);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 9cf2fb23fe3..8af3b35324e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,13 @@ public:
> the vector loop can handle fewer than VF scalars. */
> bool using_partial_vectors_p;
>
> + /* True if we've decided to use a decrementing loop control IV that counts
> + scalars. This can be done for any loop that:
> +
> + (a) uses length "controls"; and
> + (b) can iterate more than once. */
> + bool using_decrementing_iv_p;
> +
> /* True if we've decided to use partially-populated vectors for the
> epilogue of loop. */
> bool epil_using_partial_vectors_p;
> @@ -890,6 +897,7 @@ public:
> #define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
> #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
> #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
> (L)->epil_using_partial_vectors_p
> #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
> @@ -2293,8 +2301,9 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
> unsigned int, tree, unsigned int);
> extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> tree, unsigned int);
> -extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
> - unsigned int);
> +extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
> + vec_loop_lens *, unsigned int, tree,
> + unsigned int);
> extern gimple_seq vect_gen_len (tree, tree, tree, tree);
> extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
> extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> The examples are good, but this one made me wonder: why is the
>>> adjustment made to the limit (namely 16, the gap between _39 and _41)
>>> different from the limits imposed by the MIN_EXPR (32)? And I think
>>> the answer is that:
>
>>> - _47 counts the number of elements processed by the loop in total,
>>> including the vectors under the control of _44
>
>>> - _44 counts the number of elements controlled by _47 in the next
>>> iteration of the vector loop (if there is one)
>
>>> And that's needed to allow the IVs to be updated independently.
>
>>> The difficulty with this is that the len_load* and len_store*
>>> optabs currently say that the behaviour is undefined if the
>>> length argument is greater than the length of a vector.
>>> So I think using these values of _47 and _44 in the .LEN_STOREs
>>> is relying on undefined behaviour.
>
>>> Haven't had time to think about the consequences of that yet,
>>> but wanted to send something out sooner rather than later.
>
> Hi, Richard. I totally understand your concern now. I think the undefine behavior is more
> appropriate for RVV since we have vsetvli instruction that gurantee this will cause potential
> issues. However, for some other target, we may need to use additional MIN_EXPR to guard
> the length never over VF. I think it can be addressed in the future when it is needed.
But we can't generate (vector) gimple that has undefined behaviour from
(scalar) gimple that had defined behaviour. So something needs to change.
Either we need to generate a different sequence, or we need to define
what the behaviour of len_load/store/etc. are when the length is out of
range (perhaps under a target hook?).
We also need to be consistent. If case 2 is allowed to use length
parameters that are greater than the vector length, then there's no
reason for case 1 to use the result of the MIN_EXPR as the length
parameter. It could just use the loop IV directly. (I realise the
select_vl patch will change case 1 for RVV anyway. But the principle
still holds.)
What does the riscv backend's implementation of the len_load and
len_store guarantee? Is any length greater than the vector length
capped to the vector length? Or is it more complicated than that?
Thanks,
Richard
Hi, Richard.
>> But we can't generate (vector) gimple that has undefined behaviour from
>> (scalar) gimple that had defined behaviour. So something needs to change.
>> Either we need to generate a different sequence, or we need to define
>> what the behaviour of len_load/store/etc. are when the length is out of
>> range (perhaps under a target hook?).
To be safe for any targets, so we need add "MIN" to make the length never over length.
So for case 2 will be like this:
_44 = MIN_EXPR <ivtmp_42, 32>;_47 = MIN_EXPR <ivtmp_45, 32>;
...
_44_2 = MIN_EXPR <_44, 16>; ->>> add this MIN
....LEN_STORE (_6, 8B, _44_2, ...);...
I suddenly realize that it's better to add a MIN for it.But I am not sure whether we can have a better gimple IRthan it.
>> We also need to be consistent. If case 2 is allowed to use length
>> parameters that are greater than the vector length, then there's no
>> reason for case 1 to use the result of the MIN_EXPR as the length
>> parameter. It could just use the loop IV directly. (I realise the
>> select_vl patch will change case 1 for RVV anyway. But the principle
>> still holds.)
Oh, thanks for catching this. After thinking about your comments,
I suddenly realize that make length possible larger than VF will create
potential issues to RVV.
>> What does the riscv backend's implementation of the len_load and
>> len_store guarantee? Is any length greater than the vector length
>> capped to the vector length? Or is it more complicated than that?
For RVV, it is more complicated than that...
For RVV, we will emit vsetvli instruction first for len_load/len_store.
For example for case 1:
loop:
min a5, a4,16
vsetvli zero, a5...
load.
add.
store
a4 = a4 - a5...
Since we have min a5,a4,16 which will make the length always <= vf.
So it works fine for RVV.
However, if we do something like this (according to undefine behavior of len_load/len_store):
loop:
min a5, a4,16
vsetvli zero, a4...
load.
add.
store
a4 = a4 - a5...
Notice there is different here:
vsetvli zero, a4..., here we use "a4" instead of "a5", this will create an issue here:
Since according to RVV ISA, for vsetvli instruction:
vsetvli zero, a4..., if a4 <= vf, then, the length = vf
if vf < a4 <= 2 *vf, it will make length any value between [a4/2, vf] depending on the hardward design.
Since our current data reference pointer IV is added by VF (in bytes) by default.
Then it will be an issue.
So, may be for case 2 like your said, we should not involve undefine behavior into len_load/len_store,
instead, we should well handle loop control by add "MIN (16)" ?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 14:57
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> The examples are good, but this one made me wonder: why is the
>>> adjustment made to the limit (namely 16, the gap between _39 and _41)
>>> different from the limits imposed by the MIN_EXPR (32)? And I think
>>> the answer is that:
>
>>> - _47 counts the number of elements processed by the loop in total,
>>> including the vectors under the control of _44
>
>>> - _44 counts the number of elements controlled by _47 in the next
>>> iteration of the vector loop (if there is one)
>
>>> And that's needed to allow the IVs to be updated independently.
>
>>> The difficulty with this is that the len_load* and len_store*
>>> optabs currently say that the behaviour is undefined if the
>>> length argument is greater than the length of a vector.
>>> So I think using these values of _47 and _44 in the .LEN_STOREs
>>> is relying on undefined behaviour.
>
>>> Haven't had time to think about the consequences of that yet,
>>> but wanted to send something out sooner rather than later.
>
> Hi, Richard. I totally understand your concern now. I think the undefine behavior is more
> appropriate for RVV since we have vsetvli instruction that gurantee this will cause potential
> issues. However, for some other target, we may need to use additional MIN_EXPR to guard
> the length never over VF. I think it can be addressed in the future when it is needed.
But we can't generate (vector) gimple that has undefined behaviour from
(scalar) gimple that had defined behaviour. So something needs to change.
Either we need to generate a different sequence, or we need to define
what the behaviour of len_load/store/etc. are when the length is out of
range (perhaps under a target hook?).
We also need to be consistent. If case 2 is allowed to use length
parameters that are greater than the vector length, then there's no
reason for case 1 to use the result of the MIN_EXPR as the length
parameter. It could just use the loop IV directly. (I realise the
select_vl patch will change case 1 for RVV anyway. But the principle
still holds.)
What does the riscv backend's implementation of the len_load and
len_store guarantee? Is any length greater than the vector length
capped to the vector length? Or is it more complicated than that?
Thanks,
Richard
Hi, Richard.
For case 2, I come up with this idea:
+ Case 2 (SLP multiple rgroup):
+ ...
+ _38 = (unsigned long) n_12(D);
+ _39 = _38 * 2;
+ _40 = MAX_EXPR <_39, 16>;
+ _41 = _40 - 16;
+ ...
+ # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
+ # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
+ ...
+ _44 = MIN_EXPR <ivtmp_42, 32>;
+ _47 = MIN_EXPR <ivtmp_45, 32>;+ _47_2 = MIN_EXPR <_47, 16>; -------->add+ _47_3 = _47 - _47_2 ; --------> add
+ ...
+ .LEN_STORE (_6, 8B, _47_2, ...);
+ ...
+ .LEN_STORE (_25, 8B, _47_3, ...);
+ _33 = _47_2 / 2;
+ ...
+ .LEN_STORE (_8, 16B, _33, ...);
+ _36 = _47_3 / 2;
+ ...
+ .LEN_STORE (_15, 16B, _36, ...);
+ ivtmp_46 = ivtmp_45 - _47;
+ ivtmp_43 = ivtmp_42 - _44;
+ ...
+ if (ivtmp_46 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
Is it reasonable ? Or you do have better idea for it?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 14:57
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> The examples are good, but this one made me wonder: why is the
>>> adjustment made to the limit (namely 16, the gap between _39 and _41)
>>> different from the limits imposed by the MIN_EXPR (32)? And I think
>>> the answer is that:
>
>>> - _47 counts the number of elements processed by the loop in total,
>>> including the vectors under the control of _44
>
>>> - _44 counts the number of elements controlled by _47 in the next
>>> iteration of the vector loop (if there is one)
>
>>> And that's needed to allow the IVs to be updated independently.
>
>>> The difficulty with this is that the len_load* and len_store*
>>> optabs currently say that the behaviour is undefined if the
>>> length argument is greater than the length of a vector.
>>> So I think using these values of _47 and _44 in the .LEN_STOREs
>>> is relying on undefined behaviour.
>
>>> Haven't had time to think about the consequences of that yet,
>>> but wanted to send something out sooner rather than later.
>
> Hi, Richard. I totally understand your concern now. I think the undefine behavior is more
> appropriate for RVV since we have vsetvli instruction that gurantee this will cause potential
> issues. However, for some other target, we may need to use additional MIN_EXPR to guard
> the length never over VF. I think it can be addressed in the future when it is needed.
But we can't generate (vector) gimple that has undefined behaviour from
(scalar) gimple that had defined behaviour. So something needs to change.
Either we need to generate a different sequence, or we need to define
what the behaviour of len_load/store/etc. are when the length is out of
range (perhaps under a target hook?).
We also need to be consistent. If case 2 is allowed to use length
parameters that are greater than the vector length, then there's no
reason for case 1 to use the result of the MIN_EXPR as the length
parameter. It could just use the loop IV directly. (I realise the
select_vl patch will change case 1 for RVV anyway. But the principle
still holds.)
What does the riscv backend's implementation of the len_load and
len_store guarantee? Is any length greater than the vector length
capped to the vector length? Or is it more complicated than that?
Thanks,
Richard
Oh,
I am sorry for incorrect typos in the last email, fix typos :
Hi, Richard.
For case 2, I come up with this idea:
+ Case 2 (SLP multiple rgroup):
+ ...
+ _38 = (unsigned long) n_12(D);
+ _39 = _38 * 2;
+ _40 = MAX_EXPR <_39, 16>; ----------------->remove
+ _41 = _40 - 16; ----------------->remove
+ ...
+ # ivtmp_42 = PHI <ivtmp_43(4), _41(3)> ----------------->remove
+ # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
+ ...
+ _44 = MIN_EXPR <ivtmp_42, 32>; ----------------->remove
+ _47 = MIN_EXPR <ivtmp_45, 32>;+ _47_2 = MIN_EXPR <_47, 16>; -------->add+ _47_3 = _47 - _47_2 ; --------> add
+ ...
+ .LEN_STORE (_6, 8B, _47_2, ...);
+ ...
+ .LEN_STORE (_25, 8B, _47_3, ...);
+ _33 = _47_2 / 2;
+ ...
+ .LEN_STORE (_8, 16B, _33, ...);
+ _36 = _47_3 / 2;
+ ...
+ .LEN_STORE (_15, 16B, _36, ...);
+ ivtmp_46 = ivtmp_45 - _47;
+ ivtmp_43 = ivtmp_42 - _44; ----------------->remove
+ ...
+ if (ivtmp_46 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
Is it reasonable ? Or you do have better idea for it?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 14:57
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>>> The examples are good, but this one made me wonder: why is the
>>> adjustment made to the limit (namely 16, the gap between _39 and _41)
>>> different from the limits imposed by the MIN_EXPR (32)? And I think
>>> the answer is that:
>
>>> - _47 counts the number of elements processed by the loop in total,
>>> including the vectors under the control of _44
>
>>> - _44 counts the number of elements controlled by _47 in the next
>>> iteration of the vector loop (if there is one)
>
>>> And that's needed to allow the IVs to be updated independently.
>
>>> The difficulty with this is that the len_load* and len_store*
>>> optabs currently say that the behaviour is undefined if the
>>> length argument is greater than the length of a vector.
>>> So I think using these values of _47 and _44 in the .LEN_STOREs
>>> is relying on undefined behaviour.
>
>>> Haven't had time to think about the consequences of that yet,
>>> but wanted to send something out sooner rather than later.
>
> Hi, Richard. I totally understand your concern now. I think the undefine behavior is more
> appropriate for RVV since we have vsetvli instruction that gurantee this will cause potential
> issues. However, for some other target, we may need to use additional MIN_EXPR to guard
> the length never over VF. I think it can be addressed in the future when it is needed.
But we can't generate (vector) gimple that has undefined behaviour from
(scalar) gimple that had defined behaviour. So something needs to change.
Either we need to generate a different sequence, or we need to define
what the behaviour of len_load/store/etc. are when the length is out of
range (perhaps under a target hook?).
We also need to be consistent. If case 2 is allowed to use length
parameters that are greater than the vector length, then there's no
reason for case 1 to use the result of the MIN_EXPR as the length
parameter. It could just use the loop IV directly. (I realise the
select_vl patch will change case 1 for RVV anyway. But the principle
still holds.)
What does the riscv backend's implementation of the len_load and
len_store guarantee? Is any length greater than the vector length
capped to the vector length? Or is it more complicated than that?
Thanks,
Richard
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Oh,
> I am sorry for incorrect typos in the last email, fix typos :
>
> Hi, Richard.
> For case 2, I come up with this idea:
> + Case 2 (SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + _39 = _38 * 2;
> + _40 = MAX_EXPR <_39, 16>; ----------------->remove
> + _41 = _40 - 16; ----------------->remove
>
> + ...
> + # ivtmp_42 = PHI <ivtmp_43(4), _41(3)> ----------------->remove
>
> + # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
> + ...
> + _44 = MIN_EXPR <ivtmp_42, 32>; ----------------->remove
>
> + _47 = MIN_EXPR <ivtmp_45, 32>;+ _47_2 = MIN_EXPR <_47, 16>; -------->add+ _47_3 = _47 - _47_2 ; --------> add
> + ...
> + .LEN_STORE (_6, 8B, _47_2, ...);
> + ...
> + .LEN_STORE (_25, 8B, _47_3, ...);
> + _33 = _47_2 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = _47_3 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_46 = ivtmp_45 - _47;
> + ivtmp_43 = ivtmp_42 - _44; ----------------->remove
>
> + ...
> + if (ivtmp_46 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
> Is it reasonable ? Or you do have better idea for it?
Yeah, this makes sense, and I think it makes case 2 very similar
(equivalent?) to case 3. If so, it would be nice if they could be
combined.
Of course, this loses the nice property that the original had: that each
IV was independent, and so the dependency chains were shorter. With the
above approach, the second length parameter instead depends on a
three-instruction chain. But that might be OK (up to you).
How much of the riscv backend infrastructure is in place now? The reason
I ask is that it would be good if the patch had some tests. AIUI, the
patch is an optimisation on top of what the current len_load/store code does,
rather than something that is needed for correctness. So it seems like
the necessary patterns could be added and tested using the current approach,
then this patch could be applied on top, with its own tests for the new
approach.
Thanks,
Richard
Hi, Richard.
RVV infrastructure in RISC-V backend status:
1. All RVV instructions pattern related to intrinsics are all finished (They will be called not only by intrinsics but also autovec in the future).
2. In case of autovec, we finished len_load/len_store (They are temporary used and will be removed after I support len_mask_load/len_mask_store in the middle-end).
binary integer autovec patterns.
vec_init pattern.
That's all we have so far.
In case of testing of this patch, I have multiple rgroup testcases in local, you mean you want me to post them together with this patch?
Since I am gonna to put them in RISC-V backend testsuite, I was planning to post them after this patch is finished and merged into trunk.
What do you suggest ?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 16:16
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Oh,
> I am sorry for incorrect typos in the last email, fix typos :
>
> Hi, Richard.
> For case 2, I come up with this idea:
> + Case 2 (SLP multiple rgroup):
> + ...
> + _38 = (unsigned long) n_12(D);
> + _39 = _38 * 2;
> + _40 = MAX_EXPR <_39, 16>; ----------------->remove
> + _41 = _40 - 16; ----------------->remove
>
> + ...
> + # ivtmp_42 = PHI <ivtmp_43(4), _41(3)> ----------------->remove
>
> + # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
> + ...
> + _44 = MIN_EXPR <ivtmp_42, 32>; ----------------->remove
>
> + _47 = MIN_EXPR <ivtmp_45, 32>;+ _47_2 = MIN_EXPR <_47, 16>; -------->add+ _47_3 = _47 - _47_2 ; --------> add
> + ...
> + .LEN_STORE (_6, 8B, _47_2, ...);
> + ...
> + .LEN_STORE (_25, 8B, _47_3, ...);
> + _33 = _47_2 / 2;
> + ...
> + .LEN_STORE (_8, 16B, _33, ...);
> + _36 = _47_3 / 2;
> + ...
> + .LEN_STORE (_15, 16B, _36, ...);
> + ivtmp_46 = ivtmp_45 - _47;
> + ivtmp_43 = ivtmp_42 - _44; ----------------->remove
>
> + ...
> + if (ivtmp_46 != 0)
> + goto <bb 4>; [83.33%]
> + else
> + goto <bb 5>; [16.67%]
> Is it reasonable ? Or you do have better idea for it?
Yeah, this makes sense, and I think it makes case 2 very similar
(equivalent?) to case 3. If so, it would be nice if they could be
combined.
Of course, this loses the nice property that the original had: that each
IV was independent, and so the dependency chains were shorter. With the
above approach, the second length parameter instead depends on a
three-instruction chain. But that might be OK (up to you).
How much of the riscv backend infrastructure is in place now? The reason
I ask is that it would be good if the patch had some tests. AIUI, the
patch is an optimisation on top of what the current len_load/store code does,
rather than something that is needed for correctness. So it seems like
the necessary patterns could be added and tested using the current approach,
then this patch could be applied on top, with its own tests for the new
approach.
Thanks,
Richard
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.
>
> RVV infrastructure in RISC-V backend status:
> 1. All RVV instructions pattern related to intrinsics are all finished (They will be called not only by intrinsics but also autovec in the future).
> 2. In case of autovec, we finished len_load/len_store (They are temporary used and will be removed after I support len_mask_load/len_mask_store in the middle-end).
> binary integer autovec patterns.
> vec_init pattern.
> That's all we have so far.
Thanks.
> In case of testing of this patch, I have multiple rgroup testcases in local, you mean you want me to post them together with this patch?
> Since I am gonna to put them in RISC-V backend testsuite, I was planning to post them after this patch is finished and merged into trunk.
> What do you suggest ?
It would be useful to include the tests with the patch itself (as a patch
to the testsuite). It doesn't matter that the tests are riscv-specific.
Obviously it would be more appropriate for the riscv maintainers to
review the riscv tests. But keeping the tests with the patch helps when
reviewing the code, and also ensures that code is committed and never
later tested.
Richard
Hi, Richard.
I have sent V10:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618718.html
I can't combine implementation Case 2 and Case 3, Case 2 each control (len) are coming from same rgc.
But Case 3 each control (len) are coming coming from different rgc.
Can you help me with that ?
Also, I have append my testcases too in this patch too.
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 16:30
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.
>
> RVV infrastructure in RISC-V backend status:
> 1. All RVV instructions pattern related to intrinsics are all finished (They will be called not only by intrinsics but also autovec in the future).
> 2. In case of autovec, we finished len_load/len_store (They are temporary used and will be removed after I support len_mask_load/len_mask_store in the middle-end).
> binary integer autovec patterns.
> vec_init pattern.
> That's all we have so far.
Thanks.
> In case of testing of this patch, I have multiple rgroup testcases in local, you mean you want me to post them together with this patch?
> Since I am gonna to put them in RISC-V backend testsuite, I was planning to post them after this patch is finished and merged into trunk.
> What do you suggest ?
It would be useful to include the tests with the patch itself (as a patch
to the testsuite). It doesn't matter that the tests are riscv-specific.
Obviously it would be more appropriate for the riscv maintainers to
review the riscv tests. But keeping the tests with the patch helps when
reviewing the code, and also ensures that code is committed and never
later tested.
Richard
Hi, Richard and Richi.
I am so sorry for sending you garbage patches (My mistake, sending RISC-V patches to you).
I finally realize that Case 2 and Case 3 are totally the same sequence!
I have combined them into single function called "vect_adjust_loop_lens_control"
I have sent V11 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618724.html
I think this patch is the reasonable patch now!
Could you take a look at it?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 16:30
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.
>
> RVV infrastructure in RISC-V backend status:
> 1. All RVV instructions pattern related to intrinsics are all finished (They will be called not only by intrinsics but also autovec in the future).
> 2. In case of autovec, we finished len_load/len_store (They are temporary used and will be removed after I support len_mask_load/len_mask_store in the middle-end).
> binary integer autovec patterns.
> vec_init pattern.
> That's all we have so far.
Thanks.
> In case of testing of this patch, I have multiple rgroup testcases in local, you mean you want me to post them together with this patch?
> Since I am gonna to put them in RISC-V backend testsuite, I was planning to post them after this patch is finished and merged into trunk.
> What do you suggest ?
It would be useful to include the tests with the patch itself (as a patch
to the testsuite). It doesn't matter that the tests are riscv-specific.
Obviously it would be more appropriate for the riscv maintainers to
review the riscv tests. But keeping the tests with the patch helps when
reviewing the code, and also ensures that code is committed and never
later tested.
Richard
Hi, Richard. Forget about V10 patch. Just go directly V11 patch.
I am so sorry that I send V10 since I originally did not notice Case 2 and Case 3 are totally the same.
I apologize for that. I have reviewed V11 patch twice, it seems that this patch is much more reasonable and better understanding than before.
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-05-16 16:30
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V7] VECT: Add decrement IV support in Loop Vectorizer
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.
>
> RVV infrastructure in RISC-V backend status:
> 1. All RVV instructions pattern related to intrinsics are all finished (They will be called not only by intrinsics but also autovec in the future).
> 2. In case of autovec, we finished len_load/len_store (They are temporary used and will be removed after I support len_mask_load/len_mask_store in the middle-end).
> binary integer autovec patterns.
> vec_init pattern.
> That's all we have so far.
Thanks.
> In case of testing of this patch, I have multiple rgroup testcases in local, you mean you want me to post them together with this patch?
> Since I am gonna to put them in RISC-V backend testsuite, I was planning to post them after this patch is finished and merged into trunk.
> What do you suggest ?
It would be useful to include the tests with the patch itself (as a patch
to the testsuite). It doesn't matter that the tests are riscv-specific.
Obviously it would be more appropriate for the riscv maintainers to
review the riscv tests. But keeping the tests with the patch helps when
reviewing the code, and also ensures that code is committed and never
later tested.
Richard
@@ -385,6 +385,58 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
return false;
}
+/* Try to use adjust loop lens for non-SLP multiple-rgroups.
+
+ _36 = MIN_EXPR <ivtmp_34, VF>;
+
+ First length (MIN (X, VF/N)):
+ loop_len_15 = MIN_EXPR <_36, VF/N>;
+
+ Second length:
+ tmp = _36 - loop_len_15;
+ loop_len_16 = MIN (tmp, VF/N);
+
+ Third length:
+ tmp2 = tmp - loop_len_16;
+ loop_len_17 = MIN (tmp2, VF/N);
+
+ Forth length:
+ tmp3 = tmp2 - loop_len_17;
+ loop_len_18 = MIN (tmp3, VF/N); */
+
+static void
+vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls *dest_rgm,
+ rgroup_controls *src_rgm)
+{
+ tree ctrl_type = dest_rgm->type;
+ poly_uint64 nitems_per_ctrl
+ = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+
+ for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+ {
+ tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
+ tree dest = dest_rgm->controls[i];
+ tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
+ gassign *stmt;
+ if (i == 0)
+ {
+ /* MIN (X, VF*I/N) capped to the range [0, VF/N]. */
+ stmt = gimple_build_assign (dest, MIN_EXPR, src, length_limit);
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ else
+ {
+ /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N]. */
+ tree temp = make_ssa_name (iv_type);
+ stmt = gimple_build_assign (temp, MINUS_EXPR, src,
+ dest_rgm->controls[i - 1]);
+ gimple_seq_add_stmt (seq, stmt);
+ stmt = gimple_build_assign (dest, MIN_EXPR, temp, length_limit);
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ }
+}
+
/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
for all the rgroup controls in RGC and return a control that is nonzero
when the loop needs to iterate. Add any new preheader statements to
@@ -468,9 +520,10 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
gimple_stmt_iterator incr_gsi;
bool insert_after;
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
- create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
- loop, &incr_gsi, insert_after, &index_before_incr,
- &index_after_incr);
+ if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+ create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
+ loop, &incr_gsi, insert_after, &index_before_incr,
+ &index_after_incr);
tree zero_index = build_int_cst (compare_type, 0);
tree test_index, test_limit, first_limit;
@@ -552,8 +605,13 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
/* Convert the IV value to the comparison type (either a no-op or
a demotion). */
gimple_seq test_seq = NULL;
- test_index = gimple_convert (&test_seq, compare_type, test_index);
- gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+ test_limit = gimple_convert (preheader_seq, iv_type, nitems_total);
+ else
+ {
+ test_index = gimple_convert (&test_seq, compare_type, test_index);
+ gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
+ }
/* Provide a definition of each control in the group. */
tree next_ctrl = NULL_TREE;
@@ -587,6 +645,101 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
bias_tree);
}
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+ {
+ /* Create decrement IV:
+
+ Case 1 (single rgroup):
+ ...
+ _10 = (unsigned long) count_12(D);
+ ...
+ # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
+ _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
+ ...
+ vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
+ ...
+ ivtmp_35 = ivtmp_9 - _36;
+ ...
+ if (ivtmp_35 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
+
+ Case 2 (SLP multiple rgroup):
+ ...
+ _38 = (unsigned long) n_12(D);
+ _39 = _38 * 2;
+ _40 = MAX_EXPR <_39, 16>;
+ _41 = _40 + 18446744073709551600;
+ ...
+ # ivtmp_42 = PHI <ivtmp_43(4), _41(3)>
+ # ivtmp_45 = PHI <ivtmp_46(4), _39(3)>
+ ...
+ _44 = MIN_EXPR <ivtmp_42, 32>;
+ _47 = MIN_EXPR <ivtmp_45, 32>;
+ ...
+ .LEN_STORE (_6, 8B, _47, ...);
+ ...
+ .LEN_STORE (_25, 8B, _44, ...);
+ _33 = _47 / 2;
+ ...
+ .LEN_STORE (_8, 16B, _33, ...);
+ _36 = _44 / 2;
+ ...
+ .LEN_STORE (_15, 16B, _36, ...);
+ ivtmp_46 = ivtmp_45 - _47;
+ ivtmp_43 = ivtmp_42 - _44;
+ ...
+ if (ivtmp_46 != 0)
+ goto <bb 4>; [83.33%]
+ else
+ goto <bb 5>; [16.67%]
+
+ Case 3 (non-SLP multiple rgroup):
+ ...
+ _38 = (unsigned long) n_12(D);
+ ...
+ # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
+ ...
+ _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
+ loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
+ _41 = _40 - loop_len_21;
+ loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
+ _42 = _40 - loop_len_20;
+ loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
+ _43 = _40 - loop_len_19;
+ loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
+ ...
+ vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
+ ...
+ vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
+ ...
+ vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
+ ...
+ vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
+ vect__7.13_31 = VEC_PACK_TRUNC_EXPR <vect__4.8_15, vect__4.9_8>;
+ vect__7.13_32 = VEC_PACK_TRUNC_EXPR <vect__4.10_28, vect__4.11_30>;
+ vect__7.12_33 = VEC_PACK_TRUNC_EXPR <vect__7.13_31, vect__7.13_32>;
+ ...
+ .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
+ ivtmp_39 = ivtmp_38 - _40;
+ ...
+ if (ivtmp_39 != 0)
+ goto <bb 3>; [92.31%]
+ else
+ goto <bb 4>; [7.69%]
+ */
+ create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop,
+ &incr_gsi, insert_after, &index_before_incr,
+ &index_after_incr);
+ tree step = gimple_build (header_seq, MIN_EXPR, iv_type,
+ index_before_incr, nitems_step);
+ gassign *assign = gimple_build_assign (ctrl, step);
+ gimple_seq_add_stmt (header_seq, assign);
+ next_ctrl = index_after_incr;
+ continue;
+ }
+
/* Create the initial control. First include all items that
are within the loop limit. */
tree init_ctrl = NULL_TREE;
@@ -704,6 +857,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
unsigned int compare_precision = TYPE_PRECISION (compare_type);
tree orig_niters = niters;
@@ -753,6 +907,19 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
continue;
}
+ if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
+ && rgc->max_nscalars_per_iter == 1
+ && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+ {
+ rgroup_controls *sub_rgc
+ = &(*controls)[nmasks / rgc->controls.length () - 1];
+ if (!sub_rgc->controls.is_empty ())
+ {
+ vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
+ continue;
+ }
+ }
+
/* See whether zero-based IV would ever generate all-false masks
or zero length before wrapping around. */
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
@@ -973,6 +973,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
vectorizable (false),
can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
using_partial_vectors_p (false),
+ using_decrementing_iv_p (false),
epil_using_partial_vectors_p (false),
partial_load_store_bias (0),
peeling_for_gaps (false),
@@ -2725,6 +2726,17 @@ start_over:
&& !vect_verify_loop_lens (loop_vinfo))
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ /* If we're vectorizing an loop that uses length "controls" and
+ can iterate more than once, we apply decrementing IV approach
+ in loop control. */
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+ && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
+ && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
+ && LOOP_VINFO_INT_NITERS (loop_vinfo)
+ <= LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
+ LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
+
/* If we're vectorizing an epilogue loop, the vectorized loop either needs
to be able to handle fewer than VF scalars, or needs to have a lower VF
than the main loop. */
@@ -10364,12 +10376,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
- unsigned int nvectors, unsigned int index)
+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+ vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+ unsigned int index)
{
rgroup_controls *rgl = &(*lens)[nvectors - 1];
bool use_bias_adjusted_len =
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
/* Populate the rgroup's len array, if this is the first time we've
used it. */
@@ -10400,6 +10414,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
if (use_bias_adjusted_len)
return rgl->bias_adjusted_ctrl;
+ else if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
+ {
+ tree loop_len = rgl->controls[index];
+ poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+ poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+ if (maybe_ne (nunits1, nunits2))
+ {
+ /* A loop len for data type X can be reused for data type Y
+ if X has N times more elements than Y and if Y's elements
+ are N times bigger than X's. */
+ gcc_assert (multiple_p (nunits1, nunits2));
+ unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+ gimple_seq seq = NULL;
+ loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+ build_int_cst (iv_type, factor));
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ return loop_len;
+ }
else
return rgl->controls[index];
}
@@ -8795,8 +8795,9 @@ vectorizable_store (vec_info *vinfo,
else if (loop_lens)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies, vec_num * j + i);
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
machine_mode vmode = TYPE_MODE (vectype);
opt_machine_mode new_ovmode
@@ -10151,8 +10152,8 @@ vectorizable_load (vec_info *vinfo,
else if (loop_lens && memory_access_type != VMAT_INVARIANT)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies,
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
vec_num * j + i);
tree ptr = build_int_cst (ref_type,
align * BITS_PER_UNIT);
@@ -818,6 +818,13 @@ public:
the vector loop can handle fewer than VF scalars. */
bool using_partial_vectors_p;
+ /* True if we've decided to use a decrementing loop control IV that counts
+ scalars. This can be done for any loop that:
+
+ (a) uses length "controls"; and
+ (b) can iterate more than once. */
+ bool using_decrementing_iv_p;
+
/* True if we've decided to use partially-populated vectors for the
epilogue of loop. */
bool epil_using_partial_vectors_p;
@@ -890,6 +897,7 @@ public:
#define LOOP_VINFO_VECTORIZABLE_P(L) (L)->vectorizable
#define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
(L)->epil_using_partial_vectors_p
#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
@@ -2293,8 +2301,9 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
- unsigned int);
+extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
+ vec_loop_lens *, unsigned int, tree,
+ unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);