VECT: Add decrement IV iteration loop control by variable amount support
Checks
Commit Message
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Hi, this is the most important patch for RVV auto-vectorization support.
It supports WHILE_LEN pattern to not only decrement Loop control IV counter,
but also adjust data reference address pointer by WHILE_LEN.
1. Single control loop (vec_num == 1 && ncopies == 1):
int A[1024], B[1024];
void foo(int n)
{
for (int i = 0; i < n; i++)
A[i] = B[i];
}
-fno-vect-cost-model -fno-tree-loop-distribute-patterns:
Gimple IR:
# vectp_B.6_8 = PHI <vectp_B.6_13(6), &B(5)>
# vectp_B.8_16 = PHI <vectp_B.8_17(6), &B(5)>
# vectp_A.11_19 = PHI <vectp_A.11_20(6), &A(5)>
# vectp_A.13_22 = PHI <vectp_A.13_23(6), &A(5)>
# ivtmp_26 = PHI <ivtmp_27(6), _25(5)>
_28 = .WHILE_LEN (ivtmp_26, POLY_INT_CST [4, 4]);
ivtmp_15 = _28 * 4;
vect__1.10_18 = .LEN_LOAD (vectp_B.8_16, 128B, _28, 0);
_1 = B[i_10];
.LEN_STORE (vectp_A.13_22, 128B, _28, vect__1.10_18, 0);
i_7 = i_10 + 1;
vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
ivtmp_27 = ivtmp_26 - _28;
if (ivtmp_27 != 0)
goto <bb 6>; [83.33%]
else
goto <bb 7>; [16.67%]
The WHILE_LEN:
_28 = .WHILE_LEN (ivtmp_26, POLY_INT_CST [4, 4]);
Data address pointer IVs:
ivtmp_15 = _28 * 4;
vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
Loop control IVs:
ivtmp_27 = ivtmp_26 - _28;
if (ivtmp_27 != 0)
goto <bb 6>; [83.33%]
else
goto <bb 7>; [16.67%]
So, in this case, we can have non-VF elements to be processed
in non-final iteration.
Some target like RVV(according to vsetvli instruction in RVV ISA),
we can allow from iteration 0 to n - 2 update VF elements, and
iteration n - 1 and n update the remain elements / 2.
Such situation will make RVV CPU even distribute workload in the last
2 iterations.
So, we define WHILE_LEN as any iteration can process any number (<=VF) elements
in any iteration.
2. Multi control loop (SLP, vec_num != 1 && ncopies != 1):
void
foo0 (int16_t *__restrict f, int32_t *__restrict d, int n)
{
for (int i = 0; i < n; ++i)
{
f[i * 2 + 0] = 1;
f[i * 2 + 1] = 2;
d[i] = 3;
}
}
Gimple IR:
# i_23 = PHI <i_20(6), 0(11)>
# vectp_f.8_51 = PHI <vectp_f.8_52(6), f_15(D)(11)>
# vectp_d.10_59 = PHI <vectp_d.10_60(6), d_18(D)(11)>
# ivtmp_70 = PHI <ivtmp_71(6), _69(11)>
# ivtmp_73 = PHI <ivtmp_74(6), _67(11)>
_72 = MIN_EXPR <ivtmp_70, 16>; Force VF elements to be processed in SLP
_75 = MIN_EXPR <ivtmp_73, 16>;
_1 = i_23 * 2;
_2 = (long unsigned int) _1;
_3 = _2 * 2;
_4 = f_15(D) + _3;
_5 = _2 + 1;
_6 = _5 * 2;
_7 = f_15(D) + _6;
.LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
vectp_f.8_56 = vectp_f.8_51 + 16;
.LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
_8 = (long unsigned int) i_23;
_9 = _8 * 4;
_10 = d_18(D) + _9;
_61 = _75 / 2;
.LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
vectp_d.10_63 = vectp_d.10_59 + 16;
_64 = _72 / 2;
.LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
i_20 = i_23 + 1;
vectp_f.8_52 = vectp_f.8_56 + 16;
vectp_d.10_60 = vectp_d.10_63 + 16;
ivtmp_74 = ivtmp_73 - _75;
ivtmp_71 = ivtmp_70 - _72;
if (ivtmp_74 != 0)
goto <bb 6>; [83.33%]
else
goto <bb 13>; [16.67%]
For SLP auto-vectorization, we don't use WHILE_LEN to do the variable adjustment
for loop control counter and data reference pointer IVs. We make them still use VF
for the IVs iterations.
Here is the reason:
.LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
vectp_f.8_56 = vectp_f.8_51 + 16;
.LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
The sequence listed above, we still force compiler SLP auto-vectorization using VF
elements to be processed. Since if we use WHILE_LEN to iterate IVs in SLP we will
have problems:
Since WHILE_ELN pattern allows us update any number of elements of any iteration
which is not suitable for SLP auto-vectorization:
...
_76 = WHILE_LEN (AVL)
.LEN_STORE (vectp_f.8_51, 128B, _76, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
...
.LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
If AVL is 6 (n * 2), and VF is 4, then if we flexible allow _76 to be 3 which is AVL / 2,
then the second store will be a problem, it can not store { 1, 2, 1, 2, 1, 2, 1, 2 },
It should store { 2, 1, 2, 1, 2, 1, 2, 1 }
gcc/ChangeLog:
* doc/md.texi: Add WHILE_LEN pattern support.
* internal-fn.cc (while_len_direct): New function.
(expand_while_len_optab_fn): Ditto.
(direct_while_len_optab_supported_p): Ditto.
* internal-fn.def (WHILE_LEN): New pattern.
* optabs.def (OPTAB_D): New pattern.
* tree-ssa-loop-manip.cc (create_iv): Add decrement IV support.
* tree-ssa-loop-manip.h (create_iv): Ditto.
* tree-vect-loop-manip.cc (vect_set_loop_controls_by_while_len): New function.
(vect_set_loop_condition_partial_vectors): Add WHILE_LEN support.
* tree-vect-loop.cc (vect_get_loop_len): Ditto.
* tree-vect-stmts.cc (get_while_len_data_ref_ptr): New function.
(vectorizable_store): Adjust data pointer by WHILE_LEN.
(vectorizable_load): Ditto.
* tree-vectorizer.h (vect_get_loop_len): Adjust for SLP vectorizer.
---
gcc/doc/md.texi | 34 ++++++
gcc/internal-fn.cc | 29 +++++
gcc/internal-fn.def | 1 +
gcc/optabs.def | 1 +
gcc/tree-ssa-loop-manip.cc | 4 +-
gcc/tree-ssa-loop-manip.h | 2 +-
gcc/tree-vect-loop-manip.cc | 223 +++++++++++++++++++++++++++++++++++-
gcc/tree-vect-loop.cc | 27 ++++-
gcc/tree-vect-stmts.cc | 91 ++++++++++++++-
gcc/tree-vectorizer.h | 4 +-
10 files changed, 399 insertions(+), 17 deletions(-)
Comments
juzhe.zhong@rivai.ai writes:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6e81dc05e0e..5f44def90d3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -127,6 +127,7 @@ init_internal_fns ()
> #define cond_binary_direct { 1, 1, true }
> #define cond_ternary_direct { 1, 1, true }
> #define while_direct { 0, 2, false }
> +#define while_len_direct { 0, 0, false }
> #define fold_extract_direct { 2, 2, false }
> #define fold_left_direct { 1, 1, false }
> #define mask_fold_left_direct { 1, 1, false }
> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> emit_move_insn (lhs_rtx, ops[0].value);
> }
>
> +/* Expand WHILE_LEN call STMT using optab OPTAB. */
> +static void
> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> + expand_operand ops[3];
> + tree rhs_type[2];
> +
> + tree lhs = gimple_call_lhs (stmt);
> + tree lhs_type = TREE_TYPE (lhs);
> + rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> + create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> +
> + for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> + {
> + tree rhs = gimple_call_arg (stmt, i);
> + rhs_type[i] = TREE_TYPE (rhs);
> + rtx rhs_rtx = expand_normal (rhs);
> + create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> + }
> +
> + insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> +
> + expand_insn (icode, 3, ops);
> + if (!rtx_equal_p (lhs_rtx, ops[0].value))
> + emit_move_insn (lhs_rtx, ops[0].value);
> +}
Is this ifn-specific handling needed? From the description, it seems
like WHILE_LEN could be a normal binary internal function.
> +
> /* Expand a call to a convert-like optab using the operands in STMT.
> FN has a single output operand and NARGS input operands. */
>
> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> #define direct_len_store_optab_supported_p direct_optab_supported_p
> #define direct_while_optab_supported_p convert_optab_supported_p
> +#define direct_while_len_optab_supported_p direct_optab_supported_p
> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> #define direct_fold_left_optab_supported_p direct_optab_supported_p
> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..3a933abff5d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>
> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> check_raw_ptrs, check_ptrs)
> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..f5938bd2c24 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> OPTAB_D (len_load_optab, "len_load_$a")
> OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (while_len_optab, "while_len$a")
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index a52277abdbf..54845a62298 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> void
> create_iv (tree base, tree step, tree var, class loop *loop,
> gimple_stmt_iterator *incr_pos, bool after,
> - tree *var_before, tree *var_after)
> + tree *var_before, tree *var_after, enum tree_code code)
The comment needs to be updated to describe the new interface.
This is personal preference, but: I think the interface would be
clearer if the code argument came between the base and step,
so that the order matches a SCEV. The code could no longer be
a default argument, and so all callers would need to be updated,
but IMO that's OK. Not sure what others think though.
> {
> gassign *stmt;
> gphi *phi;
> tree initial, step1;
> gimple_seq stmts;
> tree vb, va;
> - enum tree_code incr_op = PLUS_EXPR;
> + enum tree_code incr_op = code;
> edge pe = loop_preheader_edge (loop);
>
> if (var != NULL_TREE)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..da755320a3a 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3. If not see
> typedef void (*transform_callback)(class loop *, void *);
>
> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> - bool, tree *, tree *);
> + bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f60fa50e8f4..a1c892f285a 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -682,6 +682,210 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> return next_ctrl;
> }
>
> +/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
> + for all the rgroup controls in RGC and return a control that is nonzero
> + when the loop needs to iterate. Add any new preheader statements to
> + PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
> +
> + RGC belongs to loop LOOP. The loop originally iterated NITERS
> + times and has been vectorized according to LOOP_VINFO.
> +
> + Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> + to TEST_LIMIT - bias.
> +
> + In vect_set_loop_controls_by_while_len, we are iterating from start at
> + IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> + IFN_WHILE_LEN pattern.
> +
> + Note: the cost of the code generated by this function is modeled
> + by vect_estimate_min_profitable_iters, so changes here may need
> + corresponding changes there.
The patch doesn't seem to change vect_estimate_min_profitable_iters though,
so the comment doesn't seem accurate.
> +
> + 1. Single rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _19 = (unsigned long) n_5(D);
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> + ...
> + _22 = .WHILE_LEN (ivtmp_20, vf);
> + ...
> + vector statement (use _22);
> + ...
> + ivtmp_21 = ivtmp_20 - _22;
> + ...
> + if (ivtmp_21 != 0)
> + goto <bb 4>; [75.00%]
> + else
> + goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> + Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> + underflow 0.
> +
> + 2. Multiple rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _70 = (unsigned long) bnd.7_52;
> + _71 = _70 * 2;
> + _72 = MAX_EXPR <_71, 4>;
> + _73 = _72 + 18446744073709551612;
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> + ...
> + vector statement (use _79);
> + ...
> + vector statement (use _76);
> + ...
> + _65 = _79 / 2;
> + vector statement (use _65);
> + ...
> + _68 = _76 / 2;
> + vector statement (use _68);
> + ...
> + ivtmp_78 = ivtmp_77 - _79;
> + ivtmp_75 = ivtmp_74 - _76;
> + ...
> + if (ivtmp_78 != 0)
> + goto <bb 4>; [75.00%]
> + else
> + goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +*/
I'm not sure it's safe to handle multiple rgroups like this.
For example, if a loop operates on both int32_ts and int64_ts, and uses
fully-packed vectors for both data types, there will be two int64_t
vectors per int32_t vector (since the total number of lanes has to
be the same for both types). The int32_t vectors would be controlled
by one rgroup (with one control) and the int64_t vectors would
be controlled by a different rgroup (with two controls).
But it's critical that the two rgroups process the same number
of scalars every iteration, and that they "agree" on which
elements are being processed.
So if the int32_t length is some value X, where X<=VF, then the int32_t
control will enable the processing of X consecutive elements, starting
at the first lane. It's then critical that the int64_t control enables
processing of those same elements. If VF/2<=X<=VF then the first
int64_t control must include all VF/2 elements and the second must
include exactly the first X-VF/2 elements.
I don't think that's guaranteed by the proposed definition of WHILE_LEN.
The first int64_t WHILE_LEN could come up short, and return something
less than VF/2.
Thanks,
Richard
On 25 April 2023 18:58:10 CEST, Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>juzhe.zhong@rivai.ai writes:
>> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
>> index 6e81dc05e0e..5f44def90d3 100644
>> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
>> index a52277abdbf..54845a62298 100644
>> --- a/gcc/tree-ssa-loop-manip.cc
>> +++ b/gcc/tree-ssa-loop-manip.cc
>> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
>> void
>> create_iv (tree base, tree step, tree var, class loop *loop,
>> gimple_stmt_iterator *incr_pos, bool after,
>> - tree *var_before, tree *var_after)
>> + tree *var_before, tree *var_after, enum tree_code code)
>
>The comment needs to be updated to describe the new interface.
Just cosmetics, but please omit the redundant "enum".
>
>This is personal preference, but: I think the interface would be
>clearer if the code argument came between the base and step,
>so that the order matches a SCEV. The code could no longer be
>a default argument, and so all callers would need to be updated,
>but IMO that's OK. Not sure what others think though.
>
>> {
>> gassign *stmt;
>> gphi *phi;
>> tree initial, step1;
>> gimple_seq stmts;
>> tree vb, va;
>> - enum tree_code incr_op = PLUS_EXPR;
>> + enum tree_code incr_op = code;
Likewise here. If you touch such lines, please omit the redundant "enum".
I don't have an opinion on the patch itself.
thanks,
Thanks Richard so much.
>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>> The first int64_t WHILE_LEN could come up short, and return something
>> less than VF/2.
I am so sorry that the comments of vect_set_loop_controls_by_while_len
is totally misleading and incorrect and I have sent V3 patch to fix that.
Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
to force VF elements for each non-final iteration to make sure result is correct.
Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
outcome in non-final iteration.
Actually, I have fully tested this patch with local downstream
auto-vectorization testcases and your comment is correct since I saw the issue as you said
at the beginning so I use MIN_EXPR in multi-rgroups situation but I forgot to update the
comments and make you confused. Really sorry.
>> Is this ifn-specific handling needed? From the description, it seems
>> like WHILE_LEN could be a normal binary internal function.
>> The comment needs to be updated to describe the new interface.
>>This is personal preference, but: I think the interface would be
>>clearer if the code argument came between the base and step,
>>so that the order matches a SCEV. The code could no longer be
>>a default argument, and so all callers would need to be updated,
>>but IMO that's OK. Not sure what others think though.
>>The patch doesn't seem to change vect_estimate_min_profitable_iters though,
>>so the comment doesn't seem accurate.
Address all comments and fix patch V3:
https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616730.html
Feel free to comments.
Thanks
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-04-26 00:58
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
juzhe.zhong@rivai.ai writes:
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6e81dc05e0e..5f44def90d3 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -127,6 +127,7 @@ init_internal_fns ()
> #define cond_binary_direct { 1, 1, true }
> #define cond_ternary_direct { 1, 1, true }
> #define while_direct { 0, 2, false }
> +#define while_len_direct { 0, 0, false }
> #define fold_extract_direct { 2, 2, false }
> #define fold_left_direct { 1, 1, false }
> #define mask_fold_left_direct { 1, 1, false }
> @@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> emit_move_insn (lhs_rtx, ops[0].value);
> }
>
> +/* Expand WHILE_LEN call STMT using optab OPTAB. */
> +static void
> +expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> + expand_operand ops[3];
> + tree rhs_type[2];
> +
> + tree lhs = gimple_call_lhs (stmt);
> + tree lhs_type = TREE_TYPE (lhs);
> + rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> + create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
> +
> + for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
> + {
> + tree rhs = gimple_call_arg (stmt, i);
> + rhs_type[i] = TREE_TYPE (rhs);
> + rtx rhs_rtx = expand_normal (rhs);
> + create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
> + }
> +
> + insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
> +
> + expand_insn (icode, 3, ops);
> + if (!rtx_equal_p (lhs_rtx, ops[0].value))
> + emit_move_insn (lhs_rtx, ops[0].value);
> +}
Is this ifn-specific handling needed? From the description, it seems
like WHILE_LEN could be a normal binary internal function.
> +
> /* Expand a call to a convert-like optab using the operands in STMT.
> FN has a single output operand and NARGS input operands. */
>
> @@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> #define direct_scatter_store_optab_supported_p convert_optab_supported_p
> #define direct_len_store_optab_supported_p direct_optab_supported_p
> #define direct_while_optab_supported_p convert_optab_supported_p
> +#define direct_while_len_optab_supported_p direct_optab_supported_p
> #define direct_fold_extract_optab_supported_p direct_optab_supported_p
> #define direct_fold_left_optab_supported_p direct_optab_supported_p
> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 7fe742c2ae7..3a933abff5d 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>
> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> +DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
> DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
> check_raw_ptrs, check_ptrs)
> DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 695f5911b30..f5938bd2c24 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
> OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
> OPTAB_D (len_load_optab, "len_load_$a")
> OPTAB_D (len_store_optab, "len_store_$a")
> +OPTAB_D (while_len_optab, "while_len$a")
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index a52277abdbf..54845a62298 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> void
> create_iv (tree base, tree step, tree var, class loop *loop,
> gimple_stmt_iterator *incr_pos, bool after,
> - tree *var_before, tree *var_after)
> + tree *var_before, tree *var_after, enum tree_code code)
The comment needs to be updated to describe the new interface.
This is personal preference, but: I think the interface would be
clearer if the code argument came between the base and step,
so that the order matches a SCEV. The code could no longer be
a default argument, and so all callers would need to be updated,
but IMO that's OK. Not sure what others think though.
> {
> gassign *stmt;
> gphi *phi;
> tree initial, step1;
> gimple_seq stmts;
> tree vb, va;
> - enum tree_code incr_op = PLUS_EXPR;
> + enum tree_code incr_op = code;
> edge pe = loop_preheader_edge (loop);
>
> if (var != NULL_TREE)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index d49273a3987..da755320a3a 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -23,7 +23,7 @@ along with GCC; see the file COPYING3. If not see
> typedef void (*transform_callback)(class loop *, void *);
>
> extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
> - bool, tree *, tree *);
> + bool, tree *, tree *, enum tree_code = PLUS_EXPR);
> extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> extern void verify_loop_closed_ssa (bool, class loop * = NULL);
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index f60fa50e8f4..a1c892f285a 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -682,6 +682,210 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
> return next_ctrl;
> }
>
> +/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
> + for all the rgroup controls in RGC and return a control that is nonzero
> + when the loop needs to iterate. Add any new preheader statements to
> + PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
> +
> + RGC belongs to loop LOOP. The loop originally iterated NITERS
> + times and has been vectorized according to LOOP_VINFO.
> +
> + Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
> + to TEST_LIMIT - bias.
> +
> + In vect_set_loop_controls_by_while_len, we are iterating from start at
> + IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
> + IFN_WHILE_LEN pattern.
> +
> + Note: the cost of the code generated by this function is modeled
> + by vect_estimate_min_profitable_iters, so changes here may need
> + corresponding changes there.
The patch doesn't seem to change vect_estimate_min_profitable_iters though,
so the comment doesn't seem accurate.
> +
> + 1. Single rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _19 = (unsigned long) n_5(D);
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
> + ...
> + _22 = .WHILE_LEN (ivtmp_20, vf);
> + ...
> + vector statement (use _22);
> + ...
> + ivtmp_21 = ivtmp_20 - _22;
> + ...
> + if (ivtmp_21 != 0)
> + goto <bb 4>; [75.00%]
> + else
> + goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> + Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
> + underflow 0.
> +
> + 2. Multiple rgroup, the Gimple IR should be:
> +
> + <bb 3>
> + _70 = (unsigned long) bnd.7_52;
> + _71 = _70 * 2;
> + _72 = MAX_EXPR <_71, 4>;
> + _73 = _72 + 18446744073709551612;
> + ...
> +
> + <bb 4>:
> + ...
> + # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
> + # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
> + _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
> + _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
> + ...
> + vector statement (use _79);
> + ...
> + vector statement (use _76);
> + ...
> + _65 = _79 / 2;
> + vector statement (use _65);
> + ...
> + _68 = _76 / 2;
> + vector statement (use _68);
> + ...
> + ivtmp_78 = ivtmp_77 - _79;
> + ivtmp_75 = ivtmp_74 - _76;
> + ...
> + if (ivtmp_78 != 0)
> + goto <bb 4>; [75.00%]
> + else
> + goto <bb 5>; [25.00%]
> +
> + <bb 5>
> + return;
> +
> +*/
I'm not sure it's safe to handle multiple rgroups like this.
For example, if a loop operates on both int32_ts and int64_ts, and uses
fully-packed vectors for both data types, there will be two int64_t
vectors per int32_t vector (since the total number of lanes has to
be the same for both types). The int32_t vectors would be controlled
by one rgroup (with one control) and the int64_t vectors would
be controlled by a different rgroup (with two controls).
But it's critical that the two rgroups process the same number
of scalars every iteration, and that they "agree" on which
elements are being processed.
So if the int32_t length is some value X, where X<=VF, then the int32_t
control will enable the processing of X consecutive elements, starting
at the first lane. It's then critical that the int64_t control enables
processing of those same elements. If VF/2<=X<=VF then the first
int64_t control must include all VF/2 elements and the second must
include exactly the first X-VF/2 elements.
I don't think that's guaranteed by the proposed definition of WHILE_LEN.
The first int64_t WHILE_LEN could come up short, and return something
less than VF/2.
Thanks,
Richard
On Tue, 25 Apr 2023, Richard Sandiford wrote:
> juzhe.zhong@rivai.ai writes:
> > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > index a52277abdbf..54845a62298 100644
> > --- a/gcc/tree-ssa-loop-manip.cc
> > +++ b/gcc/tree-ssa-loop-manip.cc
> > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
> > void
> > create_iv (tree base, tree step, tree var, class loop *loop,
> > gimple_stmt_iterator *incr_pos, bool after,
> > - tree *var_before, tree *var_after)
> > + tree *var_before, tree *var_after, enum tree_code code)
>
> The comment needs to be updated to describe the new interface.
>
> This is personal preference, but: I think the interface would be
> clearer if the code argument came between the base and step,
> so that the order matches a SCEV. The code could no longer be
> a default argument, and so all callers would need to be updated,
> but IMO that's OK. Not sure what others think though.
Just a quick comment - I think decrementing IVs are already supported,
you just have to make 'step' negative (or large positive). If you
really think using MINUS_EXPR is better or even required for
VLA step please add an assert that 'code' is either PLUS_EXPR or
MINUS_EXPR.
Note that for INTEGER_CST step we rewrite x - CST to x + -CST
during folding.
Going over v3 now.
Richard.
Richard Biener <rguenther@suse.de> writes:
> On Tue, 25 Apr 2023, Richard Sandiford wrote:
>> juzhe.zhong@rivai.ai writes:
>> > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
>> > index a52277abdbf..54845a62298 100644
>> > --- a/gcc/tree-ssa-loop-manip.cc
>> > +++ b/gcc/tree-ssa-loop-manip.cc
>> > @@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
>> > void
>> > create_iv (tree base, tree step, tree var, class loop *loop,
>> > gimple_stmt_iterator *incr_pos, bool after,
>> > - tree *var_before, tree *var_after)
>> > + tree *var_before, tree *var_after, enum tree_code code)
>>
>> The comment needs to be updated to describe the new interface.
>>
>> This is personal preference, but: I think the interface would be
>> clearer if the code argument came between the base and step,
>> so that the order matches a SCEV. The code could no longer be
>> a default argument, and so all callers would need to be updated,
>> but IMO that's OK. Not sure what others think though.
>
> Just a quick comment - I think decrementing IVs are already supported,
> you just have to make 'step' negative (or large positive). If you
> really think using MINUS_EXPR is better or even required for
> VLA step please add an assert that 'code' is either PLUS_EXPR or
> MINUS_EXPR.
>
> Note that for INTEGER_CST step we rewrite x - CST to x + -CST
> during folding.
Yeah. I think the problem in this case is that the step is variable.
So if we only supported PLUS_EXPRs, we'd need a separate NEGATE_EXPR
stmt (which presumably would be folded in later).
Thanks,
Richard
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks Richard so much.
>
>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>> The first int64_t WHILE_LEN could come up short, and return something
>>> less than VF/2.
>
> I am so sorry that the comments of vect_set_loop_controls_by_while_len
> is totally misleading and incorrect and I have sent V3 patch to fix that.
> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
> to force VF elements for each non-final iteration to make sure result is correct.
>
> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
> outcome in non-final iteration.
Yeah, I'd read that you weren't using WHILE_LEN for SLP. I was talking
specifically about non-SLP though (nitems_per_iter == 1). Consider:
void f(short *x, long *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
compiled at -O3 -fno-vect-cost-model for SVE:
whilelo p4.d, wzr, w6
whilelo p3.d, wzr, w5
whilelo p2.h, wzr, w3
whilelo p1.d, wzr, w3
whilelo p0.d, wzr, w4
.L2:
ld1d z2.d, p0/z, [x1, #1, mul vl]
ld1d z0.d, p1/z, [x1]
ld1d z1.d, p3/z, [x1, #2, mul vl]
uzp1 z0.s, z0.s, z2.s
ld1d z2.d, p4/z, [x1, #3, mul vl]
uzp1 z1.s, z1.s, z2.s
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p2, [x0, x2, lsl 1]
add x2, x2, x8
whilelo p2.h, w2, w3
whilelo p4.d, w2, w6
whilelo p3.d, w2, w5
whilelo p0.d, w2, w4
add x1, x1, x7
whilelo p1.d, w2, w3
b.any .L2
This is a non-SLP loop. We have two rgroups: a single-mask/control
rgroup for the short vector, and a 4-mask/control rgroup for the long
vector. And the loop converts the Nth long scalar (selected from 4
concatenated vectors) to the Nth short scalar (in a single vector).
It's therefore important that the 4-mask/control rgroup and the
single-mask/control rgroup treat the same lanes/scalar iterations
as active and the same lanes/scalar iterations as inactive.
But if I read the code correctly, the patch would generate 5 WHILE_LENs
for this case, since nitems_per_iter==1 for all 5 controls. And I don't
think the documentation of WHILE_LEN guarantees that that will work
correctly, given that WHILE_LEN isn't a simple MIN operation.
It might be that it works correctly on RVV, given the later
backend-specific processing. But I'm looking at this as a purely
gimple thing. If something guarantees that the above works then
I think the WHILE_LEN documentation needs to be updated.
From the current documentation of WHILE_LEN, I think the safe
approach would be to use WHILE_LEN for a single-control rgroup
and then "expand" that to larger control rgroups using arithmetic.
Specifically, if the length computed by the single-control rgroup
is X, then control I in an N-control rgroup would need to be:
(X - VF*I/N) capped to the range [0, VF/N]
SVE does something similar for:
void f(short *x, int *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
Here we use a single WHILELO and then unpack it, rather than use
three independent WHILELOs:
whilelo p0.h, wzr, w3
.L2:
punpklo p2.h, p0.b
punpkhi p1.h, p0.b
ld1w z0.s, p2/z, [x1, x2, lsl 2]
ld1w z1.s, p1/z, [x5, x2, lsl 2]
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p0, [x0, x2, lsl 1]
add x2, x2, x4
whilelo p0.h, w2, w3
b.any .L2
This is dreadful code (hence the -fno-vect-cost-model). And I'm sure
it's equally bad for RVV. But the point is that we can generate it,
and in more exotic cases it might even be worthwhile.
Thanks,
Richard
Thank you so much for pointing out this issue.
After reading your comments carefully, I need to revise "vect_set_loop_controls_by_while_len" in loop control like this:
vect_set_loop_controls_by_while_len
...
tree X = NULL_TREE;
FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
...
if (i == 0) {
X = gimple_build (WHILE_LEN);
gimple_build_assign (ctrl, X);
} else {
// (X - VF*I/N) capped to the range [0, VF/N]
tree t = gimple_build (MINUS, X, build_int_cst (VF*I/N));
gimple_build_assign (ctrl, t);
}
}
....
Am I understand your idea correctly ?
So the example you shows in ARM SVE gimple IR, is like this:
_3 = &MEM <vector([2,2]) long int> [(long int *)_2];
vect__4.6_15 = .MASK_LOAD (_3, 64B, loop_mask_21); (INT64)
_5 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [16B, 16B]];
vect__4.7_8 = .MASK_LOAD (_5, 64B, loop_mask_20);(INT64)
_7 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [32B, 32B]];
vect__4.8_28 = .MASK_LOAD (_7, 64B, loop_mask_19);(INT64)
_24 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [48B, 48B]];
vect__4.9_30 = .MASK_LOAD (_24, 64B, loop_mask_16); (INT64)
vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
...
.MASK_STORE (_13, 16B, loop_mask_36, vect__7.10_33); (INT16)
If it is changed into WHILE_LEN style, it should be:
X = WHILE_LEN;
_3 = &MEM <vector([2,2]) long int> [(long int *)_2];
vect__4.6_15 = .LEN_LOAD (_3, 64B, X - VF*1/N); (INT64)
_5 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*1/N)*8 ];
vect__4.7_8 = .LEN_LOAD (_5, 64B, X - VF*2/N);(INT64)
_7 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*2/N)*8];
vect__4.8_28 = .LEN_LOAD (_7, 64B, X - VF*3/N);(INT64)
_24 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*3/N)*8];
vect__4.9_30 = .LEN_LOAD (_24, 64B, X - VF*4/N); (INT64)
vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
...
.LEN_STORE (_13, 16B, X, vect__7.10_33); (INT16)
Is this correct ?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-04-26 16:06
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks Richard so much.
>
>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>> The first int64_t WHILE_LEN could come up short, and return something
>>> less than VF/2.
>
> I am so sorry that the comments of vect_set_loop_controls_by_while_len
> is totally misleading and incorrect and I have sent V3 patch to fix that.
> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
> to force VF elements for each non-final iteration to make sure result is correct.
>
> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
> outcome in non-final iteration.
Yeah, I'd read that you weren't using WHILE_LEN for SLP. I was talking
specifically about non-SLP though (nitems_per_iter == 1). Consider:
void f(short *x, long *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
compiled at -O3 -fno-vect-cost-model for SVE:
whilelo p4.d, wzr, w6
whilelo p3.d, wzr, w5
whilelo p2.h, wzr, w3
whilelo p1.d, wzr, w3
whilelo p0.d, wzr, w4
.L2:
ld1d z2.d, p0/z, [x1, #1, mul vl]
ld1d z0.d, p1/z, [x1]
ld1d z1.d, p3/z, [x1, #2, mul vl]
uzp1 z0.s, z0.s, z2.s
ld1d z2.d, p4/z, [x1, #3, mul vl]
uzp1 z1.s, z1.s, z2.s
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p2, [x0, x2, lsl 1]
add x2, x2, x8
whilelo p2.h, w2, w3
whilelo p4.d, w2, w6
whilelo p3.d, w2, w5
whilelo p0.d, w2, w4
add x1, x1, x7
whilelo p1.d, w2, w3
b.any .L2
This is a non-SLP loop. We have two rgroups: a single-mask/control
rgroup for the short vector, and a 4-mask/control rgroup for the long
vector. And the loop converts the Nth long scalar (selected from 4
concatenated vectors) to the Nth short scalar (in a single vector).
It's therefore important that the 4-mask/control rgroup and the
single-mask/control rgroup treat the same lanes/scalar iterations
as active and the same lanes/scalar iterations as inactive.
But if I read the code correctly, the patch would generate 5 WHILE_LENs
for this case, since nitems_per_iter==1 for all 5 controls. And I don't
think the documentation of WHILE_LEN guarantees that that will work
correctly, given that WHILE_LEN isn't a simple MIN operation.
It might be that it works correctly on RVV, given the later
backend-specific processing. But I'm looking at this as a purely
gimple thing. If something guarantees that the above works then
I think the WHILE_LEN documentation needs to be updated.
From the current documentation of WHILE_LEN, I think the safe
approach would be to use WHILE_LEN for a single-control rgroup
and then "expand" that to larger control rgroups using arithmetic.
Specifically, if the length computed by the single-control rgroup
is X, then control I in an N-control rgroup would need to be:
(X - VF*I/N) capped to the range [0, VF/N]
SVE does something similar for:
void f(short *x, int *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
Here we use a single WHILELO and then unpack it, rather than use
three independent WHILELOs:
whilelo p0.h, wzr, w3
.L2:
punpklo p2.h, p0.b
punpkhi p1.h, p0.b
ld1w z0.s, p2/z, [x1, x2, lsl 2]
ld1w z1.s, p1/z, [x5, x2, lsl 2]
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p0, [x0, x2, lsl 1]
add x2, x2, x4
whilelo p0.h, w2, w3
b.any .L2
This is dreadful code (hence the -fno-vect-cost-model). And I'm sure
it's equally bad for RVV. But the point is that we can generate it,
and in more exotic cases it might even be worthwhile.
Thanks,
Richard
Oh, I see。 I just checked the codes again.
I think I can't do it directly in the vect_set_loop_controls_by_while_len
Instead, I should do something like this for length:
/* First try using permutes. This adds a single vector
instruction to the loop for each mask, but needs no extra
loop invariants or IVs. */
unsigned int nmasks = i + 1;
if (use_masks_p && (nmasks & 1) == 0)
{
rgroup_controls *half_rgc = &(*controls)[nmasks / 2 - 1];
if (!half_rgc->controls.is_empty ()
&& vect_maybe_permute_loop_masks (&header_seq, rgc, half_rgc))
continue;
}
Is that correct?
juzhe.zhong@rivai.ai
From: juzhe.zhong@rivai.ai
Date: 2023-04-26 16:55
To: richard.sandiford
CC: gcc-patches; rguenther
Subject: Re: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
Thank you so much for pointing out this issue.
After reading your comments carefully, I need to revise "vect_set_loop_controls_by_while_len" in loop control like this:
vect_set_loop_controls_by_while_len
...
tree X = NULL_TREE;
FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
...
if (i == 0) {
X = gimple_build (WHILE_LEN);
gimple_build_assign (ctrl, X);
} else {
// (X - VF*I/N) capped to the range [0, VF/N]
tree t = gimple_build (MINUS, X, build_int_cst (VF*I/N));
gimple_build_assign (ctrl, t);
}
}
....
Am I understand your idea correctly ?
So the example you shows in ARM SVE gimple IR, is like this:
_3 = &MEM <vector([2,2]) long int> [(long int *)_2];
vect__4.6_15 = .MASK_LOAD (_3, 64B, loop_mask_21); (INT64)
_5 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [16B, 16B]];
vect__4.7_8 = .MASK_LOAD (_5, 64B, loop_mask_20);(INT64)
_7 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [32B, 32B]];
vect__4.8_28 = .MASK_LOAD (_7, 64B, loop_mask_19);(INT64)
_24 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [48B, 48B]];
vect__4.9_30 = .MASK_LOAD (_24, 64B, loop_mask_16); (INT64)
vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
...
.MASK_STORE (_13, 16B, loop_mask_36, vect__7.10_33); (INT16)
If it is changed into WHILE_LEN style, it should be:
X = WHILE_LEN;
_3 = &MEM <vector([2,2]) long int> [(long int *)_2];
vect__4.6_15 = .LEN_LOAD (_3, 64B, X - VF*1/N); (INT64)
_5 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*1/N)*8 ];
vect__4.7_8 = .LEN_LOAD (_5, 64B, X - VF*2/N);(INT64)
_7 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*2/N)*8];
vect__4.8_28 = .LEN_LOAD (_7, 64B, X - VF*3/N);(INT64)
_24 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*3/N)*8];
vect__4.9_30 = .LEN_LOAD (_24, 64B, X - VF*4/N); (INT64)
vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
...
.LEN_STORE (_13, 16B, X, vect__7.10_33); (INT16)
Is this correct ?
Thanks.
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-04-26 16:06
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thanks Richard so much.
>
>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>> The first int64_t WHILE_LEN could come up short, and return something
>>> less than VF/2.
>
> I am so sorry that the comments of vect_set_loop_controls_by_while_len
> is totally misleading and incorrect and I have sent V3 patch to fix that.
> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
> to force VF elements for each non-final iteration to make sure result is correct.
>
> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
> outcome in non-final iteration.
Yeah, I'd read that you weren't using WHILE_LEN for SLP. I was talking
specifically about non-SLP though (nitems_per_iter == 1). Consider:
void f(short *x, long *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
compiled at -O3 -fno-vect-cost-model for SVE:
whilelo p4.d, wzr, w6
whilelo p3.d, wzr, w5
whilelo p2.h, wzr, w3
whilelo p1.d, wzr, w3
whilelo p0.d, wzr, w4
.L2:
ld1d z2.d, p0/z, [x1, #1, mul vl]
ld1d z0.d, p1/z, [x1]
ld1d z1.d, p3/z, [x1, #2, mul vl]
uzp1 z0.s, z0.s, z2.s
ld1d z2.d, p4/z, [x1, #3, mul vl]
uzp1 z1.s, z1.s, z2.s
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p2, [x0, x2, lsl 1]
add x2, x2, x8
whilelo p2.h, w2, w3
whilelo p4.d, w2, w6
whilelo p3.d, w2, w5
whilelo p0.d, w2, w4
add x1, x1, x7
whilelo p1.d, w2, w3
b.any .L2
This is a non-SLP loop. We have two rgroups: a single-mask/control
rgroup for the short vector, and a 4-mask/control rgroup for the long
vector. And the loop converts the Nth long scalar (selected from 4
concatenated vectors) to the Nth short scalar (in a single vector).
It's therefore important that the 4-mask/control rgroup and the
single-mask/control rgroup treat the same lanes/scalar iterations
as active and the same lanes/scalar iterations as inactive.
But if I read the code correctly, the patch would generate 5 WHILE_LENs
for this case, since nitems_per_iter==1 for all 5 controls. And I don't
think the documentation of WHILE_LEN guarantees that that will work
correctly, given that WHILE_LEN isn't a simple MIN operation.
It might be that it works correctly on RVV, given the later
backend-specific processing. But I'm looking at this as a purely
gimple thing. If something guarantees that the above works then
I think the WHILE_LEN documentation needs to be updated.
From the current documentation of WHILE_LEN, I think the safe
approach would be to use WHILE_LEN for a single-control rgroup
and then "expand" that to larger control rgroups using arithmetic.
Specifically, if the length computed by the single-control rgroup
is X, then control I in an N-control rgroup would need to be:
(X - VF*I/N) capped to the range [0, VF/N]
SVE does something similar for:
void f(short *x, int *y) {
for (int i = 0; i < 100; ++i)
x[i] = y[i];
}
Here we use a single WHILELO and then unpack it, rather than use
three independent WHILELOs:
whilelo p0.h, wzr, w3
.L2:
punpklo p2.h, p0.b
punpkhi p1.h, p0.b
ld1w z0.s, p2/z, [x1, x2, lsl 2]
ld1w z1.s, p1/z, [x5, x2, lsl 2]
uzp1 z0.h, z0.h, z1.h
st1h z0.h, p0, [x0, x2, lsl 1]
add x2, x2, x4
whilelo p0.h, w2, w3
b.any .L2
This is dreadful code (hence the -fno-vect-cost-model). And I'm sure
it's equally bad for RVV. But the point is that we can generate it,
and in more exotic cases it might even be worthwhile.
Thanks,
Richard
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thank you so much for pointing out this issue.
>
> After reading your comments carefully, I need to revise "vect_set_loop_controls_by_while_len" in loop control like this:
>
> vect_set_loop_controls_by_while_len
> ...
> tree X = NULL_TREE;
> FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
> ...
> if (i == 0) {
> X = gimple_build (WHILE_LEN);
> gimple_build_assign (ctrl, X);
> } else {
> // (X - VF*I/N) capped to the range [0, VF/N]
> tree t = gimple_build (MINUS, X, build_int_cst (VF*I/N));
> gimple_build_assign (ctrl, t);
> }
> }
> ....
>
> Am I understand your idea correctly ?
I think it's more that rgc->controls.length () == 1 is a special case,
rather than i == 0 being a special case.
That is, rgc->controls.length () == 1 can use a single WHILE_LEN to
calculate the number of scalars that will be processed by the current
loop iteration. Let's call it X. Then all rgroups with
rgc->controls.length () > 1 will be based on X rather than using
WHILE_LEN. (And they would do that even for the first control in the
group, i.e. for i == 0.)
I'm not saying it has to be this way. It might be that a different
arrangement is better for the later RVV processing. But there needs
to be something in the gimple-level description, and something in
the optab documentation, that guarantees that whatever code we
generate for these cases works correctly.
BTW, very minor thing (I should have raised it earlier), but maybe
something like SELECT_VL would be a better name than WHILE_LEN?
WHILE_ULT means "while (IV) is unsigned less than" and so describes
an operation in terms of its arguments. But I think WHILE_LEN is
more describing an operation based on its use case.
Thanks,
Richard
>
> So the example you shows in ARM SVE gimple IR, is like this:
>
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
> vect__4.6_15 = .MASK_LOAD (_3, 64B, loop_mask_21); (INT64)
> _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [16B, 16B]];
> vect__4.7_8 = .MASK_LOAD (_5, 64B, loop_mask_20);(INT64)
> _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [32B, 32B]];
> vect__4.8_28 = .MASK_LOAD (_7, 64B, loop_mask_19);(INT64)
> _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [48B, 48B]];
> vect__4.9_30 = .MASK_LOAD (_24, 64B, loop_mask_16); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
> vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
> vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .MASK_STORE (_13, 16B, loop_mask_36, vect__7.10_33); (INT16)
>
> If it is changed into WHILE_LEN style, it should be:
>
> X = WHILE_LEN;
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
> vect__4.6_15 = .LEN_LOAD (_3, 64B, X - VF*1/N); (INT64)
> _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*1/N)*8 ];
> vect__4.7_8 = .LEN_LOAD (_5, 64B, X - VF*2/N);(INT64)
> _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*2/N)*8];
> vect__4.8_28 = .LEN_LOAD (_7, 64B, X - VF*3/N);(INT64)
> _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*3/N)*8];
> vect__4.9_30 = .LEN_LOAD (_24, 64B, X - VF*4/N); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
> vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
> vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .LEN_STORE (_13, 16B, X, vect__7.10_33); (INT16)
>
> Is this correct ?
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Sandiford
> Date: 2023-04-26 16:06
> To: juzhe.zhong\@rivai.ai
> CC: gcc-patches; rguenther
> Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>> Thanks Richard so much.
>>
>>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>>> The first int64_t WHILE_LEN could come up short, and return something
>>>> less than VF/2.
>>
>> I am so sorry that the comments of vect_set_loop_controls_by_while_len
>> is totally misleading and incorrect and I have sent V3 patch to fix that.
>> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
>> to force VF elements for each non-final iteration to make sure result is correct.
>>
>> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
>> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
>> outcome in non-final iteration.
>
> Yeah, I'd read that you weren't using WHILE_LEN for SLP. I was talking
> specifically about non-SLP though (nitems_per_iter == 1). Consider:
>
> void f(short *x, long *y) {
> for (int i = 0; i < 100; ++i)
> x[i] = y[i];
> }
>
> compiled at -O3 -fno-vect-cost-model for SVE:
>
> whilelo p4.d, wzr, w6
> whilelo p3.d, wzr, w5
> whilelo p2.h, wzr, w3
> whilelo p1.d, wzr, w3
> whilelo p0.d, wzr, w4
> .L2:
> ld1d z2.d, p0/z, [x1, #1, mul vl]
> ld1d z0.d, p1/z, [x1]
> ld1d z1.d, p3/z, [x1, #2, mul vl]
> uzp1 z0.s, z0.s, z2.s
> ld1d z2.d, p4/z, [x1, #3, mul vl]
> uzp1 z1.s, z1.s, z2.s
> uzp1 z0.h, z0.h, z1.h
> st1h z0.h, p2, [x0, x2, lsl 1]
> add x2, x2, x8
> whilelo p2.h, w2, w3
> whilelo p4.d, w2, w6
> whilelo p3.d, w2, w5
> whilelo p0.d, w2, w4
> add x1, x1, x7
> whilelo p1.d, w2, w3
> b.any .L2
>
> This is a non-SLP loop. We have two rgroups: a single-mask/control
> rgroup for the short vector, and a 4-mask/control rgroup for the long
> vector. And the loop converts the Nth long scalar (selected from 4
> concatenated vectors) to the Nth short scalar (in a single vector).
>
> It's therefore important that the 4-mask/control rgroup and the
> single-mask/control rgroup treat the same lanes/scalar iterations
> as active and the same lanes/scalar iterations as inactive.
>
> But if I read the code correctly, the patch would generate 5 WHILE_LENs
> for this case, since nitems_per_iter==1 for all 5 controls. And I don't
> think the documentation of WHILE_LEN guarantees that that will work
> correctly, given that WHILE_LEN isn't a simple MIN operation.
>
> It might be that it works correctly on RVV, given the later
> backend-specific processing. But I'm looking at this as a purely
> gimple thing. If something guarantees that the above works then
> I think the WHILE_LEN documentation needs to be updated.
>
> From the current documentation of WHILE_LEN, I think the safe
> approach would be to use WHILE_LEN for a single-control rgroup
> and then "expand" that to larger control rgroups using arithmetic.
> Specifically, if the length computed by the single-control rgroup
> is X, then control I in an N-control rgroup would need to be:
>
> (X - VF*I/N) capped to the range [0, VF/N]
>
> SVE does something similar for:
>
> void f(short *x, int *y) {
> for (int i = 0; i < 100; ++i)
> x[i] = y[i];
> }
>
> Here we use a single WHILELO and then unpack it, rather than use
> three independent WHILELOs:
>
> whilelo p0.h, wzr, w3
> .L2:
> punpklo p2.h, p0.b
> punpkhi p1.h, p0.b
> ld1w z0.s, p2/z, [x1, x2, lsl 2]
> ld1w z1.s, p1/z, [x5, x2, lsl 2]
> uzp1 z0.h, z0.h, z1.h
> st1h z0.h, p0, [x0, x2, lsl 1]
> add x2, x2, x4
> whilelo p0.h, w2, w3
> b.any .L2
>
> This is dreadful code (hence the -fno-vect-cost-model). And I'm sure
> it's equally bad for RVV. But the point is that we can generate it,
> and in more exotic cases it might even be worthwhile.
>
> Thanks,
> Richard
>
Hi, Richard.
Would you mind take a look at the loop control part again:
static gcond *
vect_set_loop_condition_partial_vectors (class loop *loop,
loop_vec_info loop_vinfo, tree niters,
tree final_iv, bool niters_maybe_zero,
gimple_stmt_iterator loop_cond_gsi)
...
tree loop_len_x = NULL_TREE;
FOR_EACH_VEC_ELT (*controls, i, rgc)
if (!rgc->controls.is_empty ())
{
...
/* Set up all controls for this group. */
if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
OPTIMIZE_FOR_SPEED))
test_ctrl
= vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
&preheader_seq, &header_seq,
rgc, niters, &loop_len_x);
...
}
static tree
vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info loop_vinfo,
gimple_seq *preheader_seq,
gimple_seq *header_seq,
rgroup_controls *rgc, tree niters, tree *x)
{
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
/* We are not allowing masked approach in SELECT_VL. */
gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
tree ctrl_type = rgc->type;
unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
/* Calculate the maximum number of item values that the rgroup
handles in total, the number that it handles for each iteration
of the vector loop. */
tree nitems_total = niters;
if (nitems_per_iter != 1)
{
/* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
these multiplications don't overflow. */
tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
nitems_total, compare_factor);
}
/* Convert the comparison value to the IV type (either a no-op or
a promotion). */
nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
/* Create an induction variable that counts the number of items
processed. */
tree index_before_incr, index_after_incr;
gimple_stmt_iterator incr_gsi;
bool insert_after;
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
/* Test the decremented IV, which will never underflow 0 since we have
IFN_SELECT_VL to gurantee that. */
tree test_limit = nitems_total;
/* Provide a definition of each control in the group. */
tree ctrl;
unsigned int i;
FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
{
/* Previous controls will cover BIAS items. This control covers the
next batch. */
poly_uint64 bias = nitems_per_ctrl * i;
tree bias_tree = build_int_cst (iv_type, bias);
/* Rather than have a new IV that starts at TEST_LIMIT and goes down to
BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
control and adjust the bound down by BIAS. */
tree this_test_limit = test_limit;
if (i != 0)
{
this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
this_test_limit, bias_tree);
this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
this_test_limit, bias_tree);
}
/* Create decrement IV. */
create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
insert_after, &index_before_incr, &index_after_incr);
tree res_len;
if (rgc->controls.length () != 1)
{
if (nitems_per_iter == 1)
{
/* Generte length = (X - VF*I/N) capped to the range [0, VF/N]. */
/* step = VF * I / N. */
tree step
= build_int_cst (iv_type,
exact_div (vf * i, rgc->controls.length ()));
/* Make sure (X - VF*I/N) never underflow zero. */
tree max = gimple_build (header_seq, MAX_EXPR, iv_type, *x, step);
res_len
= gimple_build (header_seq, MIN_EXPR, iv_type,
index_before_incr,
build_int_cst (iv_type, vf * nitems_per_iter));
}
else
{
/* For SLP, we can't allow non-VF number of elements to be
processed in non-final iteration. We force the number of
elements to be processed in each non-final iteration is VF
elements. If we allow non-VF elements processing in non-final
iteration will make SLP too complicated and produce inferior
codegen.
For example:
If non-final iteration process VF elements.
...
.LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
.LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
...
If non-final iteration process non-VF elements.
...
.LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
if (_71 % 2 == 0)
.LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
else
.LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
...
This is the simple case of 2-elements interleaved vector SLP.
We consider other interleave vector, the situation will become
more complicated. */
res_len
= gimple_build (header_seq, MIN_EXPR, iv_type,
index_before_incr,
build_int_cst (iv_type, vf * nitems_per_iter));
}
}
else
{
res_len
= gimple_build (header_seq, IFN_SELECT_VL, iv_type,
index_before_incr, build_int_cst (iv_type, vf));
}
gassign *assign = gimple_build_assign (ctrl, res_len);
gimple_seq_add_stmt (header_seq, assign);
if (rgc->controls.length () == 1)
*x = ctrl;
}
return index_after_incr;
}
Am I understand correctly ? I think I may need to implement VEC_PACK_TRUNC to test your idea.
Thanks
juzhe.zhong@rivai.ai
From: Richard Sandiford
Date: 2023-04-26 17:06
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Thank you so much for pointing out this issue.
>
> After reading your comments carefully, I need to revise "vect_set_loop_controls_by_while_len" in loop control like this:
>
> vect_set_loop_controls_by_while_len
> ...
> tree X = NULL_TREE;
> FOR_EACH_VEC_ELT (rgc->controls, i, ctrl)
> ...
> if (i == 0) {
> X = gimple_build (WHILE_LEN);
> gimple_build_assign (ctrl, X);
> } else {
> // (X - VF*I/N) capped to the range [0, VF/N]
> tree t = gimple_build (MINUS, X, build_int_cst (VF*I/N));
> gimple_build_assign (ctrl, t);
> }
> }
> ....
>
> Am I understand your idea correctly ?
I think it's more that rgc->controls.length () == 1 is a special case,
rather than i == 0 being a special case.
That is, rgc->controls.length () == 1 can use a single WHILE_LEN to
calculate the number of scalars that will be processed by the current
loop iteration. Let's call it X. Then all rgroups with
rgc->controls.length () > 1 will be based on X rather than using
WHILE_LEN. (And they would do that even for the first control in the
group, i.e. for i == 0.)
I'm not saying it has to be this way. It might be that a different
arrangement is better for the later RVV processing. But there needs
to be something in the gimple-level description, and something in
the optab documentation, that guarantees that whatever code we
generate for these cases works correctly.
BTW, very minor thing (I should have raised it earlier), but maybe
something like SELECT_VL would be a better name than WHILE_LEN?
WHILE_ULT means "while (IV) is unsigned less than" and so describes
an operation in terms of its arguments. But I think WHILE_LEN is
more describing an operation based on its use case.
Thanks,
Richard
>
> So the example you shows in ARM SVE gimple IR, is like this:
>
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
> vect__4.6_15 = .MASK_LOAD (_3, 64B, loop_mask_21); (INT64)
> _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [16B, 16B]];
> vect__4.7_8 = .MASK_LOAD (_5, 64B, loop_mask_20);(INT64)
> _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [32B, 32B]];
> vect__4.8_28 = .MASK_LOAD (_7, 64B, loop_mask_19);(INT64)
> _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + POLY_INT_CST [48B, 48B]];
> vect__4.9_30 = .MASK_LOAD (_24, 64B, loop_mask_16); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
> vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
> vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .MASK_STORE (_13, 16B, loop_mask_36, vect__7.10_33); (INT16)
>
> If it is changed into WHILE_LEN style, it should be:
>
> X = WHILE_LEN;
> _3 = &MEM <vector([2,2]) long int> [(long int *)_2];
> vect__4.6_15 = .LEN_LOAD (_3, 64B, X - VF*1/N); (INT64)
> _5 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*1/N)*8 ];
> vect__4.7_8 = .LEN_LOAD (_5, 64B, X - VF*2/N);(INT64)
> _7 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*2/N)*8];
> vect__4.8_28 = .LEN_LOAD (_7, 64B, X - VF*3/N);(INT64)
> _24 = &MEM <vector([2,2]) long int> [(long int *)_2 + (X - VF*3/N)*8];
> vect__4.9_30 = .LEN_LOAD (_24, 64B, X - VF*4/N); (INT64)
> vect__7.11_31 = VEC_PACK_TRUNC_EXPR <vect__4.6_15, vect__4.7_8>;
> vect__7.11_32 = VEC_PACK_TRUNC_EXPR <vect__4.8_28, vect__4.9_30>;
> vect__7.10_33 = VEC_PACK_TRUNC_EXPR <vect__7.11_31, vect__7.11_32>;
> ...
> .LEN_STORE (_13, 16B, X, vect__7.10_33); (INT16)
>
> Is this correct ?
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Sandiford
> Date: 2023-04-26 16:06
> To: juzhe.zhong\@rivai.ai
> CC: gcc-patches; rguenther
> Subject: Re: [PATCH] VECT: Add decrement IV iteration loop control by variable amount support
> "juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
>> Thanks Richard so much.
>>
>>>> I don't think that's guaranteed by the proposed definition of WHILE_LEN.
>>>> The first int64_t WHILE_LEN could come up short, and return something
>>>> less than VF/2.
>>
>> I am so sorry that the comments of vect_set_loop_controls_by_while_len
>> is totally misleading and incorrect and I have sent V3 patch to fix that.
>> Actually, I don't use WHILE_LEN in multi-rgroups situation, instead, I use MIN_EXPR
>> to force VF elements for each non-final iteration to make sure result is correct.
>>
>> Yes, I agree with you that WHILE_LEN will produce issues for SLP situation that
>> having multi-rgroups since WHILE_LEN definition is allow target produces non-VF
>> outcome in non-final iteration.
>
> Yeah, I'd read that you weren't using WHILE_LEN for SLP. I was talking
> specifically about non-SLP though (nitems_per_iter == 1). Consider:
>
> void f(short *x, long *y) {
> for (int i = 0; i < 100; ++i)
> x[i] = y[i];
> }
>
> compiled at -O3 -fno-vect-cost-model for SVE:
>
> whilelo p4.d, wzr, w6
> whilelo p3.d, wzr, w5
> whilelo p2.h, wzr, w3
> whilelo p1.d, wzr, w3
> whilelo p0.d, wzr, w4
> .L2:
> ld1d z2.d, p0/z, [x1, #1, mul vl]
> ld1d z0.d, p1/z, [x1]
> ld1d z1.d, p3/z, [x1, #2, mul vl]
> uzp1 z0.s, z0.s, z2.s
> ld1d z2.d, p4/z, [x1, #3, mul vl]
> uzp1 z1.s, z1.s, z2.s
> uzp1 z0.h, z0.h, z1.h
> st1h z0.h, p2, [x0, x2, lsl 1]
> add x2, x2, x8
> whilelo p2.h, w2, w3
> whilelo p4.d, w2, w6
> whilelo p3.d, w2, w5
> whilelo p0.d, w2, w4
> add x1, x1, x7
> whilelo p1.d, w2, w3
> b.any .L2
>
> This is a non-SLP loop. We have two rgroups: a single-mask/control
> rgroup for the short vector, and a 4-mask/control rgroup for the long
> vector. And the loop converts the Nth long scalar (selected from 4
> concatenated vectors) to the Nth short scalar (in a single vector).
>
> It's therefore important that the 4-mask/control rgroup and the
> single-mask/control rgroup treat the same lanes/scalar iterations
> as active and the same lanes/scalar iterations as inactive.
>
> But if I read the code correctly, the patch would generate 5 WHILE_LENs
> for this case, since nitems_per_iter==1 for all 5 controls. And I don't
> think the documentation of WHILE_LEN guarantees that that will work
> correctly, given that WHILE_LEN isn't a simple MIN operation.
>
> It might be that it works correctly on RVV, given the later
> backend-specific processing. But I'm looking at this as a purely
> gimple thing. If something guarantees that the above works then
> I think the WHILE_LEN documentation needs to be updated.
>
> From the current documentation of WHILE_LEN, I think the safe
> approach would be to use WHILE_LEN for a single-control rgroup
> and then "expand" that to larger control rgroups using arithmetic.
> Specifically, if the length computed by the single-control rgroup
> is X, then control I in an N-control rgroup would need to be:
>
> (X - VF*I/N) capped to the range [0, VF/N]
>
> SVE does something similar for:
>
> void f(short *x, int *y) {
> for (int i = 0; i < 100; ++i)
> x[i] = y[i];
> }
>
> Here we use a single WHILELO and then unpack it, rather than use
> three independent WHILELOs:
>
> whilelo p0.h, wzr, w3
> .L2:
> punpklo p2.h, p0.b
> punpkhi p1.h, p0.b
> ld1w z0.s, p2/z, [x1, x2, lsl 2]
> ld1w z1.s, p1/z, [x5, x2, lsl 2]
> uzp1 z0.h, z0.h, z1.h
> st1h z0.h, p0, [x0, x2, lsl 1]
> add x2, x2, x4
> whilelo p0.h, w2, w3
> b.any .L2
>
> This is dreadful code (hence the -fno-vect-cost-model). And I'm sure
> it's equally bad for RVV. But the point is that we can generate it,
> and in more exotic cases it might even be worthwhile.
>
> Thanks,
> Richard
>
"juzhe.zhong@rivai.ai" <juzhe.zhong@rivai.ai> writes:
> Hi, Richard.
> Would you mind take a look at the loop control part again:
>
> static gcond *
> vect_set_loop_condition_partial_vectors (class loop *loop,
> loop_vec_info loop_vinfo, tree niters,
> tree final_iv, bool niters_maybe_zero,
> gimple_stmt_iterator loop_cond_gsi)
> ...
> tree loop_len_x = NULL_TREE;
> FOR_EACH_VEC_ELT (*controls, i, rgc)
> if (!rgc->controls.is_empty ())
> {
> ...
>
> /* Set up all controls for this group. */
> if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> OPTIMIZE_FOR_SPEED))
> test_ctrl
> = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
> &preheader_seq, &header_seq,
> rgc, niters, &loop_len_x);
> ...
> }
>
> static tree
> vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info loop_vinfo,
> gimple_seq *preheader_seq,
> gimple_seq *header_seq,
> rgroup_controls *rgc, tree niters, tree *x)
> {
> tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
> tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> /* We are not allowing masked approach in SELECT_VL. */
> gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
>
> tree ctrl_type = rgc->type;
> unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
> poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
> poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>
> /* Calculate the maximum number of item values that the rgroup
> handles in total, the number that it handles for each iteration
> of the vector loop. */
> tree nitems_total = niters;
> if (nitems_per_iter != 1)
> {
> /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
> these multiplications don't overflow. */
> tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
> nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> nitems_total, compare_factor);
> }
>
> /* Convert the comparison value to the IV type (either a no-op or
> a promotion). */
> nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
>
> /* Create an induction variable that counts the number of items
> processed. */
> tree index_before_incr, index_after_incr;
> gimple_stmt_iterator incr_gsi;
> bool insert_after;
> standard_iv_increment_position (loop, &incr_gsi, &insert_after);
>
> /* Test the decremented IV, which will never underflow 0 since we have
> IFN_SELECT_VL to gurantee that. */
> tree test_limit = nitems_total;
>
> /* Provide a definition of each control in the group. */
> tree ctrl;
> unsigned int i;
> FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
> {
> /* Previous controls will cover BIAS items. This control covers the
> next batch. */
> poly_uint64 bias = nitems_per_ctrl * i;
> tree bias_tree = build_int_cst (iv_type, bias);
>
> /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
> BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
> control and adjust the bound down by BIAS. */
> tree this_test_limit = test_limit;
> if (i != 0)
> {
> this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
> this_test_limit, bias_tree);
> this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
> this_test_limit, bias_tree);
> }
>
> /* Create decrement IV. */
> create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
> insert_after, &index_before_incr, &index_after_incr);
>
> tree res_len;
> if (rgc->controls.length () != 1)
> {
> if (nitems_per_iter == 1)
> {
> /* Generte length = (X - VF*I/N) capped to the range [0, VF/N]. */
> /* step = VF * I / N. */
> tree step
> = build_int_cst (iv_type,
> exact_div (vf * i, rgc->controls.length ()));
> /* Make sure (X - VF*I/N) never underflow zero. */
> tree max = gimple_build (header_seq, MAX_EXPR, iv_type, *x, step);
> res_len
> = gimple_build (header_seq, MIN_EXPR, iv_type,
> index_before_incr,
> build_int_cst (iv_type, vf * nitems_per_iter));
> }
> else
> {
> /* For SLP, we can't allow non-VF number of elements to be
> processed in non-final iteration. We force the number of
> elements to be processed in each non-final iteration is VF
> elements. If we allow non-VF elements processing in non-final
> iteration will make SLP too complicated and produce inferior
> codegen.
>
> For example:
>
> If non-final iteration process VF elements.
>
> ...
> .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
> .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
> ...
>
> If non-final iteration process non-VF elements.
>
> ...
> .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
> if (_71 % 2 == 0)
> .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
> else
> .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
> ...
>
> This is the simple case of 2-elements interleaved vector SLP.
> We consider other interleave vector, the situation will become
> more complicated. */
> res_len
> = gimple_build (header_seq, MIN_EXPR, iv_type,
> index_before_incr,
> build_int_cst (iv_type, vf * nitems_per_iter));
> }
> }
> else
> {
> res_len
> = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
> index_before_incr, build_int_cst (iv_type, vf));
> }
> gassign *assign = gimple_build_assign (ctrl, res_len);
> gimple_seq_add_stmt (header_seq, assign);
> if (rgc->controls.length () == 1)
> *x = ctrl;
> }
>
> return index_after_incr;
> }
>
> Am I understand correctly ? I think I may need to implement VEC_PACK_TRUNC to test your idea.
The formatting didn't reach me in tact, so TBH it's a bit difficult to
follow. But VF/N should in this code be equivalent to nitems_per_ctrl.
There shouldn't be any need to build an exact_div.
Also, only the single-control rgroup is/needs an IV. The other cases just
go at the start of the loop body, using the single-control IV as input.
Unless I'm missing something, the same length > 1 code could be used
for both SLP and non-SLP. (The length == 1 handling would still be
different for SLP and non-SLP.)
Thanks,
Richard
@@ -4965,6 +4965,40 @@ for (i = 1; i < operand3; i++)
operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{while_len@var{m}@var{n}} instruction pattern
+@item @code{while_len@var{m}@var{n}}
+Set operand 0 to the number of active elements in vector will be updated value.
+operand 1 is the total elements need to be updated value.
+operand 2 is the vectorization factor.
+The value of operand 0 is target dependent and flexible in each iteration.
+The operation of this pattern can be:
+
+@smallexample
+Case 1:
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some target like RISC-V has a standalone instruction to get MIN (n, MODE SIZE) so
+that we can reduce a use of general purpose register.
+
+In this case, only the last iteration of the loop is partial iteration.
+@end smallexample
+
+@smallexample
+Case 2:
+if (operand1 <= operand2)
+ operand0 = operand1;
+else if (operand1 < 2 * operand2)
+ operand0 = IN_RANGE (ceil (operand1 / 2), operand2);
+else
+ operand0 = operand2;
+
+This case will evenly distribute work over the last 2 iterations of a stripmine loop.
+@end smallexample
+
+The output of this pattern is not only used as IV of loop control counter, but also
+is used as the IV of address calculation with multiply/shift operation. This allow
+us dynamic adjust the number of elements is processed in each iteration of the loop.
+
@cindex @code{check_raw_ptrs@var{m}} instruction pattern
@item @samp{check_raw_ptrs@var{m}}
Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
@@ -127,6 +127,7 @@ init_internal_fns ()
#define cond_binary_direct { 1, 1, true }
#define cond_ternary_direct { 1, 1, true }
#define while_direct { 0, 2, false }
+#define while_len_direct { 0, 0, false }
#define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false }
#define mask_fold_left_direct { 1, 1, false }
@@ -3702,6 +3703,33 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
emit_move_insn (lhs_rtx, ops[0].value);
}
+/* Expand WHILE_LEN call STMT using optab OPTAB. */
+static void
+expand_while_len_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+ expand_operand ops[3];
+ tree rhs_type[2];
+
+ tree lhs = gimple_call_lhs (stmt);
+ tree lhs_type = TREE_TYPE (lhs);
+ rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+ create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+ for (unsigned int i = 0; i < gimple_call_num_args (stmt); ++i)
+ {
+ tree rhs = gimple_call_arg (stmt, i);
+ rhs_type[i] = TREE_TYPE (rhs);
+ rtx rhs_rtx = expand_normal (rhs);
+ create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type[i]));
+ }
+
+ insn_code icode = direct_optab_handler (optab, TYPE_MODE (rhs_type[0]));
+
+ expand_insn (icode, 3, ops);
+ if (!rtx_equal_p (lhs_rtx, ops[0].value))
+ emit_move_insn (lhs_rtx, ops[0].value);
+}
+
/* Expand a call to a convert-like optab using the operands in STMT.
FN has a single output operand and NARGS input operands. */
@@ -3843,6 +3871,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
#define direct_scatter_store_optab_supported_p convert_optab_supported_p
#define direct_len_store_optab_supported_p direct_optab_supported_p
#define direct_while_optab_supported_p convert_optab_supported_p
+#define direct_while_len_optab_supported_p direct_optab_supported_p
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p
#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (WHILE_LEN, ECF_CONST | ECF_NOTHROW, while_len, while_len)
DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
check_raw_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
OPTAB_D (len_load_optab, "len_load_$a")
OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (while_len_optab, "while_len$a")
@@ -59,14 +59,14 @@ static bitmap_obstack loop_renamer_obstack;
void
create_iv (tree base, tree step, tree var, class loop *loop,
gimple_stmt_iterator *incr_pos, bool after,
- tree *var_before, tree *var_after)
+ tree *var_before, tree *var_after, enum tree_code code)
{
gassign *stmt;
gphi *phi;
tree initial, step1;
gimple_seq stmts;
tree vb, va;
- enum tree_code incr_op = PLUS_EXPR;
+ enum tree_code incr_op = code;
edge pe = loop_preheader_edge (loop);
if (var != NULL_TREE)
@@ -23,7 +23,7 @@ along with GCC; see the file COPYING3. If not see
typedef void (*transform_callback)(class loop *, void *);
extern void create_iv (tree, tree, tree, class loop *, gimple_stmt_iterator *,
- bool, tree *, tree *);
+ bool, tree *, tree *, enum tree_code = PLUS_EXPR);
extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
extern void verify_loop_closed_ssa (bool, class loop * = NULL);
@@ -682,6 +682,210 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
return next_ctrl;
}
+/* Helper for vect_set_loop_condition_partial_vectors. Generate definitions
+ for all the rgroup controls in RGC and return a control that is nonzero
+ when the loop needs to iterate. Add any new preheader statements to
+ PREHEADER_SEQ. Use LOOP_COND_GSI to insert code before the exit gcond.
+
+ RGC belongs to loop LOOP. The loop originally iterated NITERS
+ times and has been vectorized according to LOOP_VINFO.
+
+ Unlike vect_set_loop_controls_directly which is iterating from 0-based IV
+ to TEST_LIMIT - bias.
+
+ In vect_set_loop_controls_by_while_len, we are iterating from start at
+ IV = TEST_LIMIT - bias and keep subtract IV by the length calculated by
+ IFN_WHILE_LEN pattern.
+
+ Note: the cost of the code generated by this function is modeled
+ by vect_estimate_min_profitable_iters, so changes here may need
+ corresponding changes there.
+
+ 1. Single rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _19 = (unsigned long) n_5(D);
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_20 = PHI <ivtmp_21(4), _19(3)>
+ ...
+ _22 = .WHILE_LEN (ivtmp_20, vf);
+ ...
+ vector statement (use _22);
+ ...
+ ivtmp_21 = ivtmp_20 - _22;
+ ...
+ if (ivtmp_21 != 0)
+ goto <bb 4>; [75.00%]
+ else
+ goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+ Note: IFN_WHILE_LEN will guarantee "ivtmp_21 = ivtmp_20 - _22" never
+ underflow 0.
+
+ 2. Multiple rgroup, the Gimple IR should be:
+
+ <bb 3>
+ _70 = (unsigned long) bnd.7_52;
+ _71 = _70 * 2;
+ _72 = MAX_EXPR <_71, 4>;
+ _73 = _72 + 18446744073709551612;
+ ...
+
+ <bb 4>:
+ ...
+ # ivtmp_74 = PHI <ivtmp_75(6), _73(12)>
+ # ivtmp_77 = PHI <ivtmp_78(6), _71(12)>
+ _76 = .WHILE_LEN (ivtmp_74, vf * nitems_per_ctrl);
+ _79 = .WHILE_LEN (ivtmp_77, vf * nitems_per_ctrl);
+ ...
+ vector statement (use _79);
+ ...
+ vector statement (use _76);
+ ...
+ _65 = _79 / 2;
+ vector statement (use _65);
+ ...
+ _68 = _76 / 2;
+ vector statement (use _68);
+ ...
+ ivtmp_78 = ivtmp_77 - _79;
+ ivtmp_75 = ivtmp_74 - _76;
+ ...
+ if (ivtmp_78 != 0)
+ goto <bb 4>; [75.00%]
+ else
+ goto <bb 5>; [25.00%]
+
+ <bb 5>
+ return;
+
+*/
+
+static tree
+vect_set_loop_controls_by_while_len (class loop *loop, loop_vec_info loop_vinfo,
+ gimple_seq *preheader_seq,
+ gimple_seq *header_seq,
+ rgroup_controls *rgc, tree niters)
+{
+ tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ /* We are not allowing masked approach in WHILE_LEN. */
+ gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+ tree ctrl_type = rgc->type;
+ unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+ poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+ /* Calculate the maximum number of item values that the rgroup
+ handles in total, the number that it handles for each iteration
+ of the vector loop. */
+ tree nitems_total = niters;
+ if (nitems_per_iter != 1)
+ {
+ /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+ these multiplications don't overflow. */
+ tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+ nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nitems_total, compare_factor);
+ }
+
+ /* Convert the comparison value to the IV type (either a no-op or
+ a promotion). */
+ nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+ /* Create an induction variable that counts the number of items
+ processed. */
+ tree index_before_incr, index_after_incr;
+ gimple_stmt_iterator incr_gsi;
+ bool insert_after;
+ standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+ /* Test the decremented IV, which will never underflow 0 since we have
+ IFN_WHILE_LEN to gurantee that. */
+ tree test_limit = nitems_total;
+
+ /* Provide a definition of each control in the group. */
+ tree ctrl;
+ unsigned int i;
+ FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+ {
+ /* Previous controls will cover BIAS items. This control covers the
+ next batch. */
+ poly_uint64 bias = nitems_per_ctrl * i;
+ tree bias_tree = build_int_cst (iv_type, bias);
+
+ /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+ BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+ control and adjust the bound down by BIAS. */
+ tree this_test_limit = test_limit;
+ if (i != 0)
+ {
+ this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+ this_test_limit, bias_tree);
+ this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+ this_test_limit, bias_tree);
+ }
+
+ /* Create decrement IV. */
+ create_iv (this_test_limit, ctrl, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr,
+ MINUS_EXPR);
+
+ poly_uint64 final_vf = vf * nitems_per_iter;
+ tree vf_step = build_int_cst (iv_type, final_vf);
+ tree res_len;
+ if (nitems_per_iter != 1)
+ {
+ /* For SLP, we can't allow non-VF number of elements to be processed
+ in non-final iteration. We force the number of elements to be
+ processed in each non-final iteration is VF elements. If we allow
+ non-VF elements processing in non-final iteration will make SLP too
+ complicated and produce inferior codegen.
+
+ For example:
+
+ If non-final iteration process VF elements.
+
+ ...
+ .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+ ...
+
+ If non-final iteration process non-VF elements.
+
+ ...
+ .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+ if (_71 % 2 == 0)
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+ else
+ .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
+ ...
+
+ This is the simple case of 2-elements interleaved vector SLP. We
+ consider other interleave vector, the situation will become more
+ complicated. */
+ res_len = gimple_build (header_seq, MIN_EXPR, iv_type,
+ index_before_incr, vf_step);
+ }
+ else
+ {
+ res_len = gimple_build (header_seq, IFN_WHILE_LEN, iv_type,
+ index_before_incr, vf_step);
+ }
+ gassign *assign = gimple_build_assign (ctrl, res_len);
+ gimple_seq_add_stmt (header_seq, assign);
+ }
+
+ return index_after_incr;
+}
+
/* Set up the iteration condition and rgroup controls for LOOP, given
that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
loop. LOOP_VINFO describes the vectorization of LOOP. NITERS is
@@ -703,6 +907,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
unsigned int compare_precision = TYPE_PRECISION (compare_type);
tree orig_niters = niters;
@@ -757,12 +962,18 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
/* Set up all controls for this group. */
- test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
- &preheader_seq,
- &header_seq,
- loop_cond_gsi, rgc,
- niters, niters_skip,
- might_wrap_p);
+ if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+ OPTIMIZE_FOR_SPEED))
+ test_ctrl
+ = vect_set_loop_controls_by_while_len (loop, loop_vinfo,
+ &preheader_seq, &header_seq,
+ rgc, niters);
+ else
+ test_ctrl
+ = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+ &header_seq, loop_cond_gsi, rgc,
+ niters, niters_skip,
+ might_wrap_p);
}
/* Emit all accumulated statements. */
@@ -10364,12 +10364,14 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
- unsigned int nvectors, unsigned int index)
+vect_get_loop_len (gimple_stmt_iterator *gsi, loop_vec_info loop_vinfo,
+ vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+ unsigned int index)
{
rgroup_controls *rgl = &(*lens)[nvectors - 1];
bool use_bias_adjusted_len =
LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
/* Populate the rgroup's len array, if this is the first time we've
used it. */
@@ -10400,6 +10402,27 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
if (use_bias_adjusted_len)
return rgl->bias_adjusted_ctrl;
+ else if (direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+ OPTIMIZE_FOR_SPEED))
+ {
+ tree loop_len = rgl->controls[index];
+ poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+ poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+ if (maybe_ne (nunits1, nunits2))
+ {
+ /* A loop len for data type X can be reused for data type Y
+ if X has N times more elements than Y and if Y's elements
+ are N times bigger than X's. */
+ gcc_assert (multiple_p (nunits1, nunits2));
+ unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+ gimple_seq seq = NULL;
+ loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+ build_int_cst (iv_type, factor));
+ if (seq)
+ gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+ }
+ return loop_len;
+ }
else
return rgl->controls[index];
}
@@ -3144,6 +3144,70 @@ vect_get_data_ptr_increment (vec_info *vinfo,
return iv_step;
}
+/* Prepare the pointer IVs which needs to be updated by a variable amount.
+ Such variable amount is the outcome of .WHILE_LEN. In this case, we can
+ allow each iteration process the flexible number of elements as long as
+ the number <= vf elments.
+
+ Return data reference according to WHILE_LEN.
+ If new statements are needed, insert them before GSI. */
+
+static tree
+get_while_len_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
+ tree aggr_type, class loop *at_loop, tree offset,
+ tree *dummy, gimple_stmt_iterator *gsi,
+ bool simd_lane_access_p, vec_loop_lens *loop_lens,
+ dr_vec_info *dr_info,
+ vect_memory_access_type memory_access_type)
+{
+ if (!loop_lens || loop_lens->length () != 1)
+ return NULL_TREE;
+ loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ tree step = vect_dr_behavior (vinfo, dr_info)->step;
+ if (!direct_internal_fn_supported_p (IFN_WHILE_LEN, iv_type,
+ OPTIMIZE_FOR_SPEED))
+ return NULL_TREE;
+
+ if (memory_access_type == VMAT_INVARIANT)
+ return NULL_TREE;
+
+ /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
+ IVs are updated by variable amount but we will support them in the future.
+ */
+ gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+ && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+ /* When we support WHILE_LEN pattern, we dynamic adjust
+ the memory address by .WHILE_LEN result.
+
+ The result of .WHILE_LEN is the number of elements to
+ be processed of each iteration. So the memory address
+ adjustment operation should be:
+
+ bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+ addr = addr + .WHILE_LEN (ARG..) * bytesize;
+ */
+ gimple *ptr_incr;
+ tree loop_len
+ = vect_get_loop_len (gsi, loop_vinfo, loop_lens, 1, aggr_type, 0);
+ tree len_type = TREE_TYPE (loop_len);
+ poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+ /* Since the outcome of .WHILE_LEN is element size, we should adjust
+ it into bytesize so that it can be used in address pointer variable
+ amount IVs adjustment. */
+ tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+ build_int_cst (len_type, bytesize));
+ if (tree_int_cst_sgn (step) == -1)
+ tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);
+ tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+ gassign *assign = gimple_build_assign (bump, tmp);
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+ return vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop, offset,
+ dummy, gsi, &ptr_incr, simd_lane_access_p,
+ bump);
+}
+
/* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}. */
static bool
@@ -8465,6 +8529,15 @@ vectorizable_store (vec_info *vinfo,
simd_lane_access_p ? loop : NULL,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, bump);
+
+ tree while_len_data_ref_ptr
+ = get_while_len_data_ref_ptr (vinfo, stmt_info, aggr_type,
+ simd_lane_access_p ? loop : NULL,
+ offset, &dummy, gsi,
+ simd_lane_access_p, loop_lens,
+ dr_info, memory_access_type);
+ if (while_len_data_ref_ptr)
+ dataref_ptr = while_len_data_ref_ptr;
}
else
{
@@ -8652,8 +8725,9 @@ vectorizable_store (vec_info *vinfo,
else if (loop_lens)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies, vec_num * j + i);
+ = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i);
tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
machine_mode vmode = TYPE_MODE (vectype);
opt_machine_mode new_ovmode
@@ -9798,6 +9872,15 @@ vectorizable_load (vec_info *vinfo,
at_loop,
offset, &dummy, gsi, &ptr_incr,
simd_lane_access_p, bump);
+
+ tree while_len_data_ref_ptr
+ = get_while_len_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop,
+ offset, &dummy, gsi,
+ simd_lane_access_p, loop_lens,
+ dr_info, memory_access_type);
+ if (while_len_data_ref_ptr)
+ dataref_ptr = while_len_data_ref_ptr;
+
if (mask)
vec_mask = vec_masks[0];
}
@@ -10008,8 +10091,8 @@ vectorizable_load (vec_info *vinfo,
else if (loop_lens && memory_access_type != VMAT_INVARIANT)
{
tree final_len
- = vect_get_loop_len (loop_vinfo, loop_lens,
- vec_num * ncopies,
+ = vect_get_loop_len (gsi, loop_vinfo, loop_lens,
+ vec_num * ncopies, vectype,
vec_num * j + i);
tree ptr = build_int_cst (ref_type,
align * BITS_PER_UNIT);
@@ -2293,8 +2293,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
- unsigned int);
+extern tree vect_get_loop_len (gimple_stmt_iterator *, loop_vec_info, vec_loop_lens *,
+ unsigned int, tree, unsigned int);
extern gimple_seq vect_gen_len (tree, tree, tree, tree);
extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);