[V3] VECT: Support loop len control on EXTRACT_LAST vectorization
Checks
Commit Message
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Hi, Richard and Richi.
This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
Consider this following case:
#include <stdint.h>
#define EXTRACT_LAST(TYPE) \
TYPE __attribute__ ((noinline, noclone)) \
test_##TYPE (TYPE *x, int n, TYPE value) \
{ \
TYPE last; \
for (int j = 0; j < n; ++j) \
{ \
last = x[j]; \
x[j] = last * value; \
} \
return last; \
}
#define TEST_ALL(T) \
T (uint8_t) \
TEST_ALL (EXTRACT_LAST)
ARM SVE IR:
Preheader:
max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
Loop:
...
# loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
...
vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
.MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
...
next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
...
Epilogue:
_25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
For RVV since we prefer len in loop control, after this patch for RVV:
Loop:
...
loop_len_22 = SELECT_VL;
vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
.MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
...
Epilogue:
_25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
Details of this approach:
1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
for LEN loop control.
This function we check whether target support:
- Use LEN as the loop control.
- Support VEC_EXTRACT optab.
2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
The only difference between mask and len is that len is using length generated by SELECT_VL and
use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
Bootstrap and Regression on X86 passed.
Tested on ARM QEMU.
Ok for trunk?
gcc/ChangeLog:
* tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
(vectorizable_live_operation): Add loop len control.
---
gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
1 file changed, 70 insertions(+), 6 deletions(-)
Comments
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
>
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
>
> Consider this following case:
>
> #include <stdint.h>
>
> #define EXTRACT_LAST(TYPE) \
> TYPE __attribute__ ((noinline, noclone)) \
> test_##TYPE (TYPE *x, int n, TYPE value) \
> { \
> TYPE last; \
> for (int j = 0; j < n; ++j) \
> { \
> last = x[j]; \
> x[j] = last * value; \
> } \
> return last; \
> }
>
> #define TEST_ALL(T) \
> T (uint8_t) \
>
> TEST_ALL (EXTRACT_LAST)
>
> ARM SVE IR:
>
> Preheader:
> max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
>
> Loop:
> ...
> # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> ...
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> ...
> next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> ...
>
> Epilogue:
> _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
>
> For RVV since we prefer len in loop control, after this patch for RVV:
>
> Loop:
> ...
> loop_len_22 = SELECT_VL;
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> ...
>
> Epilogue:
> _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
>
> Details of this approach:
>
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> for LEN loop control.
>
> This function we check whether target support:
> - Use LEN as the loop control.
> - Support VEC_EXTRACT optab.
>
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
>
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
>
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
>
> Bootstrap and Regression on X86 passed.
>
> Tested on ARM QEMU.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> (vectorizable_live_operation): Add loop len control.
>
> ---
> gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..809b73b966c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> && vect_can_vectorize_without_simd_p (tree_code (code)));
> }
>
> +/* Return true if target supports extract last vectorization with LEN. */
> +
> +static bool
> +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> +{
> + /* Return false if target doesn't support LEN in loop control. */
> + machine_mode vmode;
> + machine_mode vec_mode = TYPE_MODE (vectype);
> + if (!VECTOR_MODE_P (vec_mode))
> + return false;
> + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> + return false;
So this "hidden" bit in the end decides whether to ...
> + /* Target need to support VEC_EXTRACT to extract the last active element. */
> + return convert_optab_handler (vec_extract_optab,
> + vec_mode,
> + TYPE_MODE (TREE_TYPE (vectype)))
> + != CODE_FOR_nothing;
> +}
> +
> /* Create vector init for vectorized iv. */
> static tree
> vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> - OPTIMIZE_FOR_SPEED))
> + OPTIMIZE_FOR_SPEED)
> + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> else
> {
> gcc_assert (ncopies == 1 && !slp_node);
> - vect_record_loop_mask (loop_vinfo,
> - &LOOP_VINFO_MASKS (loop_vinfo),
> - 1, vectype, NULL);
> + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> + vect_record_loop_len (loop_vinfo,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 1);
.. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
well but of course .VEC_EXTRACT support itself doesn't have anything to
do with 'len' support.
x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
its partial vector support still wants masks, not lens (and once
we record both we fail).
So how can we resolve the issue when a non-VL operation like
.VEC_EXTRACT is used for _len support?
Note x86 doens't yet support IFN_EXTRACT_LAST either.
So, why do we test for get_len_load_store_mode and not just for
VEC_EXTRACT?
> + else
> + vect_record_loop_mask (loop_vinfo,
> + &LOOP_VINFO_MASKS (loop_vinfo),
> + 1, vectype, NULL);
> }
> }
> /* ??? Enable for loop costing as well. */
> @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> gimple *vec_stmt;
> if (slp_node)
> {
> - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> + gcc_assert (!loop_vinfo
> + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>
> /* Get the correct slp vectorized stmt. */
> vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>
> gimple_seq stmts = NULL;
> tree new_tree;
> - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> + /* Emit:
> +
> + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> + where VEC_LHS is the vectorized live-out result and MASK is
> + the loop mask for the final iteration. */
> + gcc_assert (ncopies == 1 && !slp_node);
> + gimple_seq tem = NULL;
> + gimple_stmt_iterator gsi = gsi_last (tem);
> + tree len
> + = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> + /* BIAS - 1. */
> + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + tree bias_minus_one
> + = int_const_binop (MINUS_EXPR,
> + build_int_cst (TREE_TYPE (len), biasval),
> + build_one_cst (TREE_TYPE (len)));
> +
> + /* LAST_INDEX = LEN + (BIAS - 1). */
> + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> + len, bias_minus_one);
> +
> + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> + tree scalar_res
> + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> + vec_lhs_phi, last_index);
> +
can we double-check this on powerpc and s390?
Thanks,
Richard.
> + /* Convert the extracted vector element to the scalar type. */
> + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> {
> /* Emit:
>
Hi, Richi.
>> So how can we resolve the issue when a non-VL operation like
>> .VEC_EXTRACT is used for _len support?
Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
https://godbolt.org/z/cqrWrY8q4
#define EXTRACT_LAST(TYPE) \
TYPE __attribute__ ((noinline, noclone)) \
test_##TYPE (TYPE *x, int n, TYPE value) \
{ \
TYPE last; \
for (int j = 0; j < 64; ++j) \
{ \
last = x[j]; \
x[j] = last * value; \
} \
return last; \
}
#define TEST_ALL(T) \
T (uint8_t) \
TEST_ALL (EXTRACT_LAST)
vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
_21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
>> So, why do we test for get_len_load_store_mode and not just for
>> VEC_EXTRACT?
Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
Here is the example:
https://godbolt.org/z/8cTv1jqMb
ARM SVE IR:
<bb 4> [local count: 955630224]:
# ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
# loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
_7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
.MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
_1 = (unsigned int) ivtmp_32;
next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
if (next_mask_35 != { 0, ... })
goto <bb 4>; [89.00%]
else
goto <bb 5>; [11.00%]
<bb 5> [local count: 105119324]:
_25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
1. Target is using loop MASK as the partial vector loop control.
2. extract_last optab is enabled in the backend.
So for RVV, we are also checking same conditions:
1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
2. vec_extract optab is enabled in the backend.
An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
>> can we double-check this on powerpc and s390?
Sure, I hope it can be beneficial to powerpc and s390.
And, I think Richard's comments are also very important so I am gonna wait for it.
Thanks.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-08-11 15:01
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford; linkw; krebbel
Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
>
> This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
>
> Consider this following case:
>
> #include <stdint.h>
>
> #define EXTRACT_LAST(TYPE) \
> TYPE __attribute__ ((noinline, noclone)) \
> test_##TYPE (TYPE *x, int n, TYPE value) \
> { \
> TYPE last; \
> for (int j = 0; j < n; ++j) \
> { \
> last = x[j]; \
> x[j] = last * value; \
> } \
> return last; \
> }
>
> #define TEST_ALL(T) \
> T (uint8_t) \
>
> TEST_ALL (EXTRACT_LAST)
>
> ARM SVE IR:
>
> Preheader:
> max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
>
> Loop:
> ...
> # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> ...
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> ...
> next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> ...
>
> Epilogue:
> _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
>
> For RVV since we prefer len in loop control, after this patch for RVV:
>
> Loop:
> ...
> loop_len_22 = SELECT_VL;
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> ...
>
> Epilogue:
> _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
>
> Details of this approach:
>
> 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> for LEN loop control.
>
> This function we check whether target support:
> - Use LEN as the loop control.
> - Support VEC_EXTRACT optab.
>
> 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
>
> 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
>
> The only difference between mask and len is that len is using length generated by SELECT_VL and
> use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
>
> Bootstrap and Regression on X86 passed.
>
> Tested on ARM QEMU.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> (vectorizable_live_operation): Add loop len control.
>
> ---
> gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 70 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bf8d677b584..809b73b966c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> && vect_can_vectorize_without_simd_p (tree_code (code)));
> }
>
> +/* Return true if target supports extract last vectorization with LEN. */
> +
> +static bool
> +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> +{
> + /* Return false if target doesn't support LEN in loop control. */
> + machine_mode vmode;
> + machine_mode vec_mode = TYPE_MODE (vectype);
> + if (!VECTOR_MODE_P (vec_mode))
> + return false;
> + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> + return false;
So this "hidden" bit in the end decides whether to ...
> + /* Target need to support VEC_EXTRACT to extract the last active element. */
> + return convert_optab_handler (vec_extract_optab,
> + vec_mode,
> + TYPE_MODE (TREE_TYPE (vectype)))
> + != CODE_FOR_nothing;
> +}
> +
> /* Create vector init for vectorized iv. */
> static tree
> vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> - OPTIMIZE_FOR_SPEED))
> + OPTIMIZE_FOR_SPEED)
> + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> else
> {
> gcc_assert (ncopies == 1 && !slp_node);
> - vect_record_loop_mask (loop_vinfo,
> - &LOOP_VINFO_MASKS (loop_vinfo),
> - 1, vectype, NULL);
> + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> + vect_record_loop_len (loop_vinfo,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 1);
.. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
well but of course .VEC_EXTRACT support itself doesn't have anything to
do with 'len' support.
x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
its partial vector support still wants masks, not lens (and once
we record both we fail).
So how can we resolve the issue when a non-VL operation like
.VEC_EXTRACT is used for _len support?
Note x86 doens't yet support IFN_EXTRACT_LAST either.
So, why do we test for get_len_load_store_mode and not just for
VEC_EXTRACT?
> + else
> + vect_record_loop_mask (loop_vinfo,
> + &LOOP_VINFO_MASKS (loop_vinfo),
> + 1, vectype, NULL);
> }
> }
> /* ??? Enable for loop costing as well. */
> @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> gimple *vec_stmt;
> if (slp_node)
> {
> - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> + gcc_assert (!loop_vinfo
> + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
>
> /* Get the correct slp vectorized stmt. */
> vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>
> gimple_seq stmts = NULL;
> tree new_tree;
> - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> + /* Emit:
> +
> + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> + where VEC_LHS is the vectorized live-out result and MASK is
> + the loop mask for the final iteration. */
> + gcc_assert (ncopies == 1 && !slp_node);
> + gimple_seq tem = NULL;
> + gimple_stmt_iterator gsi = gsi_last (tem);
> + tree len
> + = vect_get_loop_len (loop_vinfo, &gsi,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + 1, vectype, 0, 0);
> +
> + /* BIAS - 1. */
> + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + tree bias_minus_one
> + = int_const_binop (MINUS_EXPR,
> + build_int_cst (TREE_TYPE (len), biasval),
> + build_one_cst (TREE_TYPE (len)));
> +
> + /* LAST_INDEX = LEN + (BIAS - 1). */
> + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> + len, bias_minus_one);
> +
> + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> + tree scalar_res
> + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> + vec_lhs_phi, last_index);
> +
can we double-check this on powerpc and s390?
Thanks,
Richard.
> + /* Convert the extracted vector element to the scalar type. */
> + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> + }
> + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> {
> /* Emit:
>
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> >> So how can we resolve the issue when a non-VL operation like
> >> .VEC_EXTRACT is used for _len support?
>
> Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
> If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
>
> https://godbolt.org/z/cqrWrY8q4
>
> #define EXTRACT_LAST(TYPE) \
> TYPE __attribute__ ((noinline, noclone)) \
> test_##TYPE (TYPE *x, int n, TYPE value) \
> { \
> TYPE last; \
> for (int j = 0; j < 64; ++j) \
> { \
> last = x[j]; \
> x[j] = last * value; \
> } \
> return last; \
> }
>
> #define TEST_ALL(T) \
> T (uint8_t) \
>
> TEST_ALL (EXTRACT_LAST)
>
> vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
> vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
> vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
> MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
> _21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
>
> This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
>
> >> So, why do we test for get_len_load_store_mode and not just for
> >> VEC_EXTRACT?
>
> Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
>
> Here is the example:
> https://godbolt.org/z/8cTv1jqMb
>
> ARM SVE IR:
>
> <bb 4> [local count: 955630224]:
> # ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
>
> # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
>
> _7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
> _1 = (unsigned int) ivtmp_32;
>
> next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>
> if (next_mask_35 != { 0, ... })
> goto <bb 4>; [89.00%]
> else
> goto <bb 5>; [11.00%]
>
> <bb 5> [local count: 105119324]:
>
> _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
>
> So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
>
> For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
>
> 1. Target is using loop MASK as the partial vector loop control.
I don't think it checks for this?
> 2. extract_last optab is enabled in the backend.
>
> So for RVV, we are also checking same conditions:
>
> 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
But we don't really know this at this point? The only thing we know
is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
> 2. vec_extract optab is enabled in the backend.
>
> An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
I think it should work to change the direct_internal_fn_supported_p
check for IFN_EXTRACT_LAST to a "poitive" one guarding
gcc_assert (ncopies == 1 && !slp_node);
vect_record_loop_mask (loop_vinfo,
&LOOP_VINFO_MASKS (loop_vinfo),
1, vectype, NULL);
and in the else branch check for VEC_EXTRACT support and if present
record a loop len. Just in this case this particular order would
be important.
> >> can we double-check this on powerpc and s390?
>
> Sure, I hope it can be beneficial to powerpc and s390.
> And, I think Richard's comments are also very important so I am gonna wait for it.
Yeah, just to double-check the bias stuff works correctly.
Richard.
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-08-11 15:01
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford; linkw; krebbel
> Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> >
> > Hi, Richard and Richi.
> >
> > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> >
> > Consider this following case:
> >
> > #include <stdint.h>
> >
> > #define EXTRACT_LAST(TYPE) \
> > TYPE __attribute__ ((noinline, noclone)) \
> > test_##TYPE (TYPE *x, int n, TYPE value) \
> > { \
> > TYPE last; \
> > for (int j = 0; j < n; ++j) \
> > { \
> > last = x[j]; \
> > x[j] = last * value; \
> > } \
> > return last; \
> > }
> >
> > #define TEST_ALL(T) \
> > T (uint8_t) \
> >
> > TEST_ALL (EXTRACT_LAST)
> >
> > ARM SVE IR:
> >
> > Preheader:
> > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> >
> > Loop:
> > ...
> > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > ...
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > ...
> > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > ...
> >
> > Epilogue:
> > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> >
> > For RVV since we prefer len in loop control, after this patch for RVV:
> >
> > Loop:
> > ...
> > loop_len_22 = SELECT_VL;
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > ...
> >
> > Epilogue:
> > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> >
> > Details of this approach:
> >
> > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> > for LEN loop control.
> >
> > This function we check whether target support:
> > - Use LEN as the loop control.
> > - Support VEC_EXTRACT optab.
> >
> > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> >
> > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> >
> > The only difference between mask and len is that len is using length generated by SELECT_VL and
> > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> >
> > Bootstrap and Regression on X86 passed.
> >
> > Tested on ARM QEMU.
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > (vectorizable_live_operation): Add loop len control.
> >
> > ---
> > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 70 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index bf8d677b584..809b73b966c 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > }
> >
> > +/* Return true if target supports extract last vectorization with LEN. */
> > +
> > +static bool
> > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > +{
> > + /* Return false if target doesn't support LEN in loop control. */
> > + machine_mode vmode;
> > + machine_mode vec_mode = TYPE_MODE (vectype);
> > + if (!VECTOR_MODE_P (vec_mode))
> > + return false;
> > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> > + return false;
>
> So this "hidden" bit in the end decides whether to ...
>
> > + /* Target need to support VEC_EXTRACT to extract the last active element. */
> > + return convert_optab_handler (vec_extract_optab,
> > + vec_mode,
> > + TYPE_MODE (TREE_TYPE (vectype)))
> > + != CODE_FOR_nothing;
> > +}
> > +
> > /* Create vector init for vectorized iv. */
> > static tree
> > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > {
> > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > - OPTIMIZE_FOR_SPEED))
> > + OPTIMIZE_FOR_SPEED)
> > + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > {
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > else
> > {
> > gcc_assert (ncopies == 1 && !slp_node);
> > - vect_record_loop_mask (loop_vinfo,
> > - &LOOP_VINFO_MASKS (loop_vinfo),
> > - 1, vectype, NULL);
> > + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > + vect_record_loop_len (loop_vinfo,
> > + &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 1);
>
> .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
> well but of course .VEC_EXTRACT support itself doesn't have anything to
> do with 'len' support.
>
> x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
> its partial vector support still wants masks, not lens (and once
> we record both we fail).
>
> So how can we resolve the issue when a non-VL operation like
> .VEC_EXTRACT is used for _len support?
>
> Note x86 doens't yet support IFN_EXTRACT_LAST either.
>
> So, why do we test for get_len_load_store_mode and not just for
> VEC_EXTRACT?
>
> > + else
> > + vect_record_loop_mask (loop_vinfo,
> > + &LOOP_VINFO_MASKS (loop_vinfo),
> > + 1, vectype, NULL);
> > }
> > }
> > /* ??? Enable for loop costing as well. */
> > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > gimple *vec_stmt;
> > if (slp_node)
> > {
> > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > + gcc_assert (!loop_vinfo
> > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> >
> > /* Get the correct slp vectorized stmt. */
> > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> >
> > gimple_seq stmts = NULL;
> > tree new_tree;
> > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > + {
> > + /* Emit:
> > +
> > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > +
> > + where VEC_LHS is the vectorized live-out result and MASK is
> > + the loop mask for the final iteration. */
> > + gcc_assert (ncopies == 1 && !slp_node);
> > + gimple_seq tem = NULL;
> > + gimple_stmt_iterator gsi = gsi_last (tem);
> > + tree len
> > + = vect_get_loop_len (loop_vinfo, &gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 0, 0);
> > +
> > + /* BIAS - 1. */
> > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + tree bias_minus_one
> > + = int_const_binop (MINUS_EXPR,
> > + build_int_cst (TREE_TYPE (len), biasval),
> > + build_one_cst (TREE_TYPE (len)));
> > +
> > + /* LAST_INDEX = LEN + (BIAS - 1). */
> > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > + len, bias_minus_one);
> > +
> > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> > + tree scalar_res
> > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > + vec_lhs_phi, last_index);
> > +
>
> can we double-check this on powerpc and s390?
>
> Thanks,
> Richard.
>
> > + /* Convert the extracted vector element to the scalar type. */
> > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > + }
> > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > {
> > /* Emit:
> >
>
>
Hi, Richi.
> 1. Target is using loop MASK as the partial vector loop control.
>> I don't think it checks for this?
I am not sure whether I understand EXTRACT_LAST correctly.
But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST?
Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand.
> But we don't really know this at this point? The only thing we know
> is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control.
Well, I admit it's not a good idea.
> I think it should work to change the direct_internal_fn_supported_p
> check for IFN_EXTRACT_LAST to a "poitive" one guarding
> gcc_assert (ncopies == 1 && !slp_node);
> vect_record_loop_mask (loop_vinfo,
> &LOOP_VINFO_MASKS (loop_vinfo),
> 1, vectype, NULL);
> and in the else branch check for VEC_EXTRACT support and if present
> record a loop len. Just in this case this particular order would
> be important.
Do you mean change the codes as follows :?
- if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
- OPTIMIZE_FOR_SPEED))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't operate on partial vectors "
- "because the target doesn't support extract "
- "last reduction.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
- else if (slp_node)
if (slp_node)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't operate on partial vectors "
"because an SLP statement is live after "
"the loop.\n");
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else if (ncopies > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't operate on partial vectors "
"because ncopies is greater than 1.\n");
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
}
else
{
gcc_assert (ncopies == 1 && !slp_node);
if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
OPTIMIZE_FOR_SPEED))
vect_record_loop_mask (loop_vinfo,
&LOOP_VINFO_MASKS (loop_vinfo),
1, vectype, NULL);
else
vect_record_loop_len (loop_vinfo,
&LOOP_VINFO_LENS (loop_vinfo),
1, vectype, 1);
}
Thanks.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-08-11 18:21
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; linkw; krebbel
Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> >> So how can we resolve the issue when a non-VL operation like
> >> .VEC_EXTRACT is used for _len support?
>
> Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
> If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
>
> https://godbolt.org/z/cqrWrY8q4
>
> #define EXTRACT_LAST(TYPE) \
> TYPE __attribute__ ((noinline, noclone)) \
> test_##TYPE (TYPE *x, int n, TYPE value) \
> { \
> TYPE last; \
> for (int j = 0; j < 64; ++j) \
> { \
> last = x[j]; \
> x[j] = last * value; \
> } \
> return last; \
> }
>
> #define TEST_ALL(T) \
> T (uint8_t) \
>
> TEST_ALL (EXTRACT_LAST)
>
> vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
> vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
> vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
> MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
> _21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
>
> This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
>
> >> So, why do we test for get_len_load_store_mode and not just for
> >> VEC_EXTRACT?
>
> Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
>
> Here is the example:
> https://godbolt.org/z/8cTv1jqMb
>
> ARM SVE IR:
>
> <bb 4> [local count: 955630224]:
> # ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
>
> # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
>
> _7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
> vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
> _1 = (unsigned int) ivtmp_32;
>
> next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
>
> if (next_mask_35 != { 0, ... })
> goto <bb 4>; [89.00%]
> else
> goto <bb 5>; [11.00%]
>
> <bb 5> [local count: 105119324]:
>
> _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
>
> So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
>
> For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
>
> 1. Target is using loop MASK as the partial vector loop control.
I don't think it checks for this?
> 2. extract_last optab is enabled in the backend.
>
> So for RVV, we are also checking same conditions:
>
> 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
But we don't really know this at this point? The only thing we know
is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
> 2. vec_extract optab is enabled in the backend.
>
> An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
I think it should work to change the direct_internal_fn_supported_p
check for IFN_EXTRACT_LAST to a "poitive" one guarding
gcc_assert (ncopies == 1 && !slp_node);
vect_record_loop_mask (loop_vinfo,
&LOOP_VINFO_MASKS (loop_vinfo),
1, vectype, NULL);
and in the else branch check for VEC_EXTRACT support and if present
record a loop len. Just in this case this particular order would
be important.
> >> can we double-check this on powerpc and s390?
>
> Sure, I hope it can be beneficial to powerpc and s390.
> And, I think Richard's comments are also very important so I am gonna wait for it.
Yeah, just to double-check the bias stuff works correctly.
Richard.
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-08-11 15:01
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford; linkw; krebbel
> Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> >
> > Hi, Richard and Richi.
> >
> > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> >
> > Consider this following case:
> >
> > #include <stdint.h>
> >
> > #define EXTRACT_LAST(TYPE) \
> > TYPE __attribute__ ((noinline, noclone)) \
> > test_##TYPE (TYPE *x, int n, TYPE value) \
> > { \
> > TYPE last; \
> > for (int j = 0; j < n; ++j) \
> > { \
> > last = x[j]; \
> > x[j] = last * value; \
> > } \
> > return last; \
> > }
> >
> > #define TEST_ALL(T) \
> > T (uint8_t) \
> >
> > TEST_ALL (EXTRACT_LAST)
> >
> > ARM SVE IR:
> >
> > Preheader:
> > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> >
> > Loop:
> > ...
> > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > ...
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > ...
> > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > ...
> >
> > Epilogue:
> > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> >
> > For RVV since we prefer len in loop control, after this patch for RVV:
> >
> > Loop:
> > ...
> > loop_len_22 = SELECT_VL;
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > ...
> >
> > Epilogue:
> > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> >
> > Details of this approach:
> >
> > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> > for LEN loop control.
> >
> > This function we check whether target support:
> > - Use LEN as the loop control.
> > - Support VEC_EXTRACT optab.
> >
> > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> >
> > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> >
> > The only difference between mask and len is that len is using length generated by SELECT_VL and
> > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> >
> > Bootstrap and Regression on X86 passed.
> >
> > Tested on ARM QEMU.
> >
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > (vectorizable_live_operation): Add loop len control.
> >
> > ---
> > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 70 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index bf8d677b584..809b73b966c 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > }
> >
> > +/* Return true if target supports extract last vectorization with LEN. */
> > +
> > +static bool
> > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > +{
> > + /* Return false if target doesn't support LEN in loop control. */
> > + machine_mode vmode;
> > + machine_mode vec_mode = TYPE_MODE (vectype);
> > + if (!VECTOR_MODE_P (vec_mode))
> > + return false;
> > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> > + return false;
>
> So this "hidden" bit in the end decides whether to ...
>
> > + /* Target need to support VEC_EXTRACT to extract the last active element. */
> > + return convert_optab_handler (vec_extract_optab,
> > + vec_mode,
> > + TYPE_MODE (TREE_TYPE (vectype)))
> > + != CODE_FOR_nothing;
> > +}
> > +
> > /* Create vector init for vectorized iv. */
> > static tree
> > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > {
> > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > - OPTIMIZE_FOR_SPEED))
> > + OPTIMIZE_FOR_SPEED)
> > + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > {
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > else
> > {
> > gcc_assert (ncopies == 1 && !slp_node);
> > - vect_record_loop_mask (loop_vinfo,
> > - &LOOP_VINFO_MASKS (loop_vinfo),
> > - 1, vectype, NULL);
> > + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > + vect_record_loop_len (loop_vinfo,
> > + &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 1);
>
> .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
> well but of course .VEC_EXTRACT support itself doesn't have anything to
> do with 'len' support.
>
> x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
> its partial vector support still wants masks, not lens (and once
> we record both we fail).
>
> So how can we resolve the issue when a non-VL operation like
> .VEC_EXTRACT is used for _len support?
>
> Note x86 doens't yet support IFN_EXTRACT_LAST either.
>
> So, why do we test for get_len_load_store_mode and not just for
> VEC_EXTRACT?
>
> > + else
> > + vect_record_loop_mask (loop_vinfo,
> > + &LOOP_VINFO_MASKS (loop_vinfo),
> > + 1, vectype, NULL);
> > }
> > }
> > /* ??? Enable for loop costing as well. */
> > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > gimple *vec_stmt;
> > if (slp_node)
> > {
> > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > + gcc_assert (!loop_vinfo
> > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> >
> > /* Get the correct slp vectorized stmt. */
> > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> >
> > gimple_seq stmts = NULL;
> > tree new_tree;
> > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > + {
> > + /* Emit:
> > +
> > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > +
> > + where VEC_LHS is the vectorized live-out result and MASK is
> > + the loop mask for the final iteration. */
> > + gcc_assert (ncopies == 1 && !slp_node);
> > + gimple_seq tem = NULL;
> > + gimple_stmt_iterator gsi = gsi_last (tem);
> > + tree len
> > + = vect_get_loop_len (loop_vinfo, &gsi,
> > + &LOOP_VINFO_LENS (loop_vinfo),
> > + 1, vectype, 0, 0);
> > +
> > + /* BIAS - 1. */
> > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + tree bias_minus_one
> > + = int_const_binop (MINUS_EXPR,
> > + build_int_cst (TREE_TYPE (len), biasval),
> > + build_one_cst (TREE_TYPE (len)));
> > +
> > + /* LAST_INDEX = LEN + (BIAS - 1). */
> > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > + len, bias_minus_one);
> > +
> > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> > + tree scalar_res
> > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > + vec_lhs_phi, last_index);
> > +
>
> can we double-check this on powerpc and s390?
>
> Thanks,
> Richard.
>
> > + /* Convert the extracted vector element to the scalar type. */
> > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > + }
> > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > {
> > /* Emit:
> >
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> > 1. Target is using loop MASK as the partial vector loop control.
> >> I don't think it checks for this?
>
> I am not sure whether I understand EXTRACT_LAST correctly.
> But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST?
> Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand.
>
> > But we don't really know this at this point? The only thing we know
> > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
>
> Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control.
>
> Well, I admit it's not a good idea.
>
>
> > I think it should work to change the direct_internal_fn_supported_p
> > check for IFN_EXTRACT_LAST to a "poitive" one guarding
>
> > gcc_assert (ncopies == 1 && !slp_node);
> > vect_record_loop_mask (loop_vinfo,
> > &LOOP_VINFO_MASKS (loop_vinfo),
> > 1, vectype, NULL);
>
> > and in the else branch check for VEC_EXTRACT support and if present
> > record a loop len. Just in this case this particular order would
> > be important.
>
> Do you mean change the codes as follows :?
>
> - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> - OPTIMIZE_FOR_SPEED))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> - }
> - else if (slp_node)
> if (slp_node)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors "
> "because an SLP statement is live after "
> "the loop.\n");
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> }
> else if (ncopies > 1)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors "
> "because ncopies is greater than 1.\n");
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> }
> else
> {
> gcc_assert (ncopies == 1 && !slp_node);
> if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> OPTIMIZE_FOR_SPEED))
> vect_record_loop_mask (loop_vinfo,
> &LOOP_VINFO_MASKS (loop_vinfo),
> 1, vectype, NULL);
> else
check here the target supports VEC_EXTRACT
> vect_record_loop_len (loop_vinfo,
> &LOOP_VINFO_LENS (loop_vinfo),
> 1, vectype, 1);
else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
diagnostic.
> }
>
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-08-11 18:21
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; linkw; krebbel
> Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
> > Hi, Richi.
> >
> > >> So how can we resolve the issue when a non-VL operation like
> > >> .VEC_EXTRACT is used for _len support?
> >
> > Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
> > If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
> >
> > https://godbolt.org/z/cqrWrY8q4
> >
> > #define EXTRACT_LAST(TYPE) \
> > TYPE __attribute__ ((noinline, noclone)) \
> > test_##TYPE (TYPE *x, int n, TYPE value) \
> > { \
> > TYPE last; \
> > for (int j = 0; j < 64; ++j) \
> > { \
> > last = x[j]; \
> > x[j] = last * value; \
> > } \
> > return last; \
> > }
> >
> > #define TEST_ALL(T) \
> > T (uint8_t) \
> >
> > TEST_ALL (EXTRACT_LAST)
> >
> > vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
> > vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
> > vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
> > MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
> > _21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
> >
> > This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
> >
> > >> So, why do we test for get_len_load_store_mode and not just for
> > >> VEC_EXTRACT?
> >
> > Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
> >
> > Here is the example:
> > https://godbolt.org/z/8cTv1jqMb
> >
> > ARM SVE IR:
> >
> > <bb 4> [local count: 955630224]:
> > # ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
> >
> > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
> >
> > _7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
> > _1 = (unsigned int) ivtmp_32;
> >
> > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >
> > if (next_mask_35 != { 0, ... })
> > goto <bb 4>; [89.00%]
> > else
> > goto <bb 5>; [11.00%]
> >
> > <bb 5> [local count: 105119324]:
> >
> > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
> >
> > So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
> >
> > For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
> >
> > 1. Target is using loop MASK as the partial vector loop control.
>
> I don't think it checks for this?
>
> > 2. extract_last optab is enabled in the backend.
> >
> > So for RVV, we are also checking same conditions:
> >
> > 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
>
> But we don't really know this at this point? The only thing we know
> is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
>
> > 2. vec_extract optab is enabled in the backend.
> >
> > An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
>
> I think it should work to change the direct_internal_fn_supported_p
> check for IFN_EXTRACT_LAST to a "poitive" one guarding
>
> gcc_assert (ncopies == 1 && !slp_node);
> vect_record_loop_mask (loop_vinfo,
> &LOOP_VINFO_MASKS (loop_vinfo),
> 1, vectype, NULL);
>
> and in the else branch check for VEC_EXTRACT support and if present
> record a loop len. Just in this case this particular order would
> be important.
>
> > >> can we double-check this on powerpc and s390?
> >
> > Sure, I hope it can be beneficial to powerpc and s390.
> > And, I think Richard's comments are also very important so I am gonna wait for it.
>
> Yeah, just to double-check the bias stuff works correctly.
>
> Richard.
>
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Richard Biener
> > Date: 2023-08-11 15:01
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford; linkw; krebbel
> > Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > >
> > > Hi, Richard and Richi.
> > >
> > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> > >
> > > Consider this following case:
> > >
> > > #include <stdint.h>
> > >
> > > #define EXTRACT_LAST(TYPE) \
> > > TYPE __attribute__ ((noinline, noclone)) \
> > > test_##TYPE (TYPE *x, int n, TYPE value) \
> > > { \
> > > TYPE last; \
> > > for (int j = 0; j < n; ++j) \
> > > { \
> > > last = x[j]; \
> > > x[j] = last * value; \
> > > } \
> > > return last; \
> > > }
> > >
> > > #define TEST_ALL(T) \
> > > T (uint8_t) \
> > >
> > > TEST_ALL (EXTRACT_LAST)
> > >
> > > ARM SVE IR:
> > >
> > > Preheader:
> > > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > >
> > > Loop:
> > > ...
> > > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > > ...
> > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > > ...
> > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > > ...
> > >
> > > Epilogue:
> > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > >
> > > For RVV since we prefer len in loop control, after this patch for RVV:
> > >
> > > Loop:
> > > ...
> > > loop_len_22 = SELECT_VL;
> > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > > ...
> > >
> > > Epilogue:
> > > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> > >
> > > Details of this approach:
> > >
> > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> > > for LEN loop control.
> > >
> > > This function we check whether target support:
> > > - Use LEN as the loop control.
> > > - Support VEC_EXTRACT optab.
> > >
> > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > >
> > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> > >
> > > The only difference between mask and len is that len is using length generated by SELECT_VL and
> > > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> > >
> > > Bootstrap and Regression on X86 passed.
> > >
> > > Tested on ARM QEMU.
> > >
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > > (vectorizable_live_operation): Add loop len control.
> > >
> > > ---
> > > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> > > 1 file changed, 70 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index bf8d677b584..809b73b966c 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > > }
> > >
> > > +/* Return true if target supports extract last vectorization with LEN. */
> > > +
> > > +static bool
> > > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > > +{
> > > + /* Return false if target doesn't support LEN in loop control. */
> > > + machine_mode vmode;
> > > + machine_mode vec_mode = TYPE_MODE (vectype);
> > > + if (!VECTOR_MODE_P (vec_mode))
> > > + return false;
> > > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> > > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> > > + return false;
> >
> > So this "hidden" bit in the end decides whether to ...
> >
> > > + /* Target need to support VEC_EXTRACT to extract the last active element. */
> > > + return convert_optab_handler (vec_extract_optab,
> > > + vec_mode,
> > > + TYPE_MODE (TREE_TYPE (vectype)))
> > > + != CODE_FOR_nothing;
> > > +}
> > > +
> > > /* Create vector init for vectorized iv. */
> > > static tree
> > > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > > {
> > > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > > - OPTIMIZE_FOR_SPEED))
> > > + OPTIMIZE_FOR_SPEED)
> > > + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > > {
> > > if (dump_enabled_p ())
> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > else
> > > {
> > > gcc_assert (ncopies == 1 && !slp_node);
> > > - vect_record_loop_mask (loop_vinfo,
> > > - &LOOP_VINFO_MASKS (loop_vinfo),
> > > - 1, vectype, NULL);
> > > + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > > + vect_record_loop_len (loop_vinfo,
> > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 1);
> >
> > .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
> > well but of course .VEC_EXTRACT support itself doesn't have anything to
> > do with 'len' support.
> >
> > x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
> > its partial vector support still wants masks, not lens (and once
> > we record both we fail).
> >
> > So how can we resolve the issue when a non-VL operation like
> > .VEC_EXTRACT is used for _len support?
> >
> > Note x86 doens't yet support IFN_EXTRACT_LAST either.
> >
> > So, why do we test for get_len_load_store_mode and not just for
> > VEC_EXTRACT?
> >
> > > + else
> > > + vect_record_loop_mask (loop_vinfo,
> > > + &LOOP_VINFO_MASKS (loop_vinfo),
> > > + 1, vectype, NULL);
> > > }
> > > }
> > > /* ??? Enable for loop costing as well. */
> > > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > gimple *vec_stmt;
> > > if (slp_node)
> > > {
> > > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > + gcc_assert (!loop_vinfo
> > > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> > >
> > > /* Get the correct slp vectorized stmt. */
> > > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > >
> > > gimple_seq stmts = NULL;
> > > tree new_tree;
> > > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > > + {
> > > + /* Emit:
> > > +
> > > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > > +
> > > + where VEC_LHS is the vectorized live-out result and MASK is
> > > + the loop mask for the final iteration. */
> > > + gcc_assert (ncopies == 1 && !slp_node);
> > > + gimple_seq tem = NULL;
> > > + gimple_stmt_iterator gsi = gsi_last (tem);
> > > + tree len
> > > + = vect_get_loop_len (loop_vinfo, &gsi,
> > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 0, 0);
> > > +
> > > + /* BIAS - 1. */
> > > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > + tree bias_minus_one
> > > + = int_const_binop (MINUS_EXPR,
> > > + build_int_cst (TREE_TYPE (len), biasval),
> > > + build_one_cst (TREE_TYPE (len)));
> > > +
> > > + /* LAST_INDEX = LEN + (BIAS - 1). */
> > > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > > + len, bias_minus_one);
> > > +
> > > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> > > + tree scalar_res
> > > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > > + vec_lhs_phi, last_index);
> > > +
> >
> > can we double-check this on powerpc and s390?
> >
> > Thanks,
> > Richard.
> >
> > > + /* Convert the extracted vector element to the scalar type. */
> > > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > > + }
> > > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > {
> > > /* Emit:
> > >
> >
> >
>
>
Hi, Richi.
>> check here the target supports VEC_EXTRACT
>> else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
>> diagnostic.
I am wondering target has VEC_EXTRACT but no EXTRACT_LAST, and such target is using MASK as the loop control.
It seems that it will cause ICE for such target ? (Not sure whether we currently have such target so far).
Thanks.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-08-11 19:10
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford; linkw; krebbel
Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> > 1. Target is using loop MASK as the partial vector loop control.
> >> I don't think it checks for this?
>
> I am not sure whether I understand EXTRACT_LAST correctly.
> But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST?
> Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand.
>
> > But we don't really know this at this point? The only thing we know
> > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
>
> Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control.
>
> Well, I admit it's not a good idea.
>
>
> > I think it should work to change the direct_internal_fn_supported_p
> > check for IFN_EXTRACT_LAST to a "poitive" one guarding
>
> > gcc_assert (ncopies == 1 && !slp_node);
> > vect_record_loop_mask (loop_vinfo,
> > &LOOP_VINFO_MASKS (loop_vinfo),
> > 1, vectype, NULL);
>
> > and in the else branch check for VEC_EXTRACT support and if present
> > record a loop len. Just in this case this particular order would
> > be important.
>
> Do you mean change the codes as follows :?
>
> - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> - OPTIMIZE_FOR_SPEED))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "can't operate on partial vectors "
> - "because the target doesn't support extract "
> - "last reduction.\n");
> - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> - }
> - else if (slp_node)
> if (slp_node)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors "
> "because an SLP statement is live after "
> "the loop.\n");
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> }
> else if (ncopies > 1)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors "
> "because ncopies is greater than 1.\n");
> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> }
> else
> {
> gcc_assert (ncopies == 1 && !slp_node);
> if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> OPTIMIZE_FOR_SPEED))
> vect_record_loop_mask (loop_vinfo,
> &LOOP_VINFO_MASKS (loop_vinfo),
> 1, vectype, NULL);
> else
check here the target supports VEC_EXTRACT
> vect_record_loop_len (loop_vinfo,
> &LOOP_VINFO_LENS (loop_vinfo),
> 1, vectype, 1);
else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
diagnostic.
> }
>
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-08-11 18:21
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; linkw; krebbel
> Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
> > Hi, Richi.
> >
> > >> So how can we resolve the issue when a non-VL operation like
> > >> .VEC_EXTRACT is used for _len support?
> >
> > Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
> > If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
> >
> > https://godbolt.org/z/cqrWrY8q4
> >
> > #define EXTRACT_LAST(TYPE) \
> > TYPE __attribute__ ((noinline, noclone)) \
> > test_##TYPE (TYPE *x, int n, TYPE value) \
> > { \
> > TYPE last; \
> > for (int j = 0; j < 64; ++j) \
> > { \
> > last = x[j]; \
> > x[j] = last * value; \
> > } \
> > return last; \
> > }
> >
> > #define TEST_ALL(T) \
> > T (uint8_t) \
> >
> > TEST_ALL (EXTRACT_LAST)
> >
> > vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
> > vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
> > vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
> > MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
> > _21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
> >
> > This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
> >
> > >> So, why do we test for get_len_load_store_mode and not just for
> > >> VEC_EXTRACT?
> >
> > Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
> >
> > Here is the example:
> > https://godbolt.org/z/8cTv1jqMb
> >
> > ARM SVE IR:
> >
> > <bb 4> [local count: 955630224]:
> > # ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
> >
> > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
> >
> > _7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
> > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
> > _1 = (unsigned int) ivtmp_32;
> >
> > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> >
> > if (next_mask_35 != { 0, ... })
> > goto <bb 4>; [89.00%]
> > else
> > goto <bb 5>; [11.00%]
> >
> > <bb 5> [local count: 105119324]:
> >
> > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
> >
> > So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
> >
> > For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
> >
> > 1. Target is using loop MASK as the partial vector loop control.
>
> I don't think it checks for this?
>
> > 2. extract_last optab is enabled in the backend.
> >
> > So for RVV, we are also checking same conditions:
> >
> > 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
>
> But we don't really know this at this point? The only thing we know
> is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
>
> > 2. vec_extract optab is enabled in the backend.
> >
> > An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
>
> I think it should work to change the direct_internal_fn_supported_p
> check for IFN_EXTRACT_LAST to a "poitive" one guarding
>
> gcc_assert (ncopies == 1 && !slp_node);
> vect_record_loop_mask (loop_vinfo,
> &LOOP_VINFO_MASKS (loop_vinfo),
> 1, vectype, NULL);
>
> and in the else branch check for VEC_EXTRACT support and if present
> record a loop len. Just in this case this particular order would
> be important.
>
> > >> can we double-check this on powerpc and s390?
> >
> > Sure, I hope it can be beneficial to powerpc and s390.
> > And, I think Richard's comments are also very important so I am gonna wait for it.
>
> Yeah, just to double-check the bias stuff works correctly.
>
> Richard.
>
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Richard Biener
> > Date: 2023-08-11 15:01
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford; linkw; krebbel
> > Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > >
> > > Hi, Richard and Richi.
> > >
> > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> > >
> > > Consider this following case:
> > >
> > > #include <stdint.h>
> > >
> > > #define EXTRACT_LAST(TYPE) \
> > > TYPE __attribute__ ((noinline, noclone)) \
> > > test_##TYPE (TYPE *x, int n, TYPE value) \
> > > { \
> > > TYPE last; \
> > > for (int j = 0; j < n; ++j) \
> > > { \
> > > last = x[j]; \
> > > x[j] = last * value; \
> > > } \
> > > return last; \
> > > }
> > >
> > > #define TEST_ALL(T) \
> > > T (uint8_t) \
> > >
> > > TEST_ALL (EXTRACT_LAST)
> > >
> > > ARM SVE IR:
> > >
> > > Preheader:
> > > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > >
> > > Loop:
> > > ...
> > > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > > ...
> > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > > ...
> > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > > ...
> > >
> > > Epilogue:
> > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > >
> > > For RVV since we prefer len in loop control, after this patch for RVV:
> > >
> > > Loop:
> > > ...
> > > loop_len_22 = SELECT_VL;
> > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > > ...
> > >
> > > Epilogue:
> > > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> > >
> > > Details of this approach:
> > >
> > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> > > for LEN loop control.
> > >
> > > This function we check whether target support:
> > > - Use LEN as the loop control.
> > > - Support VEC_EXTRACT optab.
> > >
> > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > >
> > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> > >
> > > The only difference between mask and len is that len is using length generated by SELECT_VL and
> > > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> > >
> > > Bootstrap and Regression on X86 passed.
> > >
> > > Tested on ARM QEMU.
> > >
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > > (vectorizable_live_operation): Add loop len control.
> > >
> > > ---
> > > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> > > 1 file changed, 70 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > index bf8d677b584..809b73b966c 100644
> > > --- a/gcc/tree-vect-loop.cc
> > > +++ b/gcc/tree-vect-loop.cc
> > > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > > }
> > >
> > > +/* Return true if target supports extract last vectorization with LEN. */
> > > +
> > > +static bool
> > > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > > +{
> > > + /* Return false if target doesn't support LEN in loop control. */
> > > + machine_mode vmode;
> > > + machine_mode vec_mode = TYPE_MODE (vectype);
> > > + if (!VECTOR_MODE_P (vec_mode))
> > > + return false;
> > > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> > > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> > > + return false;
> >
> > So this "hidden" bit in the end decides whether to ...
> >
> > > + /* Target need to support VEC_EXTRACT to extract the last active element. */
> > > + return convert_optab_handler (vec_extract_optab,
> > > + vec_mode,
> > > + TYPE_MODE (TREE_TYPE (vectype)))
> > > + != CODE_FOR_nothing;
> > > +}
> > > +
> > > /* Create vector init for vectorized iv. */
> > > static tree
> > > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > > {
> > > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > > - OPTIMIZE_FOR_SPEED))
> > > + OPTIMIZE_FOR_SPEED)
> > > + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > > {
> > > if (dump_enabled_p ())
> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > else
> > > {
> > > gcc_assert (ncopies == 1 && !slp_node);
> > > - vect_record_loop_mask (loop_vinfo,
> > > - &LOOP_VINFO_MASKS (loop_vinfo),
> > > - 1, vectype, NULL);
> > > + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > > + vect_record_loop_len (loop_vinfo,
> > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 1);
> >
> > .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
> > well but of course .VEC_EXTRACT support itself doesn't have anything to
> > do with 'len' support.
> >
> > x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
> > its partial vector support still wants masks, not lens (and once
> > we record both we fail).
> >
> > So how can we resolve the issue when a non-VL operation like
> > .VEC_EXTRACT is used for _len support?
> >
> > Note x86 doens't yet support IFN_EXTRACT_LAST either.
> >
> > So, why do we test for get_len_load_store_mode and not just for
> > VEC_EXTRACT?
> >
> > > + else
> > > + vect_record_loop_mask (loop_vinfo,
> > > + &LOOP_VINFO_MASKS (loop_vinfo),
> > > + 1, vectype, NULL);
> > > }
> > > }
> > > /* ??? Enable for loop costing as well. */
> > > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > gimple *vec_stmt;
> > > if (slp_node)
> > > {
> > > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > + gcc_assert (!loop_vinfo
> > > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> > >
> > > /* Get the correct slp vectorized stmt. */
> > > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > >
> > > gimple_seq stmts = NULL;
> > > tree new_tree;
> > > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > > + {
> > > + /* Emit:
> > > +
> > > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > > +
> > > + where VEC_LHS is the vectorized live-out result and MASK is
> > > + the loop mask for the final iteration. */
> > > + gcc_assert (ncopies == 1 && !slp_node);
> > > + gimple_seq tem = NULL;
> > > + gimple_stmt_iterator gsi = gsi_last (tem);
> > > + tree len
> > > + = vect_get_loop_len (loop_vinfo, &gsi,
> > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > + 1, vectype, 0, 0);
> > > +
> > > + /* BIAS - 1. */
> > > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > + tree bias_minus_one
> > > + = int_const_binop (MINUS_EXPR,
> > > + build_int_cst (TREE_TYPE (len), biasval),
> > > + build_one_cst (TREE_TYPE (len)));
> > > +
> > > + /* LAST_INDEX = LEN + (BIAS - 1). */
> > > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > > + len, bias_minus_one);
> > > +
> > > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> > > + tree scalar_res
> > > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > > + vec_lhs_phi, last_index);
> > > +
> >
> > can we double-check this on powerpc and s390?
> >
> > Thanks,
> > Richard.
> >
> > > + /* Convert the extracted vector element to the scalar type. */
> > > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > > + }
> > > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > {
> > > /* Emit:
> > >
> >
> >
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> >> check here the target supports VEC_EXTRACT
> >> else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
> >> diagnostic.
>
> I am wondering target has VEC_EXTRACT but no EXTRACT_LAST, and such
> target is using MASK as the loop control. It seems that it will cause
> ICE for such target ? (Not sure whether we currently have such target so
> far).
No, we'd record a length in that case and give up on partial vectors
because we'd have both length and masks recorded in the end.
Richard.
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-08-11 19:10
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford; linkw; krebbel
> Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
> > Hi, Richi.
> >
> > > 1. Target is using loop MASK as the partial vector loop control.
> > >> I don't think it checks for this?
> >
> > I am not sure whether I understand EXTRACT_LAST correctly.
> > But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST?
> > Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand.
> >
> > > But we don't really know this at this point? The only thing we know
> > > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
> >
> > Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control.
> >
> > Well, I admit it's not a good idea.
> >
> >
> > > I think it should work to change the direct_internal_fn_supported_p
> > > check for IFN_EXTRACT_LAST to a "poitive" one guarding
> >
> > > gcc_assert (ncopies == 1 && !slp_node);
> > > vect_record_loop_mask (loop_vinfo,
> > > &LOOP_VINFO_MASKS (loop_vinfo),
> > > 1, vectype, NULL);
> >
> > > and in the else branch check for VEC_EXTRACT support and if present
> > > record a loop len. Just in this case this particular order would
> > > be important.
> >
> > Do you mean change the codes as follows :?
> >
> > - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > - OPTIMIZE_FOR_SPEED))
> > - {
> > - if (dump_enabled_p ())
> > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > - "can't operate on partial vectors "
> > - "because the target doesn't support extract "
> > - "last reduction.\n");
> > - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > - }
> > - else if (slp_node)
> > if (slp_node)
> > {
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > "can't operate on partial vectors "
> > "because an SLP statement is live after "
> > "the loop.\n");
> > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > }
> > else if (ncopies > 1)
> > {
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > "can't operate on partial vectors "
> > "because ncopies is greater than 1.\n");
> > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> > }
> > else
> > {
> > gcc_assert (ncopies == 1 && !slp_node);
> > if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > OPTIMIZE_FOR_SPEED))
> > vect_record_loop_mask (loop_vinfo,
> > &LOOP_VINFO_MASKS (loop_vinfo),
> > 1, vectype, NULL);
> > else
>
> check here the target supports VEC_EXTRACT
>
> > vect_record_loop_len (loop_vinfo,
> > &LOOP_VINFO_LENS (loop_vinfo),
> > 1, vectype, 1);
>
> else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
> diagnostic.
>
> > }
> >
> >
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Richard Biener
> > Date: 2023-08-11 18:21
> > To: juzhe.zhong@rivai.ai
> > CC: gcc-patches; richard.sandiford; linkw; krebbel
> > Subject: Re: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> >
> > > Hi, Richi.
> > >
> > > >> So how can we resolve the issue when a non-VL operation like
> > > >> .VEC_EXTRACT is used for _len support?
> > >
> > > Do you mean non-VL extract last operation (I am sorry that not sure whether I understand your question correctly)?
> > > If yes, the answer is for RVV, we are reusing the same flow as ARM SVE (BIT_FILED_REF approach), see the example below:
> > >
> > > https://godbolt.org/z/cqrWrY8q4
> > >
> > > #define EXTRACT_LAST(TYPE) \
> > > TYPE __attribute__ ((noinline, noclone)) \
> > > test_##TYPE (TYPE *x, int n, TYPE value) \
> > > { \
> > > TYPE last; \
> > > for (int j = 0; j < 64; ++j) \
> > > { \
> > > last = x[j]; \
> > > x[j] = last * value; \
> > > } \
> > > return last; \
> > > }
> > >
> > > #define TEST_ALL(T) \
> > > T (uint8_t) \
> > >
> > > TEST_ALL (EXTRACT_LAST)
> > >
> > > vect_cst__22 = {value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D), value_12(D)};
> > > vect_last_11.6_3 = MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)];
> > > vect__4.7_23 = vect_last_11.6_3 * vect_cst__22;
> > > MEM <vector(64) unsigned char> [(uint8_t *)x_10(D)] = vect__4.7_23;
> > > _21 = BIT_FIELD_REF <vect_last_11.6_3, 8, 504>;
> > >
> > > This approach works perfectly for both RVV and ARM SVE for non-VL and non-MASK EXTRACT_LAST operation.
> > >
> > > >> So, why do we test for get_len_load_store_mode and not just for
> > > >> VEC_EXTRACT?
> > >
> > > Before answer this question, let me first elaborate how ARM SVE is doing with MASK EXTRACT_LAST.
> > >
> > > Here is the example:
> > > https://godbolt.org/z/8cTv1jqMb
> > >
> > > ARM SVE IR:
> > >
> > > <bb 4> [local count: 955630224]:
> > > # ivtmp_31 = PHI <ivtmp_32(4), 0(3)>
> > >
> > > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> -----> For RVV, we want this to be loop_len = SELECT_VL;
> > >
> > > _7 = &MEM <vector([16,16]) unsigned char> [(uint8_t *)x_11(D) + ivtmp_31 * 1];
> > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > > ivtmp_32 = ivtmp_31 + POLY_INT_CST [16, 16];
> > > _1 = (unsigned int) ivtmp_32;
> > >
> > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > >
> > > if (next_mask_35 != { 0, ... })
> > > goto <bb 4>; [89.00%]
> > > else
> > > goto <bb 5>; [11.00%]
> > >
> > > <bb 5> [local count: 105119324]:
> > >
> > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); [tail call] ----> Use the last mask generated in BB 4, so for RVV, we are using the loop_len.
> > >
> > > So this patch is trying to optimize the codegen with simulating same flow as ARM SVE but with replacing 'loop_mask_22' (This is generated in BB4) into 'loop_len'.
> > >
> > > For ARM SVE, they only check whether target support EXTRACT_LAST pattern, this pattern is supported means:
> > >
> > > 1. Target is using loop MASK as the partial vector loop control.
> >
> > I don't think it checks for this?
> >
> > > 2. extract_last optab is enabled in the backend.
> > >
> > > So for RVV, we are also checking same conditions:
> > >
> > > 1. Target is using loop LEN as the partial vector loop control (I use get_len_load_store_mode to check whether target is using loop LEN as the partial vector loop control).
> >
> > But we don't really know this at this point? The only thing we know
> > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
> >
> > > 2. vec_extract optab is enabled in the backend.
> > >
> > > An alternative approach is that we can adding EXTRACT_LAST_LEN internal FN, then we can only check this like ARM SVE only check EXTRACT_LAST.
> >
> > I think it should work to change the direct_internal_fn_supported_p
> > check for IFN_EXTRACT_LAST to a "poitive" one guarding
> >
> > gcc_assert (ncopies == 1 && !slp_node);
> > vect_record_loop_mask (loop_vinfo,
> > &LOOP_VINFO_MASKS (loop_vinfo),
> > 1, vectype, NULL);
> >
> > and in the else branch check for VEC_EXTRACT support and if present
> > record a loop len. Just in this case this particular order would
> > be important.
> >
> > > >> can we double-check this on powerpc and s390?
> > >
> > > Sure, I hope it can be beneficial to powerpc and s390.
> > > And, I think Richard's comments are also very important so I am gonna wait for it.
> >
> > Yeah, just to double-check the bias stuff works correctly.
> >
> > Richard.
> >
> > > Thanks.
> > >
> > >
> > > juzhe.zhong@rivai.ai
> > >
> > > From: Richard Biener
> > > Date: 2023-08-11 15:01
> > > To: Ju-Zhe Zhong
> > > CC: gcc-patches; richard.sandiford; linkw; krebbel
> > > Subject: Re: [PATCH V3] VECT: Support loop len control on EXTRACT_LAST vectorization
> > > On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
> > >
> > > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > > >
> > > > Hi, Richard and Richi.
> > > >
> > > > This patch add support live vectorization by VEC_EXTRACT for LEN loop control.
> > > >
> > > > Consider this following case:
> > > >
> > > > #include <stdint.h>
> > > >
> > > > #define EXTRACT_LAST(TYPE) \
> > > > TYPE __attribute__ ((noinline, noclone)) \
> > > > test_##TYPE (TYPE *x, int n, TYPE value) \
> > > > { \
> > > > TYPE last; \
> > > > for (int j = 0; j < n; ++j) \
> > > > { \
> > > > last = x[j]; \
> > > > x[j] = last * value; \
> > > > } \
> > > > return last; \
> > > > }
> > > >
> > > > #define TEST_ALL(T) \
> > > > T (uint8_t) \
> > > >
> > > > TEST_ALL (EXTRACT_LAST)
> > > >
> > > > ARM SVE IR:
> > > >
> > > > Preheader:
> > > > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... });
> > > >
> > > > Loop:
> > > > ...
> > > > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)>
> > > > ...
> > > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22);
> > > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27);
> > > > ...
> > > > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... });
> > > > ...
> > > >
> > > > Epilogue:
> > > > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23);
> > > >
> > > > For RVV since we prefer len in loop control, after this patch for RVV:
> > > >
> > > > Loop:
> > > > ...
> > > > loop_len_22 = SELECT_VL;
> > > > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22);
> > > > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26;
> > > > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27);
> > > > ...
> > > >
> > > > Epilogue:
> > > > _25 = .VEC_EXTRACT (loop_len_22 + bias - 1, vect_last_12.8_23);
> > > >
> > > > Details of this approach:
> > > >
> > > > 1. Step 1 - Add 'vect_can_vectorize_extract_last_with_len_p' to enable live vectorization
> > > > for LEN loop control.
> > > >
> > > > This function we check whether target support:
> > > > - Use LEN as the loop control.
> > > > - Support VEC_EXTRACT optab.
> > > >
> > > > 2. Step 2 - Record LEN for loop control if 'vect_can_vectorize_extract_last_with_len_p' is true.
> > > >
> > > > 3. Step 3 - Gerenate VEC_EXTRACT (v, LEN + BIAS - 1).
> > > >
> > > > The only difference between mask and len is that len is using length generated by SELECT_VL and
> > > > use VEC_EXTRACT pattern. The rest of the live vectorization is totally the same ARM SVE.
> > > >
> > > > Bootstrap and Regression on X86 passed.
> > > >
> > > > Tested on ARM QEMU.
> > > >
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > * tree-vect-loop.cc (vect_can_vectorize_extract_last_with_len_p): New function.
> > > > (vectorizable_live_operation): Add loop len control.
> > > >
> > > > ---
> > > > gcc/tree-vect-loop.cc | 76 +++++++++++++++++++++++++++++++++++++++----
> > > > 1 file changed, 70 insertions(+), 6 deletions(-)
> > > >
> > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > > > index bf8d677b584..809b73b966c 100644
> > > > --- a/gcc/tree-vect-loop.cc
> > > > +++ b/gcc/tree-vect-loop.cc
> > > > @@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
> > > > && vect_can_vectorize_without_simd_p (tree_code (code)));
> > > > }
> > > >
> > > > +/* Return true if target supports extract last vectorization with LEN. */
> > > > +
> > > > +static bool
> > > > +vect_can_vectorize_extract_last_with_len_p (tree vectype)
> > > > +{
> > > > + /* Return false if target doesn't support LEN in loop control. */
> > > > + machine_mode vmode;
> > > > + machine_mode vec_mode = TYPE_MODE (vectype);
> > > > + if (!VECTOR_MODE_P (vec_mode))
> > > > + return false;
> > > > + if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
> > > > + || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
> > > > + return false;
> > >
> > > So this "hidden" bit in the end decides whether to ...
> > >
> > > > + /* Target need to support VEC_EXTRACT to extract the last active element. */
> > > > + return convert_optab_handler (vec_extract_optab,
> > > > + vec_mode,
> > > > + TYPE_MODE (TREE_TYPE (vectype)))
> > > > + != CODE_FOR_nothing;
> > > > +}
> > > > +
> > > > /* Create vector init for vectorized iv. */
> > > > static tree
> > > > vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
> > > > @@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> > > > {
> > > > if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
> > > > - OPTIMIZE_FOR_SPEED))
> > > > + OPTIMIZE_FOR_SPEED)
> > > > + && !vect_can_vectorize_extract_last_with_len_p (vectype))
> > > > {
> > > > if (dump_enabled_p ())
> > > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > @@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > else
> > > > {
> > > > gcc_assert (ncopies == 1 && !slp_node);
> > > > - vect_record_loop_mask (loop_vinfo,
> > > > - &LOOP_VINFO_MASKS (loop_vinfo),
> > > > - 1, vectype, NULL);
> > > > + if (vect_can_vectorize_extract_last_with_len_p (vectype))
> > > > + vect_record_loop_len (loop_vinfo,
> > > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > > + 1, vectype, 1);
> > >
> > > .. record a loop_len here. I think powerpc at least has .VEC_EXTRACT as
> > > well but of course .VEC_EXTRACT support itself doesn't have anything to
> > > do with 'len' support.
> > >
> > > x86 has .VEC_SET but not yet .VEC_EXTRACT, if it gets .VEC_EXTRACT
> > > its partial vector support still wants masks, not lens (and once
> > > we record both we fail).
> > >
> > > So how can we resolve the issue when a non-VL operation like
> > > .VEC_EXTRACT is used for _len support?
> > >
> > > Note x86 doens't yet support IFN_EXTRACT_LAST either.
> > >
> > > So, why do we test for get_len_load_store_mode and not just for
> > > VEC_EXTRACT?
> > >
> > > > + else
> > > > + vect_record_loop_mask (loop_vinfo,
> > > > + &LOOP_VINFO_MASKS (loop_vinfo),
> > > > + 1, vectype, NULL);
> > > > }
> > > > }
> > > > /* ??? Enable for loop costing as well. */
> > > > @@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > > gimple *vec_stmt;
> > > > if (slp_node)
> > > > {
> > > > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
> > > > + gcc_assert (!loop_vinfo
> > > > + || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> > > > + && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
> > > >
> > > > /* Get the correct slp vectorized stmt. */
> > > > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
> > > > @@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
> > > >
> > > > gimple_seq stmts = NULL;
> > > > tree new_tree;
> > > > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> > > > + {
> > > > + /* Emit:
> > > > +
> > > > + SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> > > > +
> > > > + where VEC_LHS is the vectorized live-out result and MASK is
> > > > + the loop mask for the final iteration. */
> > > > + gcc_assert (ncopies == 1 && !slp_node);
> > > > + gimple_seq tem = NULL;
> > > > + gimple_stmt_iterator gsi = gsi_last (tem);
> > > > + tree len
> > > > + = vect_get_loop_len (loop_vinfo, &gsi,
> > > > + &LOOP_VINFO_LENS (loop_vinfo),
> > > > + 1, vectype, 0, 0);
> > > > +
> > > > + /* BIAS - 1. */
> > > > + signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > > + tree bias_minus_one
> > > > + = int_const_binop (MINUS_EXPR,
> > > > + build_int_cst (TREE_TYPE (len), biasval),
> > > > + build_one_cst (TREE_TYPE (len)));
> > > > +
> > > > + /* LAST_INDEX = LEN + (BIAS - 1). */
> > > > + tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> > > > + len, bias_minus_one);
> > > > +
> > > > + /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
> > > > + tree scalar_res
> > > > + = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> > > > + vec_lhs_phi, last_index);
> > > > +
> > >
> > > can we double-check this on powerpc and s390?
> > >
> > > Thanks,
> > > Richard.
> > >
> > > > + /* Convert the extracted vector element to the scalar type. */
> > > > + new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> > > > + }
> > > > + else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> > > > {
> > > > /* Emit:
> > > >
> > >
> > >
> >
> >
>
>
Richard Biener <rguenther@suse.de> writes:
> On Fri, 11 Aug 2023, juzhe.zhong@rivai.ai wrote:
>
>> Hi, Richi.
>>
>> > 1. Target is using loop MASK as the partial vector loop control.
>> >> I don't think it checks for this?
>>
>> I am not sure whether I understand EXTRACT_LAST correctly.
>> But if target doesn't use loop MASK for partial vector loop control, how does target use EXTRACT_LAST?
>> Since EXTRACT_LAST is always extracting the last element of the vector according to MASK operand.
>>
>> > But we don't really know this at this point? The only thing we know
>> > is that nothing set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false.
>>
>> Yes. So I am try to use 'get_len_load_store' to check whether target support LEN loop control.
>>
>> Well, I admit it's not a good idea.
>>
>>
>> > I think it should work to change the direct_internal_fn_supported_p
>> > check for IFN_EXTRACT_LAST to a "poitive" one guarding
>>
>> > gcc_assert (ncopies == 1 && !slp_node);
>> > vect_record_loop_mask (loop_vinfo,
>> > &LOOP_VINFO_MASKS (loop_vinfo),
>> > 1, vectype, NULL);
>>
>> > and in the else branch check for VEC_EXTRACT support and if present
>> > record a loop len. Just in this case this particular order would
>> > be important.
>>
>> Do you mean change the codes as follows :?
>>
>> - if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>> - OPTIMIZE_FOR_SPEED))
>> - {
>> - if (dump_enabled_p ())
>> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> - "can't operate on partial vectors "
>> - "because the target doesn't support extract "
>> - "last reduction.\n");
>> - LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>> - }
>> - else if (slp_node)
>> if (slp_node)
>> {
>> if (dump_enabled_p ())
>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> "can't operate on partial vectors "
>> "because an SLP statement is live after "
>> "the loop.\n");
>> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>> }
>> else if (ncopies > 1)
>> {
>> if (dump_enabled_p ())
>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> "can't operate on partial vectors "
>> "because ncopies is greater than 1.\n");
>> LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>> }
>> else
>> {
>> gcc_assert (ncopies == 1 && !slp_node);
>> if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
>> OPTIMIZE_FOR_SPEED))
>> vect_record_loop_mask (loop_vinfo,
>> &LOOP_VINFO_MASKS (loop_vinfo),
>> 1, vectype, NULL);
>> else
>
> check here the target supports VEC_EXTRACT
>
>> vect_record_loop_len (loop_vinfo,
>> &LOOP_VINFO_LENS (loop_vinfo),
>> 1, vectype, 1);
>
> else set LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P to false with a
> diagnostic.
I agree with all this FWIW. That is, the check should be based on
.VEC_EXTRACT alone, but .EXTRACT_LAST should take priority (not least
because SVE provides both .VEC_EXTRACT and .EXTRACT_LAST).
Thanks,
Richard
@@ -8963,6 +8963,27 @@ vect_can_vectorize_without_simd_p (code_helper code)
&& vect_can_vectorize_without_simd_p (tree_code (code)));
}
+/* Return true if target supports extract last vectorization with LEN. */
+
+static bool
+vect_can_vectorize_extract_last_with_len_p (tree vectype)
+{
+ /* Return false if target doesn't support LEN in loop control. */
+ machine_mode vmode;
+ machine_mode vec_mode = TYPE_MODE (vectype);
+ if (!VECTOR_MODE_P (vec_mode))
+ return false;
+ if (!get_len_load_store_mode (vec_mode, true).exists (&vmode)
+ || !get_len_load_store_mode (vec_mode, false).exists (&vmode))
+ return false;
+
+ /* Target need to support VEC_EXTRACT to extract the last active element. */
+ return convert_optab_handler (vec_extract_optab,
+ vec_mode,
+ TYPE_MODE (TREE_TYPE (vectype)))
+ != CODE_FOR_nothing;
+}
+
/* Create vector init for vectorized iv. */
static tree
vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
@@ -10279,7 +10300,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
- OPTIMIZE_FOR_SPEED))
+ OPTIMIZE_FOR_SPEED)
+ && !vect_can_vectorize_extract_last_with_len_p (vectype))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -10308,9 +10330,14 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
else
{
gcc_assert (ncopies == 1 && !slp_node);
- vect_record_loop_mask (loop_vinfo,
- &LOOP_VINFO_MASKS (loop_vinfo),
- 1, vectype, NULL);
+ if (vect_can_vectorize_extract_last_with_len_p (vectype))
+ vect_record_loop_len (loop_vinfo,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 1);
+ else
+ vect_record_loop_mask (loop_vinfo,
+ &LOOP_VINFO_MASKS (loop_vinfo),
+ 1, vectype, NULL);
}
}
/* ??? Enable for loop costing as well. */
@@ -10336,7 +10363,9 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
gimple *vec_stmt;
if (slp_node)
{
- gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+ gcc_assert (!loop_vinfo
+ || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
/* Get the correct slp vectorized stmt. */
vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
@@ -10380,7 +10409,42 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
gimple_seq stmts = NULL;
tree new_tree;
- if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+ /* Emit:
+
+ SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
+
+ where VEC_LHS is the vectorized live-out result and MASK is
+ the loop mask for the final iteration. */
+ gcc_assert (ncopies == 1 && !slp_node);
+ gimple_seq tem = NULL;
+ gimple_stmt_iterator gsi = gsi_last (tem);
+ tree len
+ = vect_get_loop_len (loop_vinfo, &gsi,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ 1, vectype, 0, 0);
+
+ /* BIAS - 1. */
+ signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ tree bias_minus_one
+ = int_const_binop (MINUS_EXPR,
+ build_int_cst (TREE_TYPE (len), biasval),
+ build_one_cst (TREE_TYPE (len)));
+
+ /* LAST_INDEX = LEN + (BIAS - 1). */
+ tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
+ len, bias_minus_one);
+
+ /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
+ tree scalar_res
+ = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
+ vec_lhs_phi, last_index);
+
+ /* Convert the extracted vector element to the scalar type. */
+ new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+ }
+ else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
/* Emit: