VECT: Apply LEN_FOLD_EXTRACT_LAST into loop vectorizer
Commit Message
Hi.
This patch applies LEN_FOLD_EXTRACT_LAST in the loop vectorizer.
Consider the following case:
#include <stdint.h>
#define N 32
/* Simple condition reduction. */
int __attribute__ ((noinline, noclone))
condition_reduction (int *a, int min_v)
{
int last = 66; /* High start value. */
for (int i = 0; i < N; i++)
if (a[i] < min_v)
last = i;
return last;
}
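For clarity, a minimal driver (hypothetical, not part of the patch or of the
GCC testsuite, compiled together with the function above) shows the expected
scalar results: the last index that satisfies the condition wins, and the
start value 66 is returned when no element matches.

#include <stdio.h>

int
main (void)
{
  int a[N];
  for (int i = 0; i < N; i++)
    a[i] = i;

  /* Indices 0..9 satisfy a[i] < 10; the last matching index is 9.  */
  printf ("%d\n", condition_reduction (a, 10));  /* prints 9 */

  /* No element is < 0, so the start value 66 survives.  */
  printf ("%d\n", condition_reduction (a, 0));   /* prints 66 */

  return 0;
}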
With this patch, we generate the following IR:
_44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
_34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
ivtmp_36 = _44 * 4;
vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);
mask__11.9_41 = vect__4.8_39 < vect_cst__40;
last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
...
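For reference, the semantics of .LEN_FOLD_EXTRACT_LAST can be sketched in
scalar code (an illustrative reading of the internal function, not GCC's
implementation): only the first len + bias lanes are considered, and the
value of the last considered lane whose mask bit is set is returned, falling
back to the else value when no such lane exists.

/* Scalar sketch: ELSE_VAL corresponds to last_14 above, MASK to
   mask__11.9_41, VEC to vect_vec_iv_.5_33, LEN to _44 and BIAS to the
   trailing 0 operand.  */
static int
len_fold_extract_last_sketch (int else_val, const _Bool *mask,
                              const int *vec, long len, int bias)
{
  int result = else_val;
  for (long i = 0; i < len + bias; i++)
    if (mask[i])
      result = vec[i];
  return result;
}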
gcc/ChangeLog:
* tree-vect-loop.cc (vectorizable_reduction): Apply LEN_FOLD_EXTRACT_LAST.
* tree-vect-stmts.cc (vectorizable_condition): Ditto.
---
gcc/tree-vect-loop.cc | 7 ++++--
gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
2 files changed, 50 insertions(+), 9 deletions(-)
Comments
On Thu, 24 Aug 2023, Juzhe-Zhong wrote:
> Hi.
>
> This patch applies LEN_FOLD_EXTRACT_LAST in the loop vectorizer.
>
> Consider the following case:
> #include <stdint.h>
>
> #define N 32
>
> /* Simple condition reduction. */
>
> int __attribute__ ((noinline, noclone))
> condition_reduction (int *a, int min_v)
> {
> int last = 66; /* High start value. */
>
> for (int i = 0; i < N; i++)
> if (a[i] < min_v)
> last = i;
>
> return last;
> }
>
> With this patch, we generate the following IR:
>
> _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
> _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
> ivtmp_36 = _44 * 4;
> vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);
>
> mask__11.9_41 = vect__4.8_39 < vect_cst__40;
> last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
> ...
LGTM.
Thanks,
Richard.
> gcc/ChangeLog:
>
> * tree-vect-loop.cc (vectorizable_reduction): Apply LEN_FOLD_EXTRACT_LAST.
> * tree-vect-stmts.cc (vectorizable_condition): Ditto.
>
> ---
> gcc/tree-vect-loop.cc | 7 ++++--
> gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
> 2 files changed, 50 insertions(+), 9 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 1cd6c291377..ebee8037e02 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> }
>
> if (reduc_chain_length == 1
> - && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
> - vectype_in, OPTIMIZE_FOR_SPEED))
> + && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
> + OPTIMIZE_FOR_SPEED)
> + || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> + vectype_in,
> + OPTIMIZE_FOR_SPEED)))
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 413a88750d6..be9f3a280bd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo,
> && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> {
> if (reduction_type == EXTRACT_LAST_REDUCTION)
> - vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
> - ncopies * vec_num, vectype, NULL);
> + {
> + if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> + vectype, OPTIMIZE_FOR_SPEED))
> + vect_record_loop_len (loop_vinfo,
> + &LOOP_VINFO_LENS (loop_vinfo),
> + ncopies * vec_num, vectype, 1);
> + else
> + vect_record_loop_mask (loop_vinfo,
> + &LOOP_VINFO_MASKS (loop_vinfo),
> + ncopies * vec_num, vectype, NULL);
> + }
> /* Extra inactive lanes should be safe for vect_nested_cycle. */
> else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
> {
> @@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo,
> mask to the condition, or to its inverse. */
>
> vec_loop_masks *masks = NULL;
> - if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> + vec_loop_lens *lens = NULL;
> + if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> + {
> + if (reduction_type == EXTRACT_LAST_REDUCTION)
> + lens = &LOOP_VINFO_LENS (loop_vinfo);
> + }
> + else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> {
> if (reduction_type == EXTRACT_LAST_REDUCTION)
> masks = &LOOP_VINFO_MASKS (loop_vinfo);
> @@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo,
> /* Force vec_compare to be an SSA_NAME rather than a comparison,
> in cases where that's necessary. */
>
> - if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
> + tree len = NULL_TREE, bias = NULL_TREE;
> + if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
> {
> if (!is_gimple_val (vec_compare))
> {
> @@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo,
> vec_compare = vec_compare_name;
> }
>
> + if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
> + vectype, OPTIMIZE_FOR_SPEED))
> + {
> + if (lens)
> + {
> + len = vect_get_loop_len (loop_vinfo, gsi, lens,
> + vec_num * ncopies, vectype, i, 1);
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + }
> + else
> + {
> + len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> + bias = build_int_cst (intQI_type_node, 0);
> + }
> + }
> if (masks)
> {
> tree loop_mask
> @@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo,
> {
> gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
> tree lhs = gimple_get_lhs (old_stmt);
> - new_stmt = gimple_build_call_internal
> - (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> - vec_then_clause);
> + if (len)
> + new_stmt = gimple_build_call_internal
> + (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
> + vec_then_clause, len, bias);
> + else
> + new_stmt = gimple_build_call_internal
> + (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
> + vec_then_clause);
> gimple_call_set_lhs (new_stmt, lhs);
> SSA_NAME_DEF_STMT (lhs) = new_stmt;
> if (old_stmt == gsi_stmt (*gsi))
>
Committed, thanks Richard.
Pan
-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Richard Biener via Gcc-patches
Sent: Thursday, August 24, 2023 2:39 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: gcc-patches@gcc.gnu.org; richard.sandiford@arm.com
Subject: Re: [PATCH] VECT: Apply LEN_FOLD_EXTRACT_LAST into loop vectorizer
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1cd6c291377..ebee8037e02 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
if (reduc_chain_length == 1
- && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
- vectype_in, OPTIMIZE_FOR_SPEED))
+ && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
+ OPTIMIZE_FOR_SPEED)
+ || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+ vectype_in,
+ OPTIMIZE_FOR_SPEED)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 413a88750d6..be9f3a280bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo,
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
{
if (reduction_type == EXTRACT_LAST_REDUCTION)
- vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
- ncopies * vec_num, vectype, NULL);
+ {
+ if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+ vectype, OPTIMIZE_FOR_SPEED))
+ vect_record_loop_len (loop_vinfo,
+ &LOOP_VINFO_LENS (loop_vinfo),
+ ncopies * vec_num, vectype, 1);
+ else
+ vect_record_loop_mask (loop_vinfo,
+ &LOOP_VINFO_MASKS (loop_vinfo),
+ ncopies * vec_num, vectype, NULL);
+ }
/* Extra inactive lanes should be safe for vect_nested_cycle. */
else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
{
@@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo,
mask to the condition, or to its inverse. */
vec_loop_masks *masks = NULL;
- if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ vec_loop_lens *lens = NULL;
+ if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+ if (reduction_type == EXTRACT_LAST_REDUCTION)
+ lens = &LOOP_VINFO_LENS (loop_vinfo);
+ }
+ else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
{
if (reduction_type == EXTRACT_LAST_REDUCTION)
masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo,
/* Force vec_compare to be an SSA_NAME rather than a comparison,
in cases where that's necessary. */
- if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
+ tree len = NULL_TREE, bias = NULL_TREE;
+ if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
{
if (!is_gimple_val (vec_compare))
{
@@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo,
vec_compare = vec_compare_name;
}
+ if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+ vectype, OPTIMIZE_FOR_SPEED))
+ {
+ if (lens)
+ {
+ len = vect_get_loop_len (loop_vinfo, gsi, lens,
+ vec_num * ncopies, vectype, i, 1);
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ }
+ else
+ {
+ len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+ bias = build_int_cst (intQI_type_node, 0);
+ }
+ }
if (masks)
{
tree loop_mask
@@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo,
{
gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
tree lhs = gimple_get_lhs (old_stmt);
- new_stmt = gimple_build_call_internal
- (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
- vec_then_clause);
+ if (len)
+ new_stmt = gimple_build_call_internal
+ (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
+ vec_then_clause, len, bias);
+ else
+ new_stmt = gimple_build_call_internal
+ (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
+ vec_then_clause);
gimple_call_set_lhs (new_stmt, lhs);
SSA_NAME_DEF_STMT (lhs) = new_stmt;
if (old_stmt == gsi_stmt (*gsi))