[V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
Commit Message
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Hi, Richard and Richi.
Address comments from Richard.
Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
Since:
/* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
on arguments before and after vectorized.
Before vectorized:
LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
After vectorized:
LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
*/
I add "vectorized_p" default argument into internal_fn_mask_index.
So that we could simplify the codes.
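For instance, callers end up looking roughly like this (an illustrative sketch, not part of the diff below; the variable names are made up):
int vec_mask_index = internal_fn_mask_index (ifn);            /* vectorized LEN_MASK_GATHER_LOAD form: index 6.  */
int scalar_mask_index = internal_fn_mask_index (ifn, false);  /* scalar (pre-vectorization) form: index 4.  */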
The len_mask_gather_load/len_mask_scatter_store patterns have already been added; now this patch applies them in the vectorizer.
Here is the example:
void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * 4] = b[i];
    }
}
Gimple IR:
<bb 3> [local count: 105119324]:
_58 = (unsigned long) n_13(D);
<bb 4> [local count: 630715945]:
# vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
# vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
# vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
# ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
_61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
ivtmp_44 = _61 * 4;
vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
mask__24.10_49 = vect__4.9_47 != { 0, ... };
vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
ivtmp_54 = _61 * 16;
.LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
ivtmp_60 = ivtmp_59 - _61;
if (ivtmp_60 != 0)
goto <bb 4>; [83.33%]
else
goto <bb 5>; [16.67%]
gcc/ChangeLog:
* internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
* internal-fn.h (internal_fn_mask_index): Ditto.
* optabs-query.cc (supports_vec_gather_load_p): Ditto.
(supports_vec_scatter_store_p): Ditto.
* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
* tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
(check_load_store_for_partial_vectors): Ditto.
(vect_get_strided_load_store_ops): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
---
gcc/internal-fn.cc | 16 ++++-
gcc/internal-fn.h | 2 +-
gcc/optabs-query.cc | 2 +
gcc/tree-vect-data-refs.cc | 18 ++++-
gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
5 files changed, 150 insertions(+), 23 deletions(-)
Comments
Oh, sorry for the typo in the commit log.
>> Address comments from Richard.
Please change it to "Address comments from Richi." :).
Thanks.
juzhe.zhong@rivai.ai
From: juzhe.zhong
Date: 2023-07-04 17:54
To: gcc-patches
CC: richard.sandiford; rguenther; Ju-Zhe Zhong
Subject: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
Hi, Richard and Richi.
Address comments from Richard.
Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
Since:
/* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
on arguments before and after vectorized.
Before vectorized:
LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
After vectorized:
LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
*/
I add "vectorized_p" default argument into internal_fn_mask_index.
So that we could simplify the codes.
The len_mask_gather_load/len_mask_scatter_store patterns have been added.
Now, this patch applies them into vectorizer.
Here is the example:
void
f (int *restrict a,
   int *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * 4] = b[i];
    }
}
Gimple IR:
<bb 3> [local count: 105119324]:
_58 = (unsigned long) n_13(D);
<bb 4> [local count: 630715945]:
# vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
# vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
# vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
# ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
_61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
ivtmp_44 = _61 * 4;
vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
mask__24.10_49 = vect__4.9_47 != { 0, ... };
vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
ivtmp_54 = _61 * 16;
.LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
ivtmp_60 = ivtmp_59 - _61;
if (ivtmp_60 != 0)
goto <bb 4>; [83.33%]
else
goto <bb 5>; [16.67%]
gcc/ChangeLog:
* internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
* internal-fn.h (internal_fn_mask_index): Ditto.
* optabs-query.cc (supports_vec_gather_load_p): Ditto.
(supports_vec_scatter_store_p): Ditto.
* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
* tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
(check_load_store_for_partial_vectors): Ditto.
(vect_get_strided_load_store_ops): Ditto.
(vectorizable_store): Ditto.
(vectorizable_load): Ditto.
---
gcc/internal-fn.cc | 16 ++++-
gcc/internal-fn.h | 2 +-
gcc/optabs-query.cc | 2 +
gcc/tree-vect-data-refs.cc | 18 ++++-
gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
5 files changed, 150 insertions(+), 23 deletions(-)
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 303df102d81..2c78c870de8 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
otherwise return -1. */
int
-internal_fn_mask_index (internal_fn fn)
+internal_fn_mask_index (internal_fn fn, bool vectoried_p)
{
switch (fn)
{
@@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
case IFN_LEN_MASK_STORE:
return 4;
+ /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
+ on arguments before and after vectorized.
+
+ Before vectorized:
+ LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
+
+ After vectorized:
+ LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
+ */
case IFN_LEN_MASK_GATHER_LOAD:
case IFN_LEN_MASK_SCATTER_STORE:
- return 6;
+ if (vectoried_p)
+ return 6;
+ else
+ return 4;
default:
return (conditional_internal_fn_code (fn) != ERROR_MARK
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 4234bbfed87..e9168c16297 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
extern bool internal_load_fn_p (internal_fn);
extern bool internal_store_fn_p (internal_fn);
extern bool internal_gather_scatter_fn_p (internal_fn);
-extern int internal_fn_mask_index (internal_fn);
+extern int internal_fn_mask_index (internal_fn, bool = true);
extern int internal_fn_len_index (internal_fn);
extern int internal_fn_stored_value_index (internal_fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
this_fn_optabs->supports_vec_gather_load[mode]
= (supports_vec_convert_optab_p (gather_load_optab, mode)
|| supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+ || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
? 1 : -1);
return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
this_fn_optabs->supports_vec_scatter_store[mode]
= (supports_vec_convert_optab_p (scatter_store_optab, mode)
|| supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+ || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
? 1 : -1);
return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..ab2af103cb4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
return false;
/* Work out which function we need. */
- internal_fn ifn, alt_ifn;
+ internal_fn ifn, alt_ifn, alt_ifn2;
if (read_p)
{
ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
alt_ifn = IFN_MASK_GATHER_LOAD;
+ /* When target supports LEN_MASK_GATHER_LOAD, we always
+ use LEN_MASK_GATHER_LOAD regardless whether len and
+ mask are valid or not. */
+ alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
}
else
{
ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
alt_ifn = IFN_MASK_SCATTER_STORE;
+ /* When target supports LEN_MASK_SCATTER_STORE, we always
+ use LEN_MASK_SCATTER_STORE regardless whether len and
+ mask are valid or not. */
+ alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
}
for (;;)
@@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
*offset_vectype_out = offset_vectype;
return true;
}
+ else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
+ memory_type,
+ offset_vectype, scale))
+ {
+ *ifn_out = alt_ifn2;
+ *offset_vectype_out = offset_vectype;
+ return true;
+ }
if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
&& TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a0c39268bf0..33ec33f8b8d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
if (call && gimple_call_internal_p (call))
{
internal_fn ifn = gimple_call_internal_fn (call);
- int mask_index = internal_fn_mask_index (ifn);
+ int mask_index = internal_fn_mask_index (ifn, false);
if (mask_index >= 0
&& use == gimple_call_arg (call, mask_index))
return true;
@@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
gs_info->offset_vectype,
gs_info->scale))
{
+ ifn = (is_load
+ ? IFN_LEN_MASK_GATHER_LOAD
+ : IFN_LEN_MASK_SCATTER_STORE);
+ if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+ gs_info->memory_type,
+ gs_info->offset_vectype,
+ gs_info->scale))
+ {
+ vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+ return;
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"can't operate on partial vectors because"
@@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
static void
vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
loop_vec_info loop_vinfo,
+ gimple_stmt_iterator *gsi,
gather_scatter_info *gs_info,
- tree *dataref_bump, tree *vec_offset)
+ tree *dataref_bump, tree *vec_offset,
+ vec_loop_lens *loop_lens)
{
struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- tree bump = size_binop (MULT_EXPR,
- fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
- size_int (TYPE_VECTOR_SUBPARTS (vectype)));
- *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+ if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
+ {
+ /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
+ ivtmp_8 = _31 * 16 (step in bytes);
+ .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
+ vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
+ tree loop_len
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
+ tree tmp
+ = fold_build2 (MULT_EXPR, sizetype,
+ fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+ loop_len);
+ tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
+ gassign *assign = gimple_build_assign (bump, tmp);
+ gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+ *dataref_bump = bump;
+ }
+ else
+ {
+ tree bump
+ = size_binop (MULT_EXPR,
+ fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
+ size_int (TYPE_VECTOR_SUBPARTS (vectype)));
+ *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
+ }
/* The offset given in GS_INFO can have pointer type, so use the element
type of the vector instead. */
@@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
return false;
}
- int mask_index = internal_fn_mask_index (ifn);
+ int mask_index = internal_fn_mask_index (ifn, false);
if (mask_index >= 0
&& !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
&mask, NULL, &mask_dt, &mask_vectype))
@@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
else if (memory_access_type == VMAT_GATHER_SCATTER)
{
aggr_type = elem_type;
- vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
- &bump, &vec_offset);
+ vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+ &bump, &vec_offset, loop_lens);
}
else
{
@@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
unsigned HOST_WIDE_INT align;
tree final_mask = NULL_TREE;
+ tree final_len = NULL_TREE;
+ tree bias = NULL_TREE;
if (loop_masks)
final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
vec_num * ncopies,
@@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
vec_offset = vec_offsets[vec_num * j + i];
tree scale = size_int (gs_info.scale);
+
+ if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
+ {
+ if (loop_lens)
+ {
+ final_len
+ = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+ vec_num * ncopies, vectype,
+ vec_num * j + i, 1);
+ }
+ else
+ {
+ tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ final_len
+ = build_int_cst (iv_type,
+ TYPE_VECTOR_SUBPARTS (vectype));
+ }
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ if (!final_mask)
+ {
+ mask_vectype = truth_type_for (vectype);
+ final_mask = build_minus_one_cst (mask_vectype);
+ }
+ }
+
gcall *call;
- if (final_mask)
+ if (final_len && final_mask)
+ call
+ = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
+ 7, dataref_ptr, vec_offset,
+ scale, vec_oprnd, final_len,
+ bias, final_mask);
+ else if (final_mask)
call = gimple_build_call_internal
(IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
scale, vec_oprnd, final_mask);
@@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
machine_mode vmode = TYPE_MODE (vectype);
machine_mode new_vmode = vmode;
internal_fn partial_ifn = IFN_LAST;
- /* Produce 'len' and 'bias' argument. */
- tree final_len = NULL_TREE;
- tree bias = NULL_TREE;
if (loop_lens)
{
opt_machine_mode new_ovmode
@@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
else if (memory_access_type == VMAT_GATHER_SCATTER)
{
aggr_type = elem_type;
- vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
- &bump, &vec_offset);
+ vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
+ &bump, &vec_offset, loop_lens);
}
else
{
@@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
for (i = 0; i < vec_num; i++)
{
tree final_mask = NULL_TREE;
+ tree final_len = NULL_TREE;
+ tree bias = NULL_TREE;
if (loop_masks
&& memory_access_type != VMAT_INVARIANT)
final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
@@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
vec_offset = vec_offsets[vec_num * j + i];
tree zero = build_zero_cst (vectype);
tree scale = size_int (gs_info.scale);
+
+ if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
+ {
+ if (loop_lens)
+ {
+ final_len = vect_get_loop_len (
+ loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+ vectype, vec_num * j + i, 1);
+ }
+ else
+ {
+ tree iv_type
+ = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+ final_len = build_int_cst (
+ iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+ }
+ signed char biasval
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ bias = build_int_cst (intQI_type_node, biasval);
+ if (!final_mask)
+ {
+ mask_vectype = truth_type_for (vectype);
+ final_mask = build_minus_one_cst (mask_vectype);
+ }
+ }
+
gcall *call;
- if (final_mask)
+ if (final_len && final_mask)
+ call = gimple_build_call_internal (
+ IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+ vec_offset, scale, zero, final_len, bias,
+ final_mask);
+ else if (final_mask)
call = gimple_build_call_internal
(IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
vec_offset, scale, zero, final_mask);
@@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
machine_mode vmode = TYPE_MODE (vectype);
machine_mode new_vmode = vmode;
internal_fn partial_ifn = IFN_LAST;
- /* Produce 'len' and 'bias' argument. */
- tree final_len = NULL_TREE;
- tree bias = NULL_TREE;
if (loop_lens)
{
opt_machine_mode new_ovmode
--
2.36.3
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
>
> Address comments from Richard.
>
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
>
> Since:
> /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> on arguments before and after vectorized.
>
> Before vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
> After vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> */
>
> I add "vectorized_p" default argument into internal_fn_mask_index.
> So that we could simplify the codes.
Eh, it's obvious that we should have the "vectorized" form
also in the 'scalar' variant. If you think there's no reasonable
way to add a value for len or bias then instead re-order the
arguments so 'mask' comes first and the len/bias pair last.
But IMHO "any" len/bias value should do here.
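I.e. the scalar form could then look something like (just a sketch of the re-ordering I mean):
LEN_MASK_GATHER_LOAD (ptr, align, offset, mask, len, bias);
with, say, the full vector length and a zero bias filled in for the trailing len/bias pair.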
The rest looks OK now.
Thanks,
Richard.
> The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> Now, this patch applies them into vectorizer.
>
> Here is the example:
>
> void
> f (int *restrict a,
> int *restrict b, int n,
> int base, int step,
> int *restrict cond)
> {
> for (int i = 0; i < n; ++i)
> {
> if (cond[i])
> a[i * 4] = b[i];
> }
> }
>
> Gimple IR:
>
> <bb 3> [local count: 105119324]:
> _58 = (unsigned long) n_13(D);
>
> <bb 4> [local count: 630715945]:
> # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> ivtmp_44 = _61 * 4;
> vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> mask__24.10_49 = vect__4.9_47 != { 0, ... };
> vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> ivtmp_54 = _61 * 16;
> .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
> vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> ivtmp_60 = ivtmp_59 - _61;
> if (ivtmp_60 != 0)
> goto <bb 4>; [83.33%]
> else
> goto <bb 5>; [16.67%]
>
> gcc/ChangeLog:
>
> * internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> * internal-fn.h (internal_fn_mask_index): Ditto.
> * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> (supports_vec_scatter_store_p): Ditto.
> * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> * tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
> (check_load_store_for_partial_vectors): Ditto.
> (vect_get_strided_load_store_ops): Ditto.
> (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
> gcc/internal-fn.cc | 16 ++++-
> gcc/internal-fn.h | 2 +-
> gcc/optabs-query.cc | 2 +
> gcc/tree-vect-data-refs.cc | 18 ++++-
> gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
> 5 files changed, 150 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 303df102d81..2c78c870de8 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
> otherwise return -1. */
>
> int
> -internal_fn_mask_index (internal_fn fn)
> +internal_fn_mask_index (internal_fn fn, bool vectoried_p)
> {
> switch (fn)
> {
> @@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
> case IFN_LEN_MASK_STORE:
> return 4;
>
> + /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> + on arguments before and after vectorized.
> +
> + Before vectorized:
> + LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> +
> + After vectorized:
> + LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> + */
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> - return 6;
> + if (vectoried_p)
> + return 6;
> + else
> + return 4;
>
> default:
> return (conditional_internal_fn_code (fn) != ERROR_MARK
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..e9168c16297 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> extern bool internal_load_fn_p (internal_fn);
> extern bool internal_store_fn_p (internal_fn);
> extern bool internal_gather_scatter_fn_p (internal_fn);
> -extern int internal_fn_mask_index (internal_fn);
> +extern int internal_fn_mask_index (internal_fn, bool = true);
> extern int internal_fn_len_index (internal_fn);
> extern int internal_fn_stored_value_index (internal_fn);
> extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> this_fn_optabs->supports_vec_gather_load[mode]
> = (supports_vec_convert_optab_p (gather_load_optab, mode)
> || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> this_fn_optabs->supports_vec_scatter_store[mode]
> = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..ab2af103cb4 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> return false;
>
> /* Work out which function we need. */
> - internal_fn ifn, alt_ifn;
> + internal_fn ifn, alt_ifn, alt_ifn2;
> if (read_p)
> {
> ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> alt_ifn = IFN_MASK_GATHER_LOAD;
> + /* When target supports LEN_MASK_GATHER_LOAD, we always
> + use LEN_MASK_GATHER_LOAD regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> }
> else
> {
> ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> alt_ifn = IFN_MASK_SCATTER_STORE;
> + /* When target supports LEN_MASK_SCATTER_STORE, we always
> + use LEN_MASK_SCATTER_STORE regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> }
>
> for (;;)
> @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> *offset_vectype_out = offset_vectype;
> return true;
> }
> + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> + memory_type,
> + offset_vectype, scale))
> + {
> + *ifn_out = alt_ifn2;
> + *offset_vectype_out = offset_vectype;
> + return true;
> + }
>
> if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> && TYPE_PRECISION (offset_type) >= element_bits)
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a0c39268bf0..33ec33f8b8d 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
> if (call && gimple_call_internal_p (call))
> {
> internal_fn ifn = gimple_call_internal_fn (call);
> - int mask_index = internal_fn_mask_index (ifn);
> + int mask_index = internal_fn_mask_index (ifn, false);
> if (mask_index >= 0
> && use == gimple_call_arg (call, mask_index))
> return true;
> @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> gs_info->offset_vectype,
> gs_info->scale))
> {
> + ifn = (is_load
> + ? IFN_LEN_MASK_GATHER_LOAD
> + : IFN_LEN_MASK_SCATTER_STORE);
> + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> + gs_info->memory_type,
> + gs_info->offset_vectype,
> + gs_info->scale))
> + {
> + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> + return;
> + }
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors because"
> @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> static void
> vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
> + gimple_stmt_iterator *gsi,
> gather_scatter_info *gs_info,
> - tree *dataref_bump, tree *vec_offset)
> + tree *dataref_bump, tree *vec_offset,
> + vec_loop_lens *loop_lens)
> {
> struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>
> - tree bump = size_binop (MULT_EXPR,
> - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> + {
> + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> + ivtmp_8 = _31 * 16 (step in bytes);
> + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> + tree loop_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> + tree tmp
> + = fold_build2 (MULT_EXPR, sizetype,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + loop_len);
> + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> + gassign *assign = gimple_build_assign (bump, tmp);
> + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> + *dataref_bump = bump;
> + }
> + else
> + {
> + tree bump
> + = size_binop (MULT_EXPR,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + }
>
> /* The offset given in GS_INFO can have pointer type, so use the element
> type of the vector instead. */
> @@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
> return false;
> }
>
> - int mask_index = internal_fn_mask_index (ifn);
> + int mask_index = internal_fn_mask_index (ifn, false);
> if (mask_index >= 0
> && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> &mask, NULL, &mask_dt, &mask_vectype))
> @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> unsigned HOST_WIDE_INT align;
>
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> vec_num * ncopies,
> @@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
> if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> vec_offset = vec_offsets[vec_num * j + i];
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> + {
> + if (loop_lens)
> + {
> + final_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, 1);
> + }
> + else
> + {
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + final_len
> + = build_int_cst (iv_type,
> + TYPE_VECTOR_SUBPARTS (vectype));
> + }
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> + if (final_len && final_mask)
> + call
> + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> + 7, dataref_ptr, vec_offset,
> + scale, vec_oprnd, final_len,
> + bias, final_mask);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> scale, vec_oprnd, final_mask);
> @@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
> @@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
> for (i = 0; i < vec_num; i++)
> {
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks
> && memory_access_type != VMAT_INVARIANT)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> @@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
> vec_offset = vec_offsets[vec_num * j + i];
> tree zero = build_zero_cst (vectype);
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> + {
> + if (loop_lens)
> + {
> + final_len = vect_get_loop_len (
> + loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> + vectype, vec_num * j + i, 1);
> + }
> + else
> + {
> + tree iv_type
> + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + final_len = build_int_cst (
> + iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> + }
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> + if (final_len && final_mask)
> + call = gimple_build_call_internal (
> + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> + vec_offset, scale, zero, final_len, bias,
> + final_mask);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> vec_offset, scale, zero, final_mask);
> @@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
>
Hi, Richi.
>> Eh, it's obvious that we should have the "vectorized" form
>> also in the 'scalar' variant. If you think there's no reasonable
>> way to add a value for len or bias then instead re-order the
>> arguments so 'mask' comes first and the len/bias pair last.
I found this patch is buggy while testing LEN_MASK_GATHER_LOAD,
and reordering the arguments so 'mask' comes first does not help.
Here, we have 'GATHER_LOAD' and 'MASK_GATHER_LOAD',
For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
I change it into:
LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
In this situation, internal_fn_mask_index
should return -1.
Whereas for MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
which I change into LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
it should return index 4.
I can't differentiate them by only using LEN_MASK_GATHER_LOAD.
Could I revise internal_fn_mask_index as follows?
int
internal_fn_mask_index (internal_fn fn, int nargs)
{
  switch (fn)
    {
    case IFN_MASK_LOAD:
    case IFN_MASK_LOAD_LANES:
    case IFN_MASK_STORE:
    case IFN_MASK_STORE_LANES:
      return 2;
    case IFN_MASK_GATHER_LOAD:
    case IFN_MASK_SCATTER_STORE:
    case IFN_LEN_MASK_LOAD:
    case IFN_LEN_MASK_STORE:
      return 4;
    /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
       on arguments before and after vectorized.
       Before vectorized:
         LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
       After vectorized:
         LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);  */
    case IFN_LEN_MASK_GATHER_LOAD:
    case IFN_LEN_MASK_SCATTER_STORE:
      return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
    default:
      return (conditional_internal_fn_code (fn) != ERROR_MARK
              || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
    }
}
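Callers would then pass the actual argument count, something like (where "call" is the gcall being inspected):
int mask_index = internal_fn_mask_index (ifn, gimple_call_num_args (call));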
Thanks.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-07-04 19:05
To: Ju-Zhe Zhong
CC: gcc-patches; richard.sandiford
Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
>
> Hi, Richard and Richi.
>
> Address comments from Richard.
>
> Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
>
> Since:
> /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> on arguments before and after vectorized.
>
> Before vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
> After vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> */
>
> I add "vectorized_p" default argument into internal_fn_mask_index.
> So that we could simplify the codes.
Eh, it's obvious that we should have the "vectorized" form
also in the 'scalar' variant. If you think there's no reasonable
way to add a value for len or bias then instead re-order the
arguments so 'mask' comes first and the len/bias pair last.
But IMHO "any" len/bias value should do here.
The rest looks OK now.
Thanks,
Richard.
> The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> Now, this patch applies them into vectorizer.
>
> Here is the example:
>
> void
> f (int *restrict a,
> int *restrict b, int n,
> int base, int step,
> int *restrict cond)
> {
> for (int i = 0; i < n; ++i)
> {
> if (cond[i])
> a[i * 4] = b[i];
> }
> }
>
> Gimple IR:
>
> <bb 3> [local count: 105119324]:
> _58 = (unsigned long) n_13(D);
>
> <bb 4> [local count: 630715945]:
> # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> ivtmp_44 = _61 * 4;
> vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> mask__24.10_49 = vect__4.9_47 != { 0, ... };
> vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> ivtmp_54 = _61 * 16;
> .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
> vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> ivtmp_60 = ivtmp_59 - _61;
> if (ivtmp_60 != 0)
> goto <bb 4>; [83.33%]
> else
> goto <bb 5>; [16.67%]
>
> gcc/ChangeLog:
>
> * internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> * internal-fn.h (internal_fn_mask_index): Ditto.
> * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> (supports_vec_scatter_store_p): Ditto.
> * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> * tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
> (check_load_store_for_partial_vectors): Ditto.
> (vect_get_strided_load_store_ops): Ditto.
> (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
> gcc/internal-fn.cc | 16 ++++-
> gcc/internal-fn.h | 2 +-
> gcc/optabs-query.cc | 2 +
> gcc/tree-vect-data-refs.cc | 18 ++++-
> gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
> 5 files changed, 150 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 303df102d81..2c78c870de8 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
> otherwise return -1. */
>
> int
> -internal_fn_mask_index (internal_fn fn)
> +internal_fn_mask_index (internal_fn fn, bool vectoried_p)
> {
> switch (fn)
> {
> @@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
> case IFN_LEN_MASK_STORE:
> return 4;
>
> + /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> + on arguments before and after vectorized.
> +
> + Before vectorized:
> + LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> +
> + After vectorized:
> + LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> + */
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> - return 6;
> + if (vectoried_p)
> + return 6;
> + else
> + return 4;
>
> default:
> return (conditional_internal_fn_code (fn) != ERROR_MARK
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..e9168c16297 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> extern bool internal_load_fn_p (internal_fn);
> extern bool internal_store_fn_p (internal_fn);
> extern bool internal_gather_scatter_fn_p (internal_fn);
> -extern int internal_fn_mask_index (internal_fn);
> +extern int internal_fn_mask_index (internal_fn, bool = true);
> extern int internal_fn_len_index (internal_fn);
> extern int internal_fn_stored_value_index (internal_fn);
> extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> this_fn_optabs->supports_vec_gather_load[mode]
> = (supports_vec_convert_optab_p (gather_load_optab, mode)
> || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> this_fn_optabs->supports_vec_scatter_store[mode]
> = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> ? 1 : -1);
>
> return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..ab2af103cb4 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> return false;
>
> /* Work out which function we need. */
> - internal_fn ifn, alt_ifn;
> + internal_fn ifn, alt_ifn, alt_ifn2;
> if (read_p)
> {
> ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> alt_ifn = IFN_MASK_GATHER_LOAD;
> + /* When target supports LEN_MASK_GATHER_LOAD, we always
> + use LEN_MASK_GATHER_LOAD regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> }
> else
> {
> ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> alt_ifn = IFN_MASK_SCATTER_STORE;
> + /* When target supports LEN_MASK_SCATTER_STORE, we always
> + use LEN_MASK_SCATTER_STORE regardless whether len and
> + mask are valid or not. */
> + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> }
>
> for (;;)
> @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> *offset_vectype_out = offset_vectype;
> return true;
> }
> + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> + memory_type,
> + offset_vectype, scale))
> + {
> + *ifn_out = alt_ifn2;
> + *offset_vectype_out = offset_vectype;
> + return true;
> + }
>
> if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> && TYPE_PRECISION (offset_type) >= element_bits)
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a0c39268bf0..33ec33f8b8d 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
> if (call && gimple_call_internal_p (call))
> {
> internal_fn ifn = gimple_call_internal_fn (call);
> - int mask_index = internal_fn_mask_index (ifn);
> + int mask_index = internal_fn_mask_index (ifn, false);
> if (mask_index >= 0
> && use == gimple_call_arg (call, mask_index))
> return true;
> @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> gs_info->offset_vectype,
> gs_info->scale))
> {
> + ifn = (is_load
> + ? IFN_LEN_MASK_GATHER_LOAD
> + : IFN_LEN_MASK_SCATTER_STORE);
> + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> + gs_info->memory_type,
> + gs_info->offset_vectype,
> + gs_info->scale))
> + {
> + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> + return;
> + }
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "can't operate on partial vectors because"
> @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> static void
> vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> loop_vec_info loop_vinfo,
> + gimple_stmt_iterator *gsi,
> gather_scatter_info *gs_info,
> - tree *dataref_bump, tree *vec_offset)
> + tree *dataref_bump, tree *vec_offset,
> + vec_loop_lens *loop_lens)
> {
> struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>
> - tree bump = size_binop (MULT_EXPR,
> - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> + {
> + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> + ivtmp_8 = _31 * 16 (step in bytes);
> + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> + tree loop_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> + tree tmp
> + = fold_build2 (MULT_EXPR, sizetype,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + loop_len);
> + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> + gassign *assign = gimple_build_assign (bump, tmp);
> + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> + *dataref_bump = bump;
> + }
> + else
> + {
> + tree bump
> + = size_binop (MULT_EXPR,
> + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> + }
>
> /* The offset given in GS_INFO can have pointer type, so use the element
> type of the vector instead. */
> @@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
> return false;
> }
>
> - int mask_index = internal_fn_mask_index (ifn);
> + int mask_index = internal_fn_mask_index (ifn, false);
> if (mask_index >= 0
> && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> &mask, NULL, &mask_dt, &mask_vectype))
> @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> unsigned HOST_WIDE_INT align;
>
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> vec_num * ncopies,
> @@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
> if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> vec_offset = vec_offsets[vec_num * j + i];
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> + {
> + if (loop_lens)
> + {
> + final_len
> + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> + vec_num * ncopies, vectype,
> + vec_num * j + i, 1);
> + }
> + else
> + {
> + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + final_len
> + = build_int_cst (iv_type,
> + TYPE_VECTOR_SUBPARTS (vectype));
> + }
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> > + if (final_len && final_mask)
> + call
> + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> + 7, dataref_ptr, vec_offset,
> + scale, vec_oprnd, final_len,
> + bias, final_mask);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> scale, vec_oprnd, final_mask);
> @@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
> @@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
> else if (memory_access_type == VMAT_GATHER_SCATTER)
> {
> aggr_type = elem_type;
> - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> - &bump, &vec_offset);
> + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> + &bump, &vec_offset, loop_lens);
> }
> else
> {
> @@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
> for (i = 0; i < vec_num; i++)
> {
> tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> if (loop_masks
> && memory_access_type != VMAT_INVARIANT)
> final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> @@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
> vec_offset = vec_offsets[vec_num * j + i];
> tree zero = build_zero_cst (vectype);
> tree scale = size_int (gs_info.scale);
> +
> + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> + {
> + if (loop_lens)
> + {
> + final_len = vect_get_loop_len (
> + loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> + vectype, vec_num * j + i, 1);
> + }
> + else
> + {
> + tree iv_type
> + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> + final_len = build_int_cst (
> + iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> + }
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> + {
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> + }
> + }
> +
> gcall *call;
> - if (final_mask)
> + if (final_len && final_mask)
> + call = gimple_build_call_internal (
> + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> + vec_offset, scale, zero, final_len, bias,
> + final_mask);
> + else if (final_mask)
> call = gimple_build_call_internal
> (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> vec_offset, scale, zero, final_mask);
> @@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
> machine_mode vmode = TYPE_MODE (vectype);
> machine_mode new_vmode = vmode;
> internal_fn partial_ifn = IFN_LAST;
> - /* Produce 'len' and 'bias' argument. */
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> if (loop_lens)
> {
> opt_machine_mode new_ovmode
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> >> Eh, it's obvious that we should have the "vectorized" form
> >> also in the 'scalar' variant. If you think there's no reasonable
> >> way to add a value for len or bias then instead re-order the
> >> arguments so 'mask' comes first and the len/bias pair last.
>
> I found this patch is buggy when I am testing LEN_MAS_GATHER_LOAD.
> And reoder 'mask' comes first can not help.
>
> Here, we have 'GATHER_LOAD' and 'MASK_GATHER_LOAD',
> For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> I change it into:
>
> LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> In this situation, internal_fn_mask_index
> should return -1.
>
> Wheras, MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> I change it into LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> it should return the index = 4.
>
> I can't differentiate them by only using LEN_MASK_GATHER_LOAD.
> Could I revise internal_fn_mask_index
> as follows ?
No, please adjust the gather pattern recognition to produce either
appropriate LEN_ variant IFNs or simply keep only the unconditional
and conditional mask variants from patterns but code generate
the len_ variants. I don't really see what the problem is.
Maybe you fail to specify the appropriate ifn when you inspect
the scalar internal fn call?
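E.g. (a rough sketch of the second option) the patterns would keep producing
MASK_GATHER_LOAD (ptr, offset, scale, zero, mask);
and only the vectorizer would emit the len_ variant,
.LEN_MASK_GATHER_LOAD (ptr, offset, scale, zero, len, bias, mask);
so internal_fn_mask_index never has to guess from the argument count.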
> int
> internal_fn_mask_index (internal_fn fn, int nargs)
> {
> switch (fn)
> {
> case IFN_MASK_LOAD:
> case IFN_MASK_LOAD_LANES:
> case IFN_MASK_STORE:
> case IFN_MASK_STORE_LANES:
> return 2;
>
> case IFN_MASK_GATHER_LOAD:
> case IFN_MASK_SCATTER_STORE:
> case IFN_LEN_MASK_LOAD:
> case IFN_LEN_MASK_STORE:
> return 4;
>
> /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> on arguments before and after vectorized.
>
> Before vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
> After vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> */
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
>
> default:
> return (conditional_internal_fn_code (fn) != ERROR_MARK
> || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> }
> }
>
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-07-04 19:05
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
> On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
>
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> >
> > Hi, Richard and Richi.
> >
> > Address comments from Richard.
> >
> > Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
> >
> > Since:
> > /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > on arguments before and after vectorized.
> >
> > Before vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> >
> > After vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > */
> >
> > I add "vectorized_p" default argument into internal_fn_mask_index.
> > So that we could simplify the codes.
>
> Eh, it's obvious that we should have the "vectorized" form
> also in the 'scalar' variant. If you think there's no reasonable
> way to add a value for len or bias then instead re-order the
> arguments so 'mask' comes first and the len/bias pair last.
>
> But IMHO "any" len/bias value should do here.
>
> The rest looks OK now.
>
> Thanks,
> Richard.
>
> > The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> > Now, this patch applies them into vectorizer.
> >
> > Here is the example:
> >
> > void
> > f (int *restrict a,
> > int *restrict b, int n,
> > int base, int step,
> > int *restrict cond)
> > {
> > for (int i = 0; i < n; ++i)
> > {
> > if (cond[i])
> > a[i * 4] = b[i];
> > }
> > }
> >
> > Gimple IR:
> >
> > <bb 3> [local count: 105119324]:
> > _58 = (unsigned long) n_13(D);
> >
> > <bb 4> [local count: 630715945]:
> > # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> > # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> > # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> > # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> > _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> > ivtmp_44 = _61 * 4;
> > vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> > mask__24.10_49 = vect__4.9_47 != { 0, ... };
> > vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> > ivtmp_54 = _61 * 16;
> > .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
> > vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> > vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> > vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> > ivtmp_60 = ivtmp_59 - _61;
> > if (ivtmp_60 != 0)
> > goto <bb 4>; [83.33%]
> > else
> > goto <bb 5>; [16.67%]
> >
> > gcc/ChangeLog:
> >
> > * internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> > * internal-fn.h (internal_fn_mask_index): Ditto.
> > * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> > (supports_vec_scatter_store_p): Ditto.
> > * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> > * tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
> > (check_load_store_for_partial_vectors): Ditto.
> > (vect_get_strided_load_store_ops): Ditto.
> > (vectorizable_store): Ditto.
> > (vectorizable_load): Ditto.
> >
> > ---
> > gcc/internal-fn.cc | 16 ++++-
> > gcc/internal-fn.h | 2 +-
> > gcc/optabs-query.cc | 2 +
> > gcc/tree-vect-data-refs.cc | 18 ++++-
> > gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
> > 5 files changed, 150 insertions(+), 23 deletions(-)
> >
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index 303df102d81..2c78c870de8 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
> > otherwise return -1. */
> >
> > int
> > -internal_fn_mask_index (internal_fn fn)
> > +internal_fn_mask_index (internal_fn fn, bool vectoried_p)
> > {
> > switch (fn)
> > {
> > @@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
> > case IFN_LEN_MASK_STORE:
> > return 4;
> >
> > + /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > + on arguments before and after vectorized.
> > +
> > + Before vectorized:
> > + LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> > +
> > + After vectorized:
> > + LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > + */
> > case IFN_LEN_MASK_GATHER_LOAD:
> > case IFN_LEN_MASK_SCATTER_STORE:
> > - return 6;
> > + if (vectoried_p)
> > + return 6;
> > + else
> > + return 4;
> >
> > default:
> > return (conditional_internal_fn_code (fn) != ERROR_MARK
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index 4234bbfed87..e9168c16297 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> > extern bool internal_load_fn_p (internal_fn);
> > extern bool internal_store_fn_p (internal_fn);
> > extern bool internal_gather_scatter_fn_p (internal_fn);
> > -extern int internal_fn_mask_index (internal_fn);
> > +extern int internal_fn_mask_index (internal_fn, bool = true);
> > extern int internal_fn_len_index (internal_fn);
> > extern int internal_fn_stored_value_index (internal_fn);
> > extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> > index 2fdd0d34354..bf1f484e874 100644
> > --- a/gcc/optabs-query.cc
> > +++ b/gcc/optabs-query.cc
> > @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> > this_fn_optabs->supports_vec_gather_load[mode]
> > = (supports_vec_convert_optab_p (gather_load_optab, mode)
> > || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> > + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> > ? 1 : -1);
> >
> > return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> > @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> > this_fn_optabs->supports_vec_scatter_store[mode]
> > = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> > || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> > + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> > ? 1 : -1);
> >
> > return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > index ebe93832b1e..ab2af103cb4 100644
> > --- a/gcc/tree-vect-data-refs.cc
> > +++ b/gcc/tree-vect-data-refs.cc
> > @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > return false;
> >
> > /* Work out which function we need. */
> > - internal_fn ifn, alt_ifn;
> > + internal_fn ifn, alt_ifn, alt_ifn2;
> > if (read_p)
> > {
> > ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> > alt_ifn = IFN_MASK_GATHER_LOAD;
> > + /* When target supports LEN_MASK_GATHER_LOAD, we always
> > + use LEN_MASK_GATHER_LOAD regardless whether len and
> > + mask are valid or not. */
> > + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> > }
> > else
> > {
> > ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> > alt_ifn = IFN_MASK_SCATTER_STORE;
> > + /* When target supports LEN_MASK_SCATTER_STORE, we always
> > + use LEN_MASK_SCATTER_STORE regardless whether len and
> > + mask are valid or not. */
> > + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> > }
> >
> > for (;;)
> > @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > *offset_vectype_out = offset_vectype;
> > return true;
> > }
> > + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> > + memory_type,
> > + offset_vectype, scale))
> > + {
> > + *ifn_out = alt_ifn2;
> > + *offset_vectype_out = offset_vectype;
> > + return true;
> > + }
> >
> > if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> > && TYPE_PRECISION (offset_type) >= element_bits)
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index a0c39268bf0..33ec33f8b8d 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
> > if (call && gimple_call_internal_p (call))
> > {
> > internal_fn ifn = gimple_call_internal_fn (call);
> > - int mask_index = internal_fn_mask_index (ifn);
> > + int mask_index = internal_fn_mask_index (ifn, false);
> > if (mask_index >= 0
> > && use == gimple_call_arg (call, mask_index))
> > return true;
> > @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> > gs_info->offset_vectype,
> > gs_info->scale))
> > {
> > + ifn = (is_load
> > + ? IFN_LEN_MASK_GATHER_LOAD
> > + : IFN_LEN_MASK_SCATTER_STORE);
> > + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> > + gs_info->memory_type,
> > + gs_info->offset_vectype,
> > + gs_info->scale))
> > + {
> > + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > + return;
> > + }
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > "can't operate on partial vectors because"
> > @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> > static void
> > vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> > loop_vec_info loop_vinfo,
> > + gimple_stmt_iterator *gsi,
> > gather_scatter_info *gs_info,
> > - tree *dataref_bump, tree *vec_offset)
> > + tree *dataref_bump, tree *vec_offset,
> > + vec_loop_lens *loop_lens)
> > {
> > struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> > tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >
> > - tree bump = size_binop (MULT_EXPR,
> > - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> > + {
> > + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> > + ivtmp_8 = _31 * 16 (step in bytes);
> > + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> > + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> > + tree loop_len
> > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> > + tree tmp
> > + = fold_build2 (MULT_EXPR, sizetype,
> > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > + loop_len);
> > + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> > + gassign *assign = gimple_build_assign (bump, tmp);
> > + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> > + *dataref_bump = bump;
> > + }
> > + else
> > + {
> > + tree bump
> > + = size_binop (MULT_EXPR,
> > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > + }
> >
> > /* The offset given in GS_INFO can have pointer type, so use the element
> > type of the vector instead. */
> > @@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
> > return false;
> > }
> >
> > - int mask_index = internal_fn_mask_index (ifn);
> > + int mask_index = internal_fn_mask_index (ifn, false);
> > if (mask_index >= 0
> > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> > &mask, NULL, &mask_dt, &mask_vectype))
> > @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > {
> > aggr_type = elem_type;
> > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > - &bump, &vec_offset);
> > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > + &bump, &vec_offset, loop_lens);
> > }
> > else
> > {
> > @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> > unsigned HOST_WIDE_INT align;
> >
> > tree final_mask = NULL_TREE;
> > + tree final_len = NULL_TREE;
> > + tree bias = NULL_TREE;
> > if (loop_masks)
> > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > vec_num * ncopies,
> > @@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
> > if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> > vec_offset = vec_offsets[vec_num * j + i];
> > tree scale = size_int (gs_info.scale);
> > +
> > + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> > + {
> > + if (loop_lens)
> > + {
> > + final_len
> > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > + vec_num * ncopies, vectype,
> > + vec_num * j + i, 1);
> > + }
> > + else
> > + {
> > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > + final_len
> > + = build_int_cst (iv_type,
> > + TYPE_VECTOR_SUBPARTS (vectype));
> > + }
> > + signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + bias = build_int_cst (intQI_type_node, biasval);
> > + if (!final_mask)
> > + {
> > + mask_vectype = truth_type_for (vectype);
> > + final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > + }
> > +
> > gcall *call;
> > - if (final_mask)
> > +      if (final_len && final_mask)
> > + call
> > + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> > + 7, dataref_ptr, vec_offset,
> > + scale, vec_oprnd, final_len,
> > + bias, final_mask);
> > + else if (final_mask)
> > call = gimple_build_call_internal
> > (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> > scale, vec_oprnd, final_mask);
> > @@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
> > machine_mode vmode = TYPE_MODE (vectype);
> > machine_mode new_vmode = vmode;
> > internal_fn partial_ifn = IFN_LAST;
> > - /* Produce 'len' and 'bias' argument. */
> > - tree final_len = NULL_TREE;
> > - tree bias = NULL_TREE;
> > if (loop_lens)
> > {
> > opt_machine_mode new_ovmode
> > @@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
> > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > {
> > aggr_type = elem_type;
> > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > - &bump, &vec_offset);
> > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > + &bump, &vec_offset, loop_lens);
> > }
> > else
> > {
> > @@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
> > for (i = 0; i < vec_num; i++)
> > {
> > tree final_mask = NULL_TREE;
> > + tree final_len = NULL_TREE;
> > + tree bias = NULL_TREE;
> > if (loop_masks
> > && memory_access_type != VMAT_INVARIANT)
> > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > @@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
> > vec_offset = vec_offsets[vec_num * j + i];
> > tree zero = build_zero_cst (vectype);
> > tree scale = size_int (gs_info.scale);
> > +
> > + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> > + {
> > + if (loop_lens)
> > + {
> > + final_len = vect_get_loop_len (
> > + loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> > + vectype, vec_num * j + i, 1);
> > + }
> > + else
> > + {
> > + tree iv_type
> > + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > + final_len = build_int_cst (
> > + iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> > + }
> > + signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + bias = build_int_cst (intQI_type_node, biasval);
> > + if (!final_mask)
> > + {
> > + mask_vectype = truth_type_for (vectype);
> > + final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > + }
> > +
> > gcall *call;
> > - if (final_mask)
> > + if (final_len && final_mask)
> > + call = gimple_build_call_internal (
> > + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> > + vec_offset, scale, zero, final_len, bias,
> > + final_mask);
> > + else if (final_mask)
> > call = gimple_build_call_internal
> > (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> > vec_offset, scale, zero, final_mask);
> > @@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
> > machine_mode vmode = TYPE_MODE (vectype);
> > machine_mode new_vmode = vmode;
> > internal_fn partial_ifn = IFN_LAST;
> > - /* Produce 'len' and 'bias' argument. */
> > - tree final_len = NULL_TREE;
> > - tree bias = NULL_TREE;
> > if (loop_lens)
> > {
> > opt_machine_mode new_ovmode
> >
>
>
Hi, Richi.
For GATHER_LOAD, which doesn't have len and mask:
Should I keep it as gather_load, so that I support both gather_load and len_mask_gather_load?
Or should I normalize it into len_mask_gather_load with length = vf and mask = {1,1,1,1,1,...}, so that
I only need to support len_mask_gather_load in the RISC-V port?
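For the unconditional case the two options would look roughly like this after
vectorization (the SSA names, scale and VF here are made up for illustration):

  vect__5.8_40 = .GATHER_LOAD (base_10, vect_off.7_38, 4, { 0, ... });

versus the normalized form, with a dummy len = VF, bias = 0 and an all-ones mask:

  vect__5.8_40 = .LEN_MASK_GATHER_LOAD (base_10, vect_off.7_38, 4, { 0, ... },
                                        POLY_INT_CST [4, 4], 0, { -1, ... });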
Thanks.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-07-04 19:17
To: juzhe.zhong@rivai.ai
CC: gcc-patches; richard.sandiford
Subject: Re: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> >> Eh, it's obvious that we should have the "vectorized" form
> >> also in the 'scalar' variant. If you think there's no reasonable
> >> way to add a value for len or bias then instead re-order the
> >> arguments so 'mask' comes first and the len/bias pair last.
>
> I found this patch is buggy when I am testing LEN_MASK_GATHER_LOAD.
> And reordering the arguments so 'mask' comes first cannot help.
>
> Here, we have 'GATHER_LOAD' and 'MASK_GATHER_LOAD',
> For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> I change it into:
>
> LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> In this situation, internal_fn_mask_index
> should return -1.
>
> Whereas, MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> I change it into LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> it should return the index = 4.
>
> I can't differentiate them by only using LEN_MASK_GATHER_LOAD.
> Could I revise internal_fn_mask_index
> as follows ?
No, please adjust the gather pattern recognition to produce either
appropriate LEN_ variant IFNs or simply keep only the unconditional
and conditional mask variants from patterns but code generate
the len_ variants. I don't really see what the problem is.
Maybe you fail to specify the appropriate ifn when you inspect
the scalar internal fn call?
> int
> internal_fn_mask_index (internal_fn fn, int nargs)
> {
> switch (fn)
> {
> case IFN_MASK_LOAD:
> case IFN_MASK_LOAD_LANES:
> case IFN_MASK_STORE:
> case IFN_MASK_STORE_LANES:
> return 2;
>
> case IFN_MASK_GATHER_LOAD:
> case IFN_MASK_SCATTER_STORE:
> case IFN_LEN_MASK_LOAD:
> case IFN_LEN_MASK_STORE:
> return 4;
>
> /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> on arguments before and after vectorized.
>
> Before vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
>
> After vectorized:
> LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> */
> case IFN_LEN_MASK_GATHER_LOAD:
> case IFN_LEN_MASK_SCATTER_STORE:
> return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
>
> default:
> return (conditional_internal_fn_code (fn) != ERROR_MARK
> || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> }
> }
>
>
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-07-04 19:05
> To: Ju-Zhe Zhong
> CC: gcc-patches; richard.sandiford
> Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
> On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
>
> > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> >
> > Hi, Richard and Richi.
> >
> > Address comments from Richard.
> >
> > Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
> >
> > Since:
> > /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > on arguments before and after vectorized.
> >
> > Before vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> >
> > After vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > */
> >
> > I add "vectorized_p" default argument into internal_fn_mask_index.
> > So that we could simplify the codes.
>
> Eh, it's obvious that we should have the "vectorized" form
> also in the 'scalar' variant. If you think there's no reasonable
> way to add a value for len or bias then instead re-order the
> arguments so 'mask' comes first and the len/bias pair last.
>
> But IMHO "any" len/bias value should do here.
>
> The rest looks OK now.
>
> Thanks,
> Richard.
>
> > The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> > Now, this patch applies them into vectorizer.
> >
> > Here is the example:
> >
> > void
> > f (int *restrict a,
> > int *restrict b, int n,
> > int base, int step,
> > int *restrict cond)
> > {
> > for (int i = 0; i < n; ++i)
> > {
> > if (cond[i])
> > a[i * 4] = b[i];
> > }
> > }
> >
> > Gimple IR:
> >
> > <bb 3> [local count: 105119324]:
> > _58 = (unsigned long) n_13(D);
> >
> > <bb 4> [local count: 630715945]:
> > # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> > # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> > # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> > # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> > _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> > ivtmp_44 = _61 * 4;
> > vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> > mask__24.10_49 = vect__4.9_47 != { 0, ... };
> > vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> > ivtmp_54 = _61 * 16;
> > .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
> > vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> > vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> > vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> > ivtmp_60 = ivtmp_59 - _61;
> > if (ivtmp_60 != 0)
> > goto <bb 4>; [83.33%]
> > else
> > goto <bb 5>; [16.67%]
> >
> > gcc/ChangeLog:
> >
> > * internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> > * internal-fn.h (internal_fn_mask_index): Ditto.
> > * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> > (supports_vec_scatter_store_p): Ditto.
> > * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> > * tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
> > (check_load_store_for_partial_vectors): Ditto.
> > (vect_get_strided_load_store_ops): Ditto.
> > (vectorizable_store): Ditto.
> > (vectorizable_load): Ditto.
> >
> > ---
> > gcc/internal-fn.cc | 16 ++++-
> > gcc/internal-fn.h | 2 +-
> > gcc/optabs-query.cc | 2 +
> > gcc/tree-vect-data-refs.cc | 18 ++++-
> > gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
> > 5 files changed, 150 insertions(+), 23 deletions(-)
> >
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index 303df102d81..2c78c870de8 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
> > otherwise return -1. */
> >
> > int
> > -internal_fn_mask_index (internal_fn fn)
> > +internal_fn_mask_index (internal_fn fn, bool vectorized_p)
> > {
> > switch (fn)
> > {
> > @@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
> > case IFN_LEN_MASK_STORE:
> > return 4;
> >
> > + /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > + on arguments before and after vectorized.
> > +
> > + Before vectorized:
> > + LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> > +
> > + After vectorized:
> > + LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > + */
> > case IFN_LEN_MASK_GATHER_LOAD:
> > case IFN_LEN_MASK_SCATTER_STORE:
> > - return 6;
> > +  if (vectorized_p)
> > + return 6;
> > + else
> > + return 4;
> >
> > default:
> > return (conditional_internal_fn_code (fn) != ERROR_MARK
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index 4234bbfed87..e9168c16297 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> > extern bool internal_load_fn_p (internal_fn);
> > extern bool internal_store_fn_p (internal_fn);
> > extern bool internal_gather_scatter_fn_p (internal_fn);
> > -extern int internal_fn_mask_index (internal_fn);
> > +extern int internal_fn_mask_index (internal_fn, bool = true);
> > extern int internal_fn_len_index (internal_fn);
> > extern int internal_fn_stored_value_index (internal_fn);
> > extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> > index 2fdd0d34354..bf1f484e874 100644
> > --- a/gcc/optabs-query.cc
> > +++ b/gcc/optabs-query.cc
> > @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> > this_fn_optabs->supports_vec_gather_load[mode]
> > = (supports_vec_convert_optab_p (gather_load_optab, mode)
> > || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> > + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> > ? 1 : -1);
> >
> > return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> > @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> > this_fn_optabs->supports_vec_scatter_store[mode]
> > = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> > || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> > + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> > ? 1 : -1);
> >
> > return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > index ebe93832b1e..ab2af103cb4 100644
> > --- a/gcc/tree-vect-data-refs.cc
> > +++ b/gcc/tree-vect-data-refs.cc
> > @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > return false;
> >
> > /* Work out which function we need. */
> > - internal_fn ifn, alt_ifn;
> > + internal_fn ifn, alt_ifn, alt_ifn2;
> > if (read_p)
> > {
> > ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> > alt_ifn = IFN_MASK_GATHER_LOAD;
> > + /* When target supports LEN_MASK_GATHER_LOAD, we always
> > + use LEN_MASK_GATHER_LOAD regardless whether len and
> > + mask are valid or not. */
> > + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> > }
> > else
> > {
> > ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> > alt_ifn = IFN_MASK_SCATTER_STORE;
> > + /* When target supports LEN_MASK_SCATTER_STORE, we always
> > + use LEN_MASK_SCATTER_STORE regardless whether len and
> > + mask are valid or not. */
> > + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> > }
> >
> > for (;;)
> > @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > *offset_vectype_out = offset_vectype;
> > return true;
> > }
> > + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> > + memory_type,
> > + offset_vectype, scale))
> > + {
> > + *ifn_out = alt_ifn2;
> > + *offset_vectype_out = offset_vectype;
> > + return true;
> > + }
> >
> > if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> > && TYPE_PRECISION (offset_type) >= element_bits)
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index a0c39268bf0..33ec33f8b8d 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
> > if (call && gimple_call_internal_p (call))
> > {
> > internal_fn ifn = gimple_call_internal_fn (call);
> > - int mask_index = internal_fn_mask_index (ifn);
> > + int mask_index = internal_fn_mask_index (ifn, false);
> > if (mask_index >= 0
> > && use == gimple_call_arg (call, mask_index))
> > return true;
> > @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> > gs_info->offset_vectype,
> > gs_info->scale))
> > {
> > + ifn = (is_load
> > + ? IFN_LEN_MASK_GATHER_LOAD
> > + : IFN_LEN_MASK_SCATTER_STORE);
> > + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> > + gs_info->memory_type,
> > + gs_info->offset_vectype,
> > + gs_info->scale))
> > + {
> > + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > + return;
> > + }
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > "can't operate on partial vectors because"
> > @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> > static void
> > vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> > loop_vec_info loop_vinfo,
> > + gimple_stmt_iterator *gsi,
> > gather_scatter_info *gs_info,
> > - tree *dataref_bump, tree *vec_offset)
> > + tree *dataref_bump, tree *vec_offset,
> > + vec_loop_lens *loop_lens)
> > {
> > struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> > tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> >
> > - tree bump = size_binop (MULT_EXPR,
> > - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> > + {
> > + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> > + ivtmp_8 = _31 * 16 (step in bytes);
> > + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> > + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> > + tree loop_len
> > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> > + tree tmp
> > + = fold_build2 (MULT_EXPR, sizetype,
> > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > + loop_len);
> > + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> > + gassign *assign = gimple_build_assign (bump, tmp);
> > + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> > + *dataref_bump = bump;
> > + }
> > + else
> > + {
> > + tree bump
> > + = size_binop (MULT_EXPR,
> > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > + }
> >
> > /* The offset given in GS_INFO can have pointer type, so use the element
> > type of the vector instead. */
> > @@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
> > return false;
> > }
> >
> > - int mask_index = internal_fn_mask_index (ifn);
> > + int mask_index = internal_fn_mask_index (ifn, false);
> > if (mask_index >= 0
> > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> > &mask, NULL, &mask_dt, &mask_vectype))
> > @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > {
> > aggr_type = elem_type;
> > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > - &bump, &vec_offset);
> > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > + &bump, &vec_offset, loop_lens);
> > }
> > else
> > {
> > @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> > unsigned HOST_WIDE_INT align;
> >
> > tree final_mask = NULL_TREE;
> > + tree final_len = NULL_TREE;
> > + tree bias = NULL_TREE;
> > if (loop_masks)
> > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > vec_num * ncopies,
> > @@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
> > if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> > vec_offset = vec_offsets[vec_num * j + i];
> > tree scale = size_int (gs_info.scale);
> > +
> > + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> > + {
> > + if (loop_lens)
> > + {
> > + final_len
> > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > + vec_num * ncopies, vectype,
> > + vec_num * j + i, 1);
> > + }
> > + else
> > + {
> > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > + final_len
> > + = build_int_cst (iv_type,
> > + TYPE_VECTOR_SUBPARTS (vectype));
> > + }
> > + signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + bias = build_int_cst (intQI_type_node, biasval);
> > + if (!final_mask)
> > + {
> > + mask_vectype = truth_type_for (vectype);
> > + final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > + }
> > +
> > gcall *call;
> > - if (final_mask)
> > +      if (final_len && final_mask)
> > + call
> > + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> > + 7, dataref_ptr, vec_offset,
> > + scale, vec_oprnd, final_len,
> > + bias, final_mask);
> > + else if (final_mask)
> > call = gimple_build_call_internal
> > (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> > scale, vec_oprnd, final_mask);
> > @@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
> > machine_mode vmode = TYPE_MODE (vectype);
> > machine_mode new_vmode = vmode;
> > internal_fn partial_ifn = IFN_LAST;
> > - /* Produce 'len' and 'bias' argument. */
> > - tree final_len = NULL_TREE;
> > - tree bias = NULL_TREE;
> > if (loop_lens)
> > {
> > opt_machine_mode new_ovmode
> > @@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
> > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > {
> > aggr_type = elem_type;
> > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > - &bump, &vec_offset);
> > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > + &bump, &vec_offset, loop_lens);
> > }
> > else
> > {
> > @@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
> > for (i = 0; i < vec_num; i++)
> > {
> > tree final_mask = NULL_TREE;
> > + tree final_len = NULL_TREE;
> > + tree bias = NULL_TREE;
> > if (loop_masks
> > && memory_access_type != VMAT_INVARIANT)
> > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > @@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
> > vec_offset = vec_offsets[vec_num * j + i];
> > tree zero = build_zero_cst (vectype);
> > tree scale = size_int (gs_info.scale);
> > +
> > + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> > + {
> > + if (loop_lens)
> > + {
> > + final_len = vect_get_loop_len (
> > + loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> > + vectype, vec_num * j + i, 1);
> > + }
> > + else
> > + {
> > + tree iv_type
> > + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > + final_len = build_int_cst (
> > + iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> > + }
> > + signed char biasval
> > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > + bias = build_int_cst (intQI_type_node, biasval);
> > + if (!final_mask)
> > + {
> > + mask_vectype = truth_type_for (vectype);
> > + final_mask = build_minus_one_cst (mask_vectype);
> > + }
> > + }
> > +
> > gcall *call;
> > - if (final_mask)
> > + if (final_len && final_mask)
> > + call = gimple_build_call_internal (
> > + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> > + vec_offset, scale, zero, final_len, bias,
> > + final_mask);
> > + else if (final_mask)
> > call = gimple_build_call_internal
> > (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> > vec_offset, scale, zero, final_mask);
> > @@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
> > machine_mode vmode = TYPE_MODE (vectype);
> > machine_mode new_vmode = vmode;
> > internal_fn partial_ifn = IFN_LAST;
> > - /* Produce 'len' and 'bias' argument. */
> > - tree final_len = NULL_TREE;
> > - tree bias = NULL_TREE;
> > if (loop_lens)
> > {
> > opt_machine_mode new_ovmode
> >
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> Hi, Richi.
>
> For GATHER_LOAD, which doesn't have len and mask:
>
> Should I keep it as gather_load, so that I support both gather_load and
> len_mask_gather_load?
>
> Or should I normalize it into len_mask_gather_load with length = vf and
> mask = {1,1,1,1,1,...}, so that I only need to support len_mask_gather_load
> in the RISC-V port?
I think that pattern recog should keep it as GATHER_LOAD or
MASK_GATHER_LOAD depending on what if-conversion did. The later
handling should then use the appropriate vector IFN which could
add a dummy -1 mask or a dummy len and handle partial vectors.
Richard.
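A minimal sketch of that flow, assuming made-up SSA names and a target that
only provides len_mask_gather_load: if-conversion and pattern recognition keep
the scalar form,

  _5 = .MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);

and only code generation in the vectorizer selects the len/mask variant,
appending a dummy len equal to the number of vector elements and bias = 0
while keeping the real mask:

  vect__5.9_40 = .LEN_MASK_GATHER_LOAD (_56, vect_off.8_38, 1, { 0, ... },
                                        POLY_INT_CST [4, 4], 0, mask__33.7_39);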
> Thanks.
>
>
> juzhe.zhong@rivai.ai
>
> From: Richard Biener
> Date: 2023-07-04 19:17
> To: juzhe.zhong@rivai.ai
> CC: gcc-patches; richard.sandiford
> Subject: Re: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
> On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
>
> > Hi, Richi.
> >
> > >> Eh, it's obvious that we should have the "vectorized" form
> > >> also in the 'scalar' variant. If you think there's no reasonable
> > >> way to add a value for len or bias then instead re-order the
> > >> arguments so 'mask' comes first and the len/bias pair last.
> >
> > I found this patch is buggy when I am testing LEN_MASK_GATHER_LOAD.
> > And reordering the arguments so 'mask' comes first cannot help.
> >
> > Here, we have 'GATHER_LOAD' and 'MASK_GATHER_LOAD',
> > For GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> > I change it into:
> >
> > LEN_MASK_GATHER_LOAD ((sizetype) src_18(D), _6, 1, 0);
> > In this situation, internal_fn_mask_index
> > should return -1.
> >
> > Whereas, MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> > I change it into LEN_MASK_GATHER_LOAD ((sizetype) _56, _8, 1, 0, _33);
> > it should return the index = 4.
> >
> > I can't differentiate them by only using LEN_MASK_GATHER_LOAD.
> > Could I revise internal_fn_mask_index
> > as follows ?
>
> No, please adjust the gather pattern recognition to produce either
> appropriate LEN_ variant IFNs or simply keep only the unconditional
> and conditional mask variants from patterns but code generate
> the len_ variants. I don't really see what the problem is.
> Maybe you fail to specify the appropriate ifn when you inspect
> the scalar internal fn call?
>
> > int
> > internal_fn_mask_index (internal_fn fn, int nargs)
> > {
> > switch (fn)
> > {
> > case IFN_MASK_LOAD:
> > case IFN_MASK_LOAD_LANES:
> > case IFN_MASK_STORE:
> > case IFN_MASK_STORE_LANES:
> > return 2;
> >
> > case IFN_MASK_GATHER_LOAD:
> > case IFN_MASK_SCATTER_STORE:
> > case IFN_LEN_MASK_LOAD:
> > case IFN_LEN_MASK_STORE:
> > return 4;
> >
> > /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > on arguments before and after vectorized.
> >
> > Before vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> >
> > After vectorized:
> > LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > */
> > case IFN_LEN_MASK_GATHER_LOAD:
> > case IFN_LEN_MASK_SCATTER_STORE:
> > return nargs == 4 ? -1 : nargs == 5 ? 4 : 6;
> >
> > default:
> > return (conditional_internal_fn_code (fn) != ERROR_MARK
> > || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> > }
> > }
> >
> >
> > Thanks.
> >
> >
> > juzhe.zhong@rivai.ai
> >
> > From: Richard Biener
> > Date: 2023-07-04 19:05
> > To: Ju-Zhe Zhong
> > CC: gcc-patches; richard.sandiford
> > Subject: Re: [PATCH V3] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer
> > On Tue, 4 Jul 2023, juzhe.zhong@rivai.ai wrote:
> >
> > > From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> > >
> > > Hi, Richard and Richi.
> > >
> > > Address comments from Richard.
> > >
> > > Make gs_info.ifn = LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE.
> > >
> > > Since:
> > > /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > > on arguments before and after vectorized.
> > >
> > > Before vectorized:
> > > LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> > >
> > > After vectorized:
> > > LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > > */
> > >
> > > I add "vectorized_p" default argument into internal_fn_mask_index.
> > > So that we could simplify the codes.
> >
> > Eh, it's obvious that we should have the "vectorized" form
> > also in the 'scalar' variant. If you think there's no reasonable
> > way to add a value for len or bias then instead re-order the
> > arguments so 'mask' comes first and the len/bias pair last.
> >
> > But IMHO "any" len/bias value should do here.
> >
> > The rest looks OK now.
> >
> > Thanks,
> > Richard.
> >
> > > The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> > > Now, this patch applies them into vectorizer.
> > >
> > > Here is the example:
> > >
> > > void
> > > f (int *restrict a,
> > > int *restrict b, int n,
> > > int base, int step,
> > > int *restrict cond)
> > > {
> > > for (int i = 0; i < n; ++i)
> > > {
> > > if (cond[i])
> > > a[i * 4] = b[i];
> > > }
> > > }
> > >
> > > Gimple IR:
> > >
> > > <bb 3> [local count: 105119324]:
> > > _58 = (unsigned long) n_13(D);
> > >
> > > <bb 4> [local count: 630715945]:
> > > # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
> > > # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
> > > # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
> > > # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
> > > _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
> > > ivtmp_44 = _61 * 4;
> > > vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
> > > mask__24.10_49 = vect__4.9_47 != { 0, ... };
> > > vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
> > > ivtmp_54 = _61 * 16;
> > > .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, vect__8.13_53, _61, 0, mask__24.10_49);
> > > vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
> > > vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
> > > vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
> > > ivtmp_60 = ivtmp_59 - _61;
> > > if (ivtmp_60 != 0)
> > > goto <bb 4>; [83.33%]
> > > else
> > > goto <bb 5>; [16.67%]
> > >
> > > gcc/ChangeLog:
> > >
> > > * internal-fn.cc (internal_fn_mask_index): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer.
> > > * internal-fn.h (internal_fn_mask_index): Ditto.
> > > * optabs-query.cc (supports_vec_gather_load_p): Ditto.
> > > (supports_vec_scatter_store_p): Ditto.
> > > * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
> > > * tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p): Ditto.
> > > (check_load_store_for_partial_vectors): Ditto.
> > > (vect_get_strided_load_store_ops): Ditto.
> > > (vectorizable_store): Ditto.
> > > (vectorizable_load): Ditto.
> > >
> > > ---
> > > gcc/internal-fn.cc | 16 ++++-
> > > gcc/internal-fn.h | 2 +-
> > > gcc/optabs-query.cc | 2 +
> > > gcc/tree-vect-data-refs.cc | 18 ++++-
> > > gcc/tree-vect-stmts.cc | 135 +++++++++++++++++++++++++++++++------
> > > 5 files changed, 150 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > index 303df102d81..2c78c870de8 100644
> > > --- a/gcc/internal-fn.cc
> > > +++ b/gcc/internal-fn.cc
> > > @@ -4483,7 +4483,7 @@ internal_fn_len_index (internal_fn fn)
> > > otherwise return -1. */
> > >
> > > int
> > > -internal_fn_mask_index (internal_fn fn)
> > > +internal_fn_mask_index (internal_fn fn, bool vectorized_p)
> > > {
> > > switch (fn)
> > > {
> > > @@ -4499,9 +4499,21 @@ internal_fn_mask_index (internal_fn fn)
> > > case IFN_LEN_MASK_STORE:
> > > return 4;
> > >
> > > + /* LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE are different
> > > + on arguments before and after vectorized.
> > > +
> > > + Before vectorized:
> > > + LEN_MASK_GATHER_LOAD (ptr, align, offset, mask);
> > > +
> > > + After vectorized:
> > > + LEN_MASK_GATHER_LOAD (ptr, align, offset, len, bias, mask);
> > > + */
> > > case IFN_LEN_MASK_GATHER_LOAD:
> > > case IFN_LEN_MASK_SCATTER_STORE:
> > > - return 6;
> > > +  if (vectorized_p)
> > > + return 6;
> > > + else
> > > + return 4;
> > >
> > > default:
> > > return (conditional_internal_fn_code (fn) != ERROR_MARK
> > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > index 4234bbfed87..e9168c16297 100644
> > > --- a/gcc/internal-fn.h
> > > +++ b/gcc/internal-fn.h
> > > @@ -233,7 +233,7 @@ extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> > > extern bool internal_load_fn_p (internal_fn);
> > > extern bool internal_store_fn_p (internal_fn);
> > > extern bool internal_gather_scatter_fn_p (internal_fn);
> > > -extern int internal_fn_mask_index (internal_fn);
> > > +extern int internal_fn_mask_index (internal_fn, bool = true);
> > > extern int internal_fn_len_index (internal_fn);
> > > extern int internal_fn_stored_value_index (internal_fn);
> > > extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> > > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> > > index 2fdd0d34354..bf1f484e874 100644
> > > --- a/gcc/optabs-query.cc
> > > +++ b/gcc/optabs-query.cc
> > > @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
> > > this_fn_optabs->supports_vec_gather_load[mode]
> > > = (supports_vec_convert_optab_p (gather_load_optab, mode)
> > > || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> > > + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
> > > ? 1 : -1);
> > >
> > > return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> > > @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
> > > this_fn_optabs->supports_vec_scatter_store[mode]
> > > = (supports_vec_convert_optab_p (scatter_store_optab, mode)
> > > || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> > > + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
> > > ? 1 : -1);
> > >
> > > return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> > > index ebe93832b1e..ab2af103cb4 100644
> > > --- a/gcc/tree-vect-data-refs.cc
> > > +++ b/gcc/tree-vect-data-refs.cc
> > > @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > > return false;
> > >
> > > /* Work out which function we need. */
> > > - internal_fn ifn, alt_ifn;
> > > + internal_fn ifn, alt_ifn, alt_ifn2;
> > > if (read_p)
> > > {
> > > ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
> > > alt_ifn = IFN_MASK_GATHER_LOAD;
> > > + /* When target supports LEN_MASK_GATHER_LOAD, we always
> > > + use LEN_MASK_GATHER_LOAD regardless whether len and
> > > + mask are valid or not. */
> > > + alt_ifn2 = IFN_LEN_MASK_GATHER_LOAD;
> > > }
> > > else
> > > {
> > > ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
> > > alt_ifn = IFN_MASK_SCATTER_STORE;
> > > + /* When target supports LEN_MASK_SCATTER_STORE, we always
> > > + use LEN_MASK_SCATTER_STORE regardless whether len and
> > > + mask are valid or not. */
> > > + alt_ifn2 = IFN_LEN_MASK_SCATTER_STORE;
> > > }
> > >
> > > for (;;)
> > > @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> > > *offset_vectype_out = offset_vectype;
> > > return true;
> > > }
> > > + else if (internal_gather_scatter_fn_supported_p (alt_ifn2, vectype,
> > > + memory_type,
> > > + offset_vectype, scale))
> > > + {
> > > + *ifn_out = alt_ifn2;
> > > + *offset_vectype_out = offset_vectype;
> > > + return true;
> > > + }
> > >
> > > if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
> > > && TYPE_PRECISION (offset_type) >= element_bits)
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index a0c39268bf0..33ec33f8b8d 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -433,7 +433,7 @@ exist_non_indexing_operands_for_use_p (tree use, stmt_vec_info stmt_info)
> > > if (call && gimple_call_internal_p (call))
> > > {
> > > internal_fn ifn = gimple_call_internal_fn (call);
> > > - int mask_index = internal_fn_mask_index (ifn);
> > > + int mask_index = internal_fn_mask_index (ifn, false);
> > > if (mask_index >= 0
> > > && use == gimple_call_arg (call, mask_index))
> > > return true;
> > > @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> > > gs_info->offset_vectype,
> > > gs_info->scale))
> > > {
> > > + ifn = (is_load
> > > + ? IFN_LEN_MASK_GATHER_LOAD
> > > + : IFN_LEN_MASK_SCATTER_STORE);
> > > + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> > > + gs_info->memory_type,
> > > + gs_info->offset_vectype,
> > > + gs_info->scale))
> > > + {
> > > + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> > > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> > > + return;
> > > + }
> > > if (dump_enabled_p ())
> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > "can't operate on partial vectors because"
> > > @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
> > > static void
> > > vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
> > > loop_vec_info loop_vinfo,
> > > + gimple_stmt_iterator *gsi,
> > > gather_scatter_info *gs_info,
> > > - tree *dataref_bump, tree *vec_offset)
> > > + tree *dataref_bump, tree *vec_offset,
> > > + vec_loop_lens *loop_lens)
> > > {
> > > struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> > > tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> > >
> > > - tree bump = size_binop (MULT_EXPR,
> > > - fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > > - size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > > - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > > + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> > > + {
> > > + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> > > + ivtmp_8 = _31 * 16 (step in bytes);
> > > + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> > > + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */
> > > + tree loop_len
> > > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> > > + tree tmp
> > > + = fold_build2 (MULT_EXPR, sizetype,
> > > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > > + loop_len);
> > > + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> > > + gassign *assign = gimple_build_assign (bump, tmp);
> > > + gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> > > + *dataref_bump = bump;
> > > + }
> > > + else
> > > + {
> > > + tree bump
> > > + = size_binop (MULT_EXPR,
> > > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> > > + size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> > > + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> > > + }
> > >
> > > /* The offset given in GS_INFO can have pointer type, so use the element
> > > type of the vector instead. */
> > > @@ -8013,7 +8048,7 @@ vectorizable_store (vec_info *vinfo,
> > > return false;
> > > }
> > >
> > > - int mask_index = internal_fn_mask_index (ifn);
> > > + int mask_index = internal_fn_mask_index (ifn, false);
> > > if (mask_index >= 0
> > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index,
> > > &mask, NULL, &mask_dt, &mask_vectype))
> > > @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
> > > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > > {
> > > aggr_type = elem_type;
> > > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > > - &bump, &vec_offset);
> > > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > > + &bump, &vec_offset, loop_lens);
> > > }
> > > else
> > > {
> > > @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
> > > unsigned HOST_WIDE_INT align;
> > >
> > > tree final_mask = NULL_TREE;
> > > + tree final_len = NULL_TREE;
> > > + tree bias = NULL_TREE;
> > > if (loop_masks)
> > > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > > vec_num * ncopies,
> > > @@ -8929,8 +8966,41 @@ vectorizable_store (vec_info *vinfo,
> > > if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> > > vec_offset = vec_offsets[vec_num * j + i];
> > > tree scale = size_int (gs_info.scale);
> > > +
> > > + if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
> > > + {
> > > + if (loop_lens)
> > > + {
> > > + final_len
> > > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> > > + vec_num * ncopies, vectype,
> > > + vec_num * j + i, 1);
> > > + }
> > > + else
> > > + {
> > > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > + final_len
> > > + = build_int_cst (iv_type,
> > > + TYPE_VECTOR_SUBPARTS (vectype));
> > > + }
> > > + signed char biasval
> > > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > + bias = build_int_cst (intQI_type_node, biasval);
> > > + if (!final_mask)
> > > + {
> > > + mask_vectype = truth_type_for (vectype);
> > > + final_mask = build_minus_one_cst (mask_vectype);
> > > + }
> > > + }
> > > +
> > > gcall *call;
> > > - if (final_mask)
> > > +      if (final_len && final_mask)
> > > + call
> > > + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> > > + 7, dataref_ptr, vec_offset,
> > > + scale, vec_oprnd, final_len,
> > > + bias, final_mask);
> > > + else if (final_mask)
> > > call = gimple_build_call_internal
> > > (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
> > > scale, vec_oprnd, final_mask);
> > > @@ -9047,9 +9117,6 @@ vectorizable_store (vec_info *vinfo,
> > > machine_mode vmode = TYPE_MODE (vectype);
> > > machine_mode new_vmode = vmode;
> > > internal_fn partial_ifn = IFN_LAST;
> > > - /* Produce 'len' and 'bias' argument. */
> > > - tree final_len = NULL_TREE;
> > > - tree bias = NULL_TREE;
> > > if (loop_lens)
> > > {
> > > opt_machine_mode new_ovmode
> > > @@ -10177,8 +10244,8 @@ vectorizable_load (vec_info *vinfo,
> > > else if (memory_access_type == VMAT_GATHER_SCATTER)
> > > {
> > > aggr_type = elem_type;
> > > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> > > - &bump, &vec_offset);
> > > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> > > + &bump, &vec_offset, loop_lens);
> > > }
> > > else
> > > {
> > > @@ -10339,6 +10406,8 @@ vectorizable_load (vec_info *vinfo,
> > > for (i = 0; i < vec_num; i++)
> > > {
> > > tree final_mask = NULL_TREE;
> > > + tree final_len = NULL_TREE;
> > > + tree bias = NULL_TREE;
> > > if (loop_masks
> > > && memory_access_type != VMAT_INVARIANT)
> > > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> > > @@ -10368,8 +10437,39 @@ vectorizable_load (vec_info *vinfo,
> > > vec_offset = vec_offsets[vec_num * j + i];
> > > tree zero = build_zero_cst (vectype);
> > > tree scale = size_int (gs_info.scale);
> > > +
> > > + if (gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD)
> > > + {
> > > + if (loop_lens)
> > > + {
> > > + final_len = vect_get_loop_len (
> > > + loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> > > + vectype, vec_num * j + i, 1);
> > > + }
> > > + else
> > > + {
> > > + tree iv_type
> > > + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> > > + final_len = build_int_cst (
> > > + iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> > > + }
> > > + signed char biasval
> > > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > > + bias = build_int_cst (intQI_type_node, biasval);
> > > + if (!final_mask)
> > > + {
> > > + mask_vectype = truth_type_for (vectype);
> > > + final_mask = build_minus_one_cst (mask_vectype);
> > > + }
> > > + }
> > > +
> > > gcall *call;
> > > - if (final_mask)
> > > + if (final_len && final_mask)
> > > + call = gimple_build_call_internal (
> > > + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> > > + vec_offset, scale, zero, final_len, bias,
> > > + final_mask);
> > > + else if (final_mask)
> > > call = gimple_build_call_internal
> > > (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> > > vec_offset, scale, zero, final_mask);
> > > @@ -10462,9 +10562,6 @@ vectorizable_load (vec_info *vinfo,
> > > machine_mode vmode = TYPE_MODE (vectype);
> > > machine_mode new_vmode = vmode;
> > > internal_fn partial_ifn = IFN_LAST;
> > > - /* Produce 'len' and 'bias' argument. */
> > > - tree final_len = NULL_TREE;
> > > - tree bias = NULL_TREE;
> > > if (loop_lens)
> > > {
> > > opt_machine_mode new_ovmode
> > >
> >
> >
>
>