Adjust vectorized cost for reduction.
Checks
Commit Message
x86 doesn't support horizontal reduction instructions, so reduc_op_scal_m
is emulated with vec_extract_half + op (at half the vector length).
Take that into account when calculating the cost for vectorization.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big performance impact on SPEC2017 as measured on ICX.
Ok for trunk?
gcc/ChangeLog:
PR target/112325
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Handle reduction vec_to_scalar.
(ix86_vector_costs::ix86_vect_reduc_cost): New function.
---
gcc/config/i386/i386.cc | 45 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
Comments
On Tue, Dec 12, 2023 at 7:12 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> x86 doesn't support horizontal reduction instructions, reduc_op_scal_m
> is emulated with vec_extract_half + op(half vector length)
> Take that into account when calculating cost for vectorization.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> No big performance impact on SPEC2017 as measured on ICX.
> Ok for trunk?
I don't think keying only on vec_to_scalar is good, since
vect_model_reduction_cost will always use that when
extracting the scalar result element from the final vector
as well, so you'll get double-counting here.
There is currently no good way of identifying the cases
where the vectorizer chose reduc_*_scal; this operation
is identified as vector_stmt.
There is STMT_VINFO_REDUC_FN though, but I'm
not 100% positive the stmt_info you get passed has
this set (it's probably on the info_for_reduction node).
It should be possible to invent a new accessor like
vect_reduc_type () computing REDUC_FN though.
Richard.
> gcc/ChangeLog:
>
> PR target/112325
> * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
> Handle reduction vec_to_scalar.
> (ix86_vector_costs::ix86_vect_reduc_cost): New function.
> ---
> gcc/config/i386/i386.cc | 45 +++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 45 insertions(+)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 4b6bad37c8f..02c9a5004a1 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -24603,6 +24603,7 @@ private:
>
> /* Estimate register pressure of the vectorized code. */
> void ix86_vect_estimate_reg_pressure ();
> + unsigned ix86_vect_reduc_cost (stmt_vec_info, tree);
> /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
> estimation of register pressure.
> ??? Currently it's only used by vec_construct/scalar_to_vec
> @@ -24845,6 +24846,12 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> if (TREE_CODE (op) == SSA_NAME)
> TREE_VISITED (op) = 0;
> }
> + /* This is a reduc_*_scal_m; x86 supports reduc_*_scal_m with emulation. */
> + else if (kind == vec_to_scalar
> + && stmt_info
> + && vect_is_reduction (stmt_info))
> + stmt_cost = ix86_vect_reduc_cost (stmt_info, vectype);
> +
> if (stmt_cost == -1)
> stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
>
> @@ -24875,6 +24882,44 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> return retval;
> }
>
> +/* x86 doesn't support horizontal reduction instructions,
> + reduc_op_scal_m is emulated with vec_extract_hi + op. */
> +unsigned
> +ix86_vector_costs::ix86_vect_reduc_cost (stmt_vec_info stmt_info,
> + tree vectype)
> +{
> + gcc_assert (vectype);
> + unsigned cost = 0;
> + machine_mode mode = TYPE_MODE (vectype);
> + unsigned len = GET_MODE_SIZE (mode);
> +
> + /* PSADBW is used for reduc_plus_scal_{v16qi, v8qi, v4qi}. */
> + if (GET_MODE_INNER (mode) == E_QImode
> + && stmt_info
> + && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN
> + && gimple_assign_rhs_code (stmt_info->stmt) == PLUS_EXPR)
> + {
> + cost = ix86_cost->sse_op;
> + /* vec_extract_hi + vpaddb for 256/512-bit reduc_plus_scal_v*qi. */
> + if (len > 16)
> + cost += exact_log2 (len >> 4) * ix86_cost->sse_op * 2;
> + }
> + else
> + /* vec_extract_hi + op. */
> + cost = ix86_cost->sse_op * exact_log2 (TYPE_VECTOR_SUBPARTS (vectype)) * 2;
> +
> + /* Count extra uops for TARGET_*_SPLIT_REGS. NB: There's no target which
> + supports 512-bit vector but has TARGET_AVX256/128_SPLIT_REGS.
> + ix86_vect_cost is not used since the reduction instruction sequence
> + consists of mixed vector-length instructions after vec_extract_hi. */
> + if ((len == 64 && TARGET_AVX512_SPLIT_REGS)
> + || (len == 32 && TARGET_AVX256_SPLIT_REGS)
> + || (len == 16 && TARGET_AVX256_SPLIT_REGS))
> + cost += ix86_cost->sse_op;
> +
> + return cost;
> +}
> +
> void
> ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
> {
> --
> 2.31.1
>
@@ -24603,6 +24603,7 @@ private:
/* Estimate register pressure of the vectorized code. */
void ix86_vect_estimate_reg_pressure ();
+ unsigned ix86_vect_reduc_cost (stmt_vec_info, tree);
/* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
estimation of register pressure.
??? Currently it's only used by vec_construct/scalar_to_vec
@@ -24845,6 +24846,12 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (TREE_CODE (op) == SSA_NAME)
TREE_VISITED (op) = 0;
}
+ /* This is a reduc_*_scal_m; x86 supports reduc_*_scal_m with emulation. */
+ else if (kind == vec_to_scalar
+ && stmt_info
+ && vect_is_reduction (stmt_info))
+ stmt_cost = ix86_vect_reduc_cost (stmt_info, vectype);
+
if (stmt_cost == -1)
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
@@ -24875,6 +24882,44 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
return retval;
}
+/* x86 doesn't support horizontal reduction instructions,
+ reduc_op_scal_m is emulated with vec_extract_hi + op. */
+unsigned
+ix86_vector_costs::ix86_vect_reduc_cost (stmt_vec_info stmt_info,
+ tree vectype)
+{
+ gcc_assert (vectype);
+ unsigned cost = 0;
+ machine_mode mode = TYPE_MODE (vectype);
+ unsigned len = GET_MODE_SIZE (mode);
+
+ /* PSADBW is used for reduc_plus_scal_{v16qi, v8qi, v4qi}. */
+ if (GET_MODE_INNER (mode) == E_QImode
+ && stmt_info
+ && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN
+ && gimple_assign_rhs_code (stmt_info->stmt) == PLUS_EXPR)
+ {
+ cost = ix86_cost->sse_op;
+ /* vec_extract_hi + vpaddb for 256/512-bit reduc_plus_scal_v*qi. */
+ if (len > 16)
+ cost += exact_log2 (len >> 4) * ix86_cost->sse_op * 2;
+ }
+ else
+ /* vec_extract_hi + op. */
+ cost = ix86_cost->sse_op * exact_log2 (TYPE_VECTOR_SUBPARTS (vectype)) * 2;
+
+ /* Count extra uops for TARGET_*_SPLIT_REGS. NB: There's no target which
+ supports 512-bit vector but has TARGET_AVX256/128_SPLIT_REGS.
+ ix86_vect_cost is not used since the reduction instruction sequence
+ consists of mixed vector-length instructions after vec_extract_hi. */
+ if ((len == 64 && TARGET_AVX512_SPLIT_REGS)
+ || (len == 32 && TARGET_AVX256_SPLIT_REGS)
+ || (len == 16 && TARGET_AVX256_SPLIT_REGS))
+ cost += ix86_cost->sse_op;
+
+ return cost;
+}
+
void
ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
{