PR gcc/110148: Avoid adding loop-carried ops to long chains
Commit Message
From: Lili Cui <lili.cui@intel.com>
Hi Maintainer,

This patch fixes the TSVC s242 regression related to loop-carried ops.

Bootstrapped and regtested. Ok for trunk?

Regards,
Lili.
Avoid adding loop-carried ops to long chains, otherwise the whole chain will
have dependencies across loop iterations. Just keep loop-carried ops in a
separate chain.
E.g.
x_1 = phi(x_0, x_2)
y_1 = phi(y_0, y_2)
a + b + c + d + e + x_1 + y_1
SSA1 = a + b;
SSA2 = c + d;
SSA3 = SSA1 + e;
SSA4 = SSA3 + SSA2;
SSA5 = x_1 + y_1;
SSA6 = SSA4 + SSA5;
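For contrast, a hypothetical rewrite that mixed the loop-carried ops into
the main chain might look like this (SSA names illustrative):

  SSA1 = a + x_1;
  SSA2 = b + y_1;
  SSA3 = SSA1 + c;
  SSA4 = SSA2 + d;
  SSA5 = SSA3 + e;
  SSA6 = SSA4 + SSA5;

The path from x_1 to SSA6 then spans four adds (SSA1, SSA3, SSA5, SSA6),
so each iteration waits four add latencies on the previous one, whereas
the separate chain above bounds the cross-iteration path to two adds
(SSA5 and SSA6).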
With the patch applied, these test cases improved by 32%~100%.
S242:
for (int i = 1; i < LEN_1D; ++i) {
  a[i] = a[i - 1] + s1 + s2 + b[i] + c[i] + d[i];
}

Case 1:
for (int i = 1; i < LEN_1D; ++i) {
  a[i] = a[i - 1] + s1 + s2 + b[i] + c[i] + d[i] + e[i];
}

Case 2:
for (int i = 1; i < LEN_1D; ++i) {
  a[i] = a[i - 1] + b[i - 1] + s1 + s2 + b[i] + c[i] + d[i] + e[i];
}
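As a sketch of the intended shape for Case 1 (assuming a reassociation
width of 2; t1..t5 are illustrative temporaries), only the final add
depends on the previous iteration:

  t1 = s1 + s2;
  t2 = b[i] + c[i];
  t3 = t1 + d[i];
  t4 = t2 + e[i];
  t5 = t3 + t4;
  a[i] = t5 + a[i - 1];  /* the only cross-iteration dependency */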
The values are execution time (lower is better).
A: original version
B: with FMA patch g:e5405f065bace0685cb3b8878d1dfc7a6e7ef409 (based on A)
C: with current patch (based on B)
          A      B      C      B/A          C/A
s242      2.859  5.152  2.859  1.802028681  1
case 1    5.489  5.488  3.511  0.999818     0.64
case 2    7.216  7.499  4.885  1.039218     0.68
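The rewrite can be inspected in the reassociation dump, e.g. with a
TSVC-style harness that provides LEN_1D and the arrays (the file name
s242.c is illustrative):

  gcc -O2 -ffast-math -fdump-tree-reassoc2-details s242.c

The "Transforming ... into ..." lines in the dump show each statement of
the rebuilt chain.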
gcc/ChangeLog:
	PR tree-optimization/110148
	* tree-ssa-reassoc.cc (rewrite_expr_tree_parallel): Handle
	loop-carried ops in this function.
---
gcc/tree-ssa-reassoc.cc | 236 ++++++++++++++++++++++++++++------------
1 file changed, 167 insertions(+), 69 deletions(-)
Comments
On Thu, Jun 29, 2023 at 3:49 AM Cui, Lili <lili.cui@intel.com> wrote:
>
> From: Lili Cui <lili.cui@intel.com>
>
> Hi Maintainer
>
> This patch is to fix TSVC242 regression related to loop-carried ops.
>
> Bootstrapped and regtested. Ok for trunk?
OK.
Thanks,
Richard.
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Thursday, June 29, 2023 2:42 PM
> To: Cui, Lili <lili.cui@intel.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] PR gcc/110148:Avoid adding loop-carried ops to long
> chains
>
> On Thu, Jun 29, 2023 at 3:49 AM Cui, Lili <lili.cui@intel.com> wrote:
> >
> > From: Lili Cui <lili.cui@intel.com>
> >
> > Hi Maintainer
> >
> > This patch is to fix TSVC242 regression related to loop-carried ops.
> >
> > Bootstrapped and regtested. Ok for trunk?
>
> OK.
>
Committed, thanks Richard.
Regards,
Lili.
> Thanks,
> Richard.
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 96c88ec003e..f5da385e0b2 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5471,37 +5471,62 @@ get_reassociation_width (int ops_num, enum tree_code opc,
return width;
}
+#define SPECIAL_BIASED_END_STMT 0 /* It is the end stmt of all ops. */
+#define BIASED_END_STMT 1 /* It is the end stmt of normal or biased ops. */
+#define NORMAL_END_STMT 2 /* It is the end stmt of normal ops. */
+
/* Rewrite statements with dependency chain with regard the chance to generate
FMA.
For the chain with FMA: Try to keep fma opportunity as much as possible.
For the chain without FMA: Putting the computation in rank order and trying
to allow operations to be executed in parallel.
E.g.
- e + f + g + a * b + c * d;
+ e + f + a * b + c * d;
- ssa1 = e + f;
- ssa2 = g + a * b;
- ssa3 = ssa1 + c * d;
- ssa4 = ssa2 + ssa3;
+ ssa1 = e + a * b;
+ ssa2 = f + c * d;
+ ssa3 = ssa1 + ssa2;
This reassociation approach preserves the chance of fma generation as much
- as possible. */
+ as possible.
+
+ Another thing is to avoid adding loop-carried ops to long chains, otherwise
+ the whole chain will have dependencies across the loop iteration. Just keep
+ loop-carried ops in a separate chain.
+ E.g.
+ x_1 = phi(x_0, x_2)
+ y_1 = phi(y_0, y_2)
+
+ a + b + c + d + e + x1 + y1
+
+ SSA1 = a + b;
+ SSA2 = c + d;
+ SSA3 = SSA1 + e;
+ SSA4 = SSA3 + SSA2;
+ SSA5 = x1 + y1;
+ SSA6 = SSA4 + SSA5;
+ */
static void
rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
- const vec<operand_entry *> &ops)
+ const vec<operand_entry *> &ops)
{
enum tree_code opcode = gimple_assign_rhs_code (stmt);
int op_num = ops.length ();
+ int op_normal_num = op_num;
gcc_assert (op_num > 0);
int stmt_num = op_num - 1;
gimple **stmts = XALLOCAVEC (gimple *, stmt_num);
- int op_index = op_num - 1;
- int width_count = width;
int i = 0, j = 0;
tree tmp_op[2], op1;
operand_entry *oe;
gimple *stmt1 = NULL;
tree last_rhs1 = gimple_assign_rhs1 (stmt);
+ int last_rhs1_stmt_index = 0, last_rhs2_stmt_index = 0;
+ int width_active = 0, width_count = 0;
+ bool has_biased = false, ops_changed = false;
+ auto_vec<operand_entry *> ops_normal;
+ auto_vec<operand_entry *> ops_biased;
+ vec<operand_entry *> *ops1;
/* We start expression rewriting from the top statements.
So, in this loop we create a full list of statements
@@ -5510,83 +5535,155 @@ rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
for (i = stmt_num - 2; i >= 0; i--)
stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1]));
- /* Width should not be larger than op_num / 2, since we can not create
+ /* Avoid adding loop-carried ops to long chains, first filter out the
+ loop-carried. But we need to make sure that the length of the remainder
+ is not less than 4, which is the smallest ops length we can break the
+ dependency. */
+ FOR_EACH_VEC_ELT (ops, i, oe)
+ {
+ if (TREE_CODE (oe->op) == SSA_NAME
+ && bitmap_bit_p (biased_names, SSA_NAME_VERSION (oe->op))
+ && op_normal_num > 4)
+ {
+ ops_biased.safe_push (oe);
+ has_biased = true;
+ op_normal_num --;
+ }
+ else
+ ops_normal.safe_push (oe);
+ }
+
+ /* Width should not be larger than ops length / 2, since we can not create
more parallel dependency chains that exceeds such value. */
- width = width <= op_num / 2 ? width : op_num / 2;
+ int width_normal = op_normal_num / 2;
+ int width_biased = (op_num - op_normal_num) / 2;
+ width_normal = width <= width_normal ? width : width_normal;
+ width_biased = width <= width_biased ? width : width_biased;
+
+ ops1 = &ops_normal;
+ width_count = width_active = width_normal;
/* Build parallel dependency chain according to width. */
- for (i = 0; i < width; i++)
+ for (i = 0; i < stmt_num; i++)
{
- /* If the chain has FAM, we do not swap two operands. */
- if (op_index > 1 && !has_fma)
- swap_ops_for_binary_stmt (ops, op_index - 2);
-
- for (j = 0; j < 2; j++)
+ if (dump_file && (dump_flags & TDF_DETAILS))
{
- gcc_assert (op_index >= 0);
- oe = ops[op_index--];
- tmp_op[j] = oe->op;
- /* If the stmt that defines operand has to be inserted, insert it
- before the use. */
- stmt1 = oe->stmt_to_insert;
- if (stmt1)
- insert_stmt_before_use (stmts[i], stmt1);
- stmt1 = NULL;
+ fprintf (dump_file, "Transforming ");
+ print_gimple_stmt (dump_file, stmts[i], 0);
}
- stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
- tmp_op[1],
- tmp_op[0],
- opcode);
- gimple_set_visited (stmts[i], true);
- if (dump_file && (dump_flags & TDF_DETAILS))
+ /* When the work of normal ops is over, but the loop is not over,
+ continue to do biased ops. */
+ if (width_count == 0 && ops1 == &ops_normal)
{
- fprintf (dump_file, " into ");
- print_gimple_stmt (dump_file, stmts[i], 0);
+ ops1 = &ops_biased;
+ width_count = width_active = width_biased;
+ ops_changed = true;
}
- }
- for (i = width; i < stmt_num; i++)
- {
- /* We keep original statement only for the last one. All others are
- recreated. */
- if ( op_index < 0)
+ /* Swap the operands if no FMA in the chain. */
+ if (ops1->length () > 2 && !has_fma)
+ swap_ops_for_binary_stmt (*ops1, ops1->length () - 3);
+
+ if (i < width_active
+ || (ops_changed && i <= (last_rhs1_stmt_index + width_active)))
{
- if (width_count == 2)
+ for (j = 0; j < 2; j++)
{
-
- /* We keep original statement only for the last one. All
- others are recreated. */
- gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[i-1]));
- gimple_assign_set_rhs2 (stmts[i], gimple_assign_lhs (stmts[i-2]));
- update_stmt (stmts[i]);
+ oe = ops1->pop ();
+ tmp_op[j] = oe->op;
+ /* If the stmt that defines operand has to be inserted, insert it
+ before the use. */
+ stmt1 = oe->stmt_to_insert;
+ if (stmt1)
+ insert_stmt_before_use (stmts[i], stmt1);
+ stmt1 = NULL;
}
- else
- {
+ stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
+ tmp_op[1],
+ tmp_op[0],
+ opcode);
+ gimple_set_visited (stmts[i], true);
- stmts[i] =
- build_and_add_sum (TREE_TYPE (last_rhs1),
- gimple_assign_lhs (stmts[i-width_count]),
- gimple_assign_lhs (stmts[i-width_count+1]),
- opcode);
- gimple_set_visited (stmts[i], true);
- width_count--;
- }
}
else
{
- /* Attach the rest of the ops to the parallel dependency chain. */
- oe = ops[op_index--];
- op1 = oe->op;
- stmt1 = oe->stmt_to_insert;
- if (stmt1)
- insert_stmt_before_use (stmts[i], stmt1);
- stmt1 = NULL;
- stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
- gimple_assign_lhs (stmts[i-width]),
- op1,
- opcode);
- gimple_set_visited (stmts[i], true);
+ /* We keep original statement only for the last one. All others are
+ recreated. */
+ if (!ops1->length ())
+ {
+ /* For biased length equal to 2. */
+ if (width_count == BIASED_END_STMT && !last_rhs2_stmt_index)
+ last_rhs2_stmt_index = i - 1;
+
+ /* When width_count == 2 and there is no biased, just finish. */
+ if (width_count == NORMAL_END_STMT && !has_biased)
+ {
+ last_rhs1_stmt_index = i - 1;
+ last_rhs2_stmt_index = i - 2;
+ }
+ if (last_rhs1_stmt_index && (last_rhs2_stmt_index || !has_biased))
+ {
+ /* We keep original statement only for the last one. All
+ others are recreated. */
+ gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[last_rhs1_stmt_index]));
+ gimple_assign_set_rhs2 (stmts[i], gimple_assign_lhs (stmts[last_rhs2_stmt_index]));
+ update_stmt (stmts[i]);
+ }
+ else
+ {
+ stmts[i] =
+ build_and_add_sum (TREE_TYPE (last_rhs1),
+ gimple_assign_lhs (stmts[i-width_count]),
+ gimple_assign_lhs (stmts[i-width_count+1]),
+ opcode);
+ gimple_set_visited (stmts[i], true);
+ width_count--;
+
+ /* It is the end of normal or biased ops.
+ last_rhs1_stmt_index used to record the last stmt index
+ for normal ops. last_rhs2_stmt_index used to record the
+ last stmt index for biased ops. */
+ if (width_count == BIASED_END_STMT)
+ {
+ gcc_assert (has_biased);
+ if (ops_biased.length ())
+ last_rhs1_stmt_index = i;
+ else
+ last_rhs2_stmt_index = i;
+ width_count--;
+ }
+ }
+ }
+ else
+ {
+ /* Attach the rest of the ops to the parallel dependency chain. */
+ oe = ops1->pop ();
+ op1 = oe->op;
+ stmt1 = oe->stmt_to_insert;
+ if (stmt1)
+ insert_stmt_before_use (stmts[i], stmt1);
+ stmt1 = NULL;
+
+ /* For only one biased ops. */
+ if (width_count == SPECIAL_BIASED_END_STMT)
+ {
+ /* We keep original statement only for the last one. All
+ others are recreated. */
+ gcc_assert (has_biased);
+ gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[last_rhs1_stmt_index]));
+ gimple_assign_set_rhs2 (stmts[i], op1);
+ update_stmt (stmts[i]);
+ }
+ else
+ {
+ stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
+ gimple_assign_lhs (stmts[i-width_active]),
+ op1,
+ opcode);
+ gimple_set_visited (stmts[i], true);
+ }
+ }
}
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -5595,6 +5692,7 @@ rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
print_gimple_stmt (dump_file, stmts[i], 0);
}
}
+
remove_visited_stmt_chain (last_rhs1);
}