PR gcc/110148: Avoid adding loop-carried ops to long chains

Message ID 20230629014949.1995621-1-lili.cui@intel.com
State Unresolved
Series PR gcc/110148: Avoid adding loop-carried ops to long chains

Checks

Context                 Check     Description
snail/gcc-patch-check   warning   Git am fail log

Commit Message

Li, Pan2 via Gcc-patches June 29, 2023, 1:49 a.m. UTC
  From: Lili Cui <lili.cui@intel.com>

Hi Maintainer

This patch fixes the TSVC s242 regression related to loop-carried ops.

Bootstrapped and regtested. Ok for trunk?

Regards
Lili.

Avoid adding loop-carried ops to long chains; otherwise the whole chain will
have dependencies across loop iterations. Instead, keep loop-carried ops in a
separate chain.
   E.g.
   x_1 = phi(x_0, x_2)
   y_1 = phi(y_0, y_2)

   a + b + c + d + e + x1 + y1

   SSA1 = a + b;
   SSA2 = c + d;
   SSA3 = SSA1 + e;
   SSA4 = SSA3 + SSA2;
   SSA5 = x1 + y1;
   SSA6 = SSA4 + SSA5;
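
   For contrast, suppose x1 and y1 were mixed into the main tree.  One
   possible grouping (purely illustrative, not the exact output of the old
   code) would be:

   SSA1 = a + x1;
   SSA2 = b + y1;
   SSA3 = SSA1 + c;
   SSA4 = SSA2 + d;
   SSA5 = SSA3 + e;
   SSA6 = SSA4 + SSA5;

   If the final sum is what becomes x_2 on the back edge, it is now four
   additions away from x1, whereas in the grouping above it is only two
   (SSA5 then SSA6) and SSA1..SSA4 carry no cross-iteration dependence at all.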

With the patch applied, these test cases improved by 32%~100%.

S242:
for (int i = 1; i < LEN_1D; ++i) {
    a[i] = a[i - 1] + s1 + s2 + b[i] + c[i] + d[i];}

Case 1:
for (int i = 1; i < LEN_1D; ++i) {
    a[i] = a[i - 1] + s1 + s2 + b[i] + c[i] + d[i] + e[i];}

Case 2:
for (int i = 1; i < LEN_1D; ++i) {
    a[i] = a[i - 1] + b[i - 1] + s1 + s2 + b[i] + c[i] + d[i] + e[i];}

The values are execution times (lower is better).
A: original version
B: with the FMA patch g:e5405f065bace0685cb3b8878d1dfc7a6e7ef409 (based on A)
C: with the current patch (based on B)

          A       B       C     B/A             C/A
s242    2.859   5.152   2.859   1.802028681     1
case 1  5.489   5.488   3.511   0.999818        0.64
case 2  7.216   7.499   4.885   1.039218        0.68
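
At the source level, the win on s242 can be pictured roughly as below.  This
is only an illustrative sketch (tmp is a hypothetical temporary and real_t
stands for the TSVC element type): the sum of the loop-invariant and
per-iteration terms forms its own tree, and only the last addition uses the
loop-carried a[i - 1].

for (int i = 1; i < LEN_1D; ++i) {
    /* Independent of the previous iteration, so this tree can be
       computed in parallel and overlapped across iterations.  */
    real_t tmp = (s1 + s2) + (b[i] + c[i]) + d[i];
    /* Only this addition sits on the cross-iteration path.  */
    a[i] = a[i - 1] + tmp;
}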

gcc/ChangeLog:
	PR tree-optimization/110148
        * tree-ssa-reassoc.cc (rewrite_expr_tree_parallel): Handle loop-carried
	ops in this function.
---
 gcc/tree-ssa-reassoc.cc | 236 ++++++++++++++++++++++++++++------------
 1 file changed, 167 insertions(+), 69 deletions(-)
  

Comments

Richard Biener June 29, 2023, 6:42 a.m. UTC | #1
On Thu, Jun 29, 2023 at 3:49 AM Cui, Lili <lili.cui@intel.com> wrote:
>
> From: Lili Cui <lili.cui@intel.com>
>
> Hi Maintainer
>
> This patch is to fix TSVC242 regression related to loop-carried ops.
>
> Bootstrapped and regtested. Ok for trunk?

OK.

Thanks,
Richard.

Li, Pan2 via Gcc-patches June 29, 2023, 9:35 a.m. UTC | #2
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Thursday, June 29, 2023 2:42 PM
> To: Cui, Lili <lili.cui@intel.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] PR gcc/110148:Avoid adding loop-carried ops to long
> chains
> 
> On Thu, Jun 29, 2023 at 3:49 AM Cui, Lili <lili.cui@intel.com> wrote:
> >
> > From: Lili Cui <lili.cui@intel.com>
> >
> > Hi Maintainer
> >
> > This patch is to fix TSVC242 regression related to loop-carried ops.
> >
> > Bootstrapped and regtested. Ok for trunk?
> 
> OK.
> 
Committed, thanks Richard.

Regards,
Lili.

Patch

diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 96c88ec003e..f5da385e0b2 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -5471,37 +5471,62 @@  get_reassociation_width (int ops_num, enum tree_code opc,
   return width;
 }
 
+#define SPECIAL_BIASED_END_STMT 0 /* It is the end stmt of all ops.  */
+#define BIASED_END_STMT 1 /* It is the end stmt of normal or biased ops.  */
+#define NORMAL_END_STMT 2 /* It is the end stmt of normal ops.  */
+
 /* Rewrite statements with dependency chain with regard the chance to generate
    FMA.
    For the chain with FMA: Try to keep fma opportunity as much as possible.
    For the chain without FMA: Putting the computation in rank order and trying
    to allow operations to be executed in parallel.
    E.g.
-   e + f + g + a * b + c * d;
+   e + f + a * b + c * d;
 
-   ssa1 = e + f;
-   ssa2 = g + a * b;
-   ssa3 = ssa1 + c * d;
-   ssa4 = ssa2 + ssa3;
+   ssa1 = e + a * b;
+   ssa2 = f + c * d;
+   ssa3 = ssa1 + ssa2;
 
    This reassociation approach preserves the chance of fma generation as much
-   as possible.  */
+   as possible.
+
+   Another thing is to avoid adding loop-carried ops to long chains, otherwise
+   the whole chain will have dependencies across the loop iteration. Just keep
+   loop-carried ops in a separate chain.
+   E.g.
+   x_1 = phi(x_0, x_2)
+   y_1 = phi(y_0, y_2)
+
+   a + b + c + d + e + x1 + y1
+
+   SSA1 = a + b;
+   SSA2 = c + d;
+   SSA3 = SSA1 + e;
+   SSA4 = SSA3 + SSA2;
+   SSA5 = x1 + y1;
+   SSA6 = SSA4 + SSA5;
+ */
 static void
 rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
-					 const vec<operand_entry *> &ops)
+			    const vec<operand_entry *> &ops)
 {
   enum tree_code opcode = gimple_assign_rhs_code (stmt);
   int op_num = ops.length ();
+  int op_normal_num = op_num;
   gcc_assert (op_num > 0);
   int stmt_num = op_num - 1;
   gimple **stmts = XALLOCAVEC (gimple *, stmt_num);
-  int op_index = op_num - 1;
-  int width_count = width;
   int i = 0, j = 0;
   tree tmp_op[2], op1;
   operand_entry *oe;
   gimple *stmt1 = NULL;
   tree last_rhs1 = gimple_assign_rhs1 (stmt);
+  int last_rhs1_stmt_index = 0, last_rhs2_stmt_index = 0; 
+  int width_active = 0, width_count = 0;
+  bool has_biased = false, ops_changed = false;
+  auto_vec<operand_entry *> ops_normal;
+  auto_vec<operand_entry *> ops_biased;
+  vec<operand_entry *> *ops1;
 
   /* We start expression rewriting from the top statements.
      So, in this loop we create a full list of statements
@@ -5510,83 +5535,155 @@  rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
   for (i = stmt_num - 2; i >= 0; i--)
     stmts[i] = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmts[i+1]));
 
-  /* Width should not be larger than op_num / 2, since we can not create
+  /* Avoid adding loop-carried ops to long chains, first filter out the
+     loop-carried.  But we need to make sure that the length of the remainder
+     is not less than 4, which is the smallest ops length we can break the
+     dependency.  */
+  FOR_EACH_VEC_ELT (ops, i, oe)
+    {
+      if (TREE_CODE (oe->op) == SSA_NAME
+	  && bitmap_bit_p (biased_names, SSA_NAME_VERSION (oe->op))
+	  && op_normal_num > 4)
+	{
+	  ops_biased.safe_push (oe);
+	  has_biased = true;
+	  op_normal_num --;
+	}
+      else
+	ops_normal.safe_push (oe);
+    }
+
+  /* Width should not be larger than ops length / 2, since we can not create
      more parallel dependency chains that exceeds such value.  */
-  width = width <= op_num / 2 ? width : op_num / 2;
+  int width_normal = op_normal_num / 2;
+  int width_biased = (op_num - op_normal_num) / 2;
+  width_normal = width <= width_normal ? width : width_normal;
+  width_biased = width <= width_biased ? width : width_biased;
+
+  ops1 = &ops_normal;
+  width_count = width_active = width_normal;
 
   /* Build parallel dependency chain according to width.  */
-  for (i = 0; i < width; i++)
+  for (i = 0; i < stmt_num; i++)
     {
-      /* If the chain has FAM, we do not swap two operands.  */
-      if (op_index > 1 && !has_fma)
-	swap_ops_for_binary_stmt (ops, op_index - 2);
-
-      for (j = 0; j < 2; j++)
+      if (dump_file && (dump_flags & TDF_DETAILS))
 	{
-	  gcc_assert (op_index >= 0);
-	  oe = ops[op_index--];
-	  tmp_op[j] = oe->op;
-	  /* If the stmt that defines operand has to be inserted, insert it
-	     before the use.  */
-	  stmt1 = oe->stmt_to_insert;
-	  if (stmt1)
-	    insert_stmt_before_use (stmts[i], stmt1);
-	  stmt1 = NULL;
+	  fprintf (dump_file, "Transforming ");
+	  print_gimple_stmt (dump_file, stmts[i], 0);
 	}
-      stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
-				    tmp_op[1],
-				    tmp_op[0],
-				    opcode);
-      gimple_set_visited (stmts[i], true);
 
-      if (dump_file && (dump_flags & TDF_DETAILS))
+      /* When the work of normal ops is over, but the loop is not over,
+	 continue to do biased ops.  */
+      if (width_count == 0 && ops1 == &ops_normal)
 	{
-	  fprintf (dump_file, " into ");
-	  print_gimple_stmt (dump_file, stmts[i], 0);
+	  ops1 = &ops_biased;
+	  width_count = width_active = width_biased;
+	  ops_changed = true;
 	}
-    }
 
-  for (i = width; i < stmt_num; i++)
-    {
-      /* We keep original statement only for the last one.  All others are
-	 recreated.  */
-      if ( op_index < 0)
+      /* Swap the operands if no FMA in the chain.  */
+      if (ops1->length () > 2 && !has_fma)
+	swap_ops_for_binary_stmt (*ops1, ops1->length () - 3);
+
+      if (i < width_active
+	  || (ops_changed && i <= (last_rhs1_stmt_index + width_active)))
 	{
-	  if (width_count == 2)
+	  for (j = 0; j < 2; j++)
 	    {
-
-	      /* We keep original statement only for the last one.  All
-		 others are recreated.  */
-	      gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[i-1]));
-	      gimple_assign_set_rhs2 (stmts[i], gimple_assign_lhs (stmts[i-2]));
-	      update_stmt (stmts[i]);
+	      oe = ops1->pop ();
+	      tmp_op[j] = oe->op;
+	      /* If the stmt that defines operand has to be inserted, insert it
+		 before the use.  */
+	      stmt1 = oe->stmt_to_insert;
+	      if (stmt1)
+		insert_stmt_before_use (stmts[i], stmt1);
+	      stmt1 = NULL;
 	    }
-	  else
-	    {
+	  stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
+					tmp_op[1],
+					tmp_op[0],
+					opcode);
+	  gimple_set_visited (stmts[i], true);
 
-	      stmts[i] =
-		build_and_add_sum (TREE_TYPE (last_rhs1),
-				   gimple_assign_lhs (stmts[i-width_count]),
-				   gimple_assign_lhs (stmts[i-width_count+1]),
-				   opcode);
-	      gimple_set_visited (stmts[i], true);
-	      width_count--;
-	    }
 	}
       else
 	{
-	  /* Attach the rest of the ops to the parallel dependency chain.  */
-	  oe = ops[op_index--];
-	  op1 = oe->op;
-	  stmt1 = oe->stmt_to_insert;
-	  if (stmt1)
-	    insert_stmt_before_use (stmts[i], stmt1);
-	  stmt1 = NULL;
-	  stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
-					gimple_assign_lhs (stmts[i-width]),
-					op1,
-					opcode);
-	  gimple_set_visited (stmts[i], true);
+	  /* We keep original statement only for the last one.  All others are
+	     recreated.  */
+	  if (!ops1->length ())
+	    {
+	      /* For biased length equal to 2.  */
+	      if (width_count == BIASED_END_STMT && !last_rhs2_stmt_index)
+		last_rhs2_stmt_index = i - 1;
+
+	      /* When width_count == 2 and there is no biased, just finish.  */
+	      if (width_count == NORMAL_END_STMT && !has_biased)
+		{
+		  last_rhs1_stmt_index = i - 1;
+		  last_rhs2_stmt_index = i - 2;
+		}
+	      if (last_rhs1_stmt_index && (last_rhs2_stmt_index || !has_biased))
+		{
+		  /* We keep original statement only for the last one.  All
+		     others are recreated.  */
+		  gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[last_rhs1_stmt_index]));
+		  gimple_assign_set_rhs2 (stmts[i], gimple_assign_lhs (stmts[last_rhs2_stmt_index]));
+		  update_stmt (stmts[i]);
+		}
+	      else
+		{
+		  stmts[i] =
+		    build_and_add_sum (TREE_TYPE (last_rhs1),
+				       gimple_assign_lhs (stmts[i-width_count]),
+				       gimple_assign_lhs (stmts[i-width_count+1]),
+				       opcode);
+		  gimple_set_visited (stmts[i], true);
+		  width_count--;
+
+		  /* It is the end of normal or biased ops.
+		     last_rhs1_stmt_index used to record the last stmt index
+		     for normal ops.  last_rhs2_stmt_index used to record the
+		     last stmt index for biased ops.  */
+		  if (width_count == BIASED_END_STMT)
+		    {
+		      gcc_assert (has_biased);
+		      if (ops_biased.length ())
+			last_rhs1_stmt_index = i;
+		      else
+			last_rhs2_stmt_index = i;
+		      width_count--;
+		    }
+		}
+	    }
+	  else
+	    {
+	      /* Attach the rest of the ops to the parallel dependency chain.  */
+	      oe = ops1->pop ();
+	      op1 = oe->op;
+	      stmt1 = oe->stmt_to_insert;
+	      if (stmt1)
+		insert_stmt_before_use (stmts[i], stmt1);
+	      stmt1 = NULL;
+
+	      /* For only one biased ops.  */
+	      if (width_count == SPECIAL_BIASED_END_STMT)
+		{
+		  /* We keep original statement only for the last one.  All
+		     others are recreated.  */
+		  gcc_assert (has_biased);
+		  gimple_assign_set_rhs1 (stmts[i], gimple_assign_lhs (stmts[last_rhs1_stmt_index]));
+		  gimple_assign_set_rhs2 (stmts[i], op1);
+		  update_stmt (stmts[i]);
+		}
+	      else
+		{
+		  stmts[i] = build_and_add_sum (TREE_TYPE (last_rhs1),
+						gimple_assign_lhs (stmts[i-width_active]),
+						op1,
+						opcode);
+		  gimple_set_visited (stmts[i], true);
+		}
+	    }
 	}
 
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -5595,6 +5692,7 @@  rewrite_expr_tree_parallel (gassign *stmt, int width, bool has_fma,
 	  print_gimple_stmt (dump_file, stmts[i], 0);
 	}
     }
+
   remove_visited_stmt_chain (last_rhs1);
 }