[2/2] Add a tune option to control the length of the chain with FMA

Message ID 20230511101201.2052667-2-lili.cui@intel.com
State Unresolved
Headers
Series [1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Li, Pan2 via Gcc-patches May 11, 2023, 10:12 a.m. UTC
  From: Lili Cui <lili.cui@intel.com>

Set the length of the chain with FMA to 5 for icelake_cost.

With this patch applied,
SPR multi-copy: 508.namd_r increased by 3%
ICX multi-copy: 508.namd_r increased by 3.5%,
                507.cactuBSSN_r increased by 3.7%

Using FMA instead of mult + add reduces register pressure and the number of
instructions retired.

gcc/ChangeLog:

        * config/i386/i386-options.cc (ix86_option_override_internal):
        Set param_reassoc_max_chain_length_with_fma.
        * config/i386/i386.h (struct processor_costs): Add new tune parameters.
        * config/i386/x86-tune-costs.h (struct processor_costs): Set
	reassoc_max_chain_length_with_fma to 5 for icelake.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/fma-chain.c: New test.
---
 gcc/config/i386/i386-options.cc           |  2 ++
 gcc/config/i386/i386.h                    |  3 ++
 gcc/config/i386/x86-tune-costs.h          | 35 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++++++
 4 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c
  

Comments

Richard Biener May 11, 2023, 10:56 a.m. UTC | #1
On Thu, May 11, 2023 at 12:13 PM Cui, Lili via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: Lili Cui <lili.cui@intel.com>
>
> Set the length of the chain with FMA to 5 for icelake_cost.
>
> With this patch applied,
> SPR multi-copy: 508.namd_r increased by 3%
> ICX multi-copy: 508.namd_r increased by 3.5%,
>                 507.cactuBSSN_r increased by 3.7%
>
> Using FMA instead of mult + add reduces register pressure and the number of
> instructions retired.

I would say it would make more sense to use the existing reassoc_width
hook and based on the opcode specify the number of adds vs. mults
(where I guess all subarchs have #mults equal to the #fmas) that can
be carried out in parallel?

That means for the reassoc patch shouldn't we simply query
the PLUS and MULT reassoc width and compute something from that
instead of adding another --param?

Richard.

> gcc/ChangeLog:
>
>         * config/i386/i386-options.cc (ix86_option_override_internal):
>         Set param_reassoc_max_chain_length_with_fma.
>         * config/i386/i386.h (struct processor_costs): Add new tune parameters.
>         * config/i386/x86-tune-costs.h (struct processor_costs): Set
>         reassoc_max_chain_length_with_fma to 5 for icelake.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/fma-chain.c: New test.
> ---
>  gcc/config/i386/i386-options.cc           |  2 ++
>  gcc/config/i386/i386.h                    |  3 ++
>  gcc/config/i386/x86-tune-costs.h          | 35 +++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++++++
>  4 files changed, 51 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 2cb0bddcd35..67d35d89d91 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
>                        ix86_tune_cost->l1_cache_size);
>    SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
>                        ix86_tune_cost->l2_cache_size);
> +  SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
> +                      ix86_tune_cost->reassoc_max_chain_length_with_fma);
>
>    /* 64B is the accepted value for these for all x86.  */
>    SET_OPTION_IF_UNSET (&global_options, &global_options_set,
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index c7439f89bdf..c7fa7312a67 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -206,6 +206,9 @@ struct processor_costs {
>                                    to number of instructions executed in
>                                    parallel.  See also
>                                    ix86_reassociation_width.  */
> +  const int reassoc_max_chain_length_with_fma;
> +                               /* Specify max reassociation chain length with
> +                                  FMA.  */
>    struct stringop_algs *memcpy, *memset;
>    const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
>                                           cost model.  */
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index 4f7a67ca5c5..1f57a5ee2a7 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
>    COSTS_N_BYTES (2),                   /* cost of SQRTSS instruction.  */
>    COSTS_N_BYTES (2),                   /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    ix86_size_memcpy,
>    ix86_size_memset,
>    COSTS_N_BYTES (1),                   /* cond_taken_branch_cost.  */
> @@ -238,6 +239,7 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
>    COSTS_N_INSNS (122),                 /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (122),                 /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    i386_memcpy,
>    i386_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -350,6 +352,7 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
>    COSTS_N_INSNS (83),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (83),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    i486_memcpy,
>    i486_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
>    COSTS_N_INSNS (70),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (70),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    pentium_memcpy,
>    pentium_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -563,6 +567,7 @@ struct processor_costs lakemont_cost = {
>    COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    pentium_memcpy,
>    pentium_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -681,6 +686,7 @@ struct processor_costs pentiumpro_cost = {
>    COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (31),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    pentiumpro_memcpy,
>    pentiumpro_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -790,6 +796,7 @@ struct processor_costs geode_cost = {
>    COSTS_N_INSNS (54),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (54),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    geode_memcpy,
>    geode_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -902,6 +909,7 @@ struct processor_costs k6_cost = {
>    COSTS_N_INSNS (56),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (56),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    k6_memcpy,
>    k6_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -1015,6 +1023,7 @@ struct processor_costs athlon_cost = {
>    COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (19),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    athlon_memcpy,
>    athlon_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -1137,6 +1146,7 @@ struct processor_costs k8_cost = {
>    COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    k8_memcpy,
>    k8_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -1267,6 +1277,7 @@ struct processor_costs amdfam10_cost = {
>    COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    amdfam10_memcpy,
>    amdfam10_memset,
>    COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
> @@ -1390,6 +1401,7 @@ const struct processor_costs bdver_cost = {
>    COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
>    1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    bdver_memcpy,
>    bdver_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -1545,6 +1557,7 @@ struct processor_costs znver1_cost = {
>       plus/minus operations per cycle but only one multiply.  This is adjusted
>       in ix86_reassociation_width.  */
>    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    znver1_memcpy,
>    znver1_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -1704,6 +1717,7 @@ struct processor_costs znver2_cost = {
>       plus/minus operations per cycle but only one multiply.  This is adjusted
>       in ix86_reassociation_width.  */
>    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    znver2_memcpy,
>    znver2_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -1838,6 +1852,7 @@ struct processor_costs znver3_cost = {
>       plus/minus operations per cycle but only one multiply.  This is adjusted
>       in ix86_reassociation_width.  */
>    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    znver2_memcpy,
>    znver2_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -1974,6 +1989,7 @@ struct processor_costs znver4_cost = {
>       plus/minus operations per cycle but only one multiply.  This is adjusted
>       in ix86_reassociation_width.  */
>    4, 4, 3, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    znver2_memcpy,
>    znver2_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -2100,6 +2116,7 @@ struct processor_costs skylake_cost = {
>    COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
>    1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    skylake_memcpy,
>    skylake_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -2228,6 +2245,12 @@ struct processor_costs icelake_cost = {
>    COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
>    1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  /* Icelake-server prefers keeping FMA chains over breaking dependencies
> +     into mult + add, which reduces the number of instructions retired.
> +     A value of 1 means the FMA chain is not kept.  With a value greater
> +     than 1 an FMA chain is generated; a chain longer than this value is
> +     split according to the reassociation width.  */
> +  5,                                   /* Reassoc max FMA chain length.  */
>    icelake_memcpy,
>    icelake_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -2350,6 +2373,7 @@ struct processor_costs alderlake_cost = {
>    COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
>    1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    alderlake_memcpy,
>    alderlake_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -2465,6 +2489,7 @@ const struct processor_costs btver1_cost = {
>    COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (48),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    btver1_memcpy,
>    btver1_memset,
>    COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
> @@ -2577,6 +2602,7 @@ const struct processor_costs btver2_cost = {
>    COSTS_N_INSNS (16),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (21),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    btver2_memcpy,
>    btver2_memset,
>    COSTS_N_INSNS (2),                   /* cond_taken_branch_cost.  */
> @@ -2688,6 +2714,7 @@ struct processor_costs pentium4_cost = {
>    COSTS_N_INSNS (23),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (38),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    pentium4_memcpy,
>    pentium4_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -2802,6 +2829,7 @@ struct processor_costs nocona_cost = {
>    COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (41),                  /* cost of SQRTSD instruction.  */
>    1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    nocona_memcpy,
>    nocona_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -2914,6 +2942,7 @@ struct processor_costs atom_cost = {
>    COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
>    2, 2, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    atom_memcpy,
>    atom_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -3026,6 +3055,7 @@ struct processor_costs slm_cost = {
>    COSTS_N_INSNS (20),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (35),                  /* cost of SQRTSD instruction.  */
>    1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    slm_memcpy,
>    slm_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -3152,6 +3182,7 @@ struct processor_costs tremont_cost = {
>    COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
>    1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    tremont_memcpy,
>    tremont_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -3264,6 +3295,7 @@ struct processor_costs intel_cost = {
>    COSTS_N_INSNS (40),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (40),                  /* cost of SQRTSD instruction.  */
>    1, 4, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    intel_memcpy,
>    intel_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> @@ -3381,6 +3413,7 @@ struct processor_costs lujiazui_cost = {
>    COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (60),                  /* cost of SQRTSD instruction.  */
>    1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    lujiazui_memcpy,
>    lujiazui_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -3502,6 +3535,7 @@ struct processor_costs generic_cost = {
>    COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
>    1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    generic_memcpy,
>    generic_memset,
>    COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> @@ -3630,6 +3664,7 @@ struct processor_costs core_cost = {
>    COSTS_N_INSNS (30),                  /* cost of SQRTSS instruction.  */
>    COSTS_N_INSNS (58),                  /* cost of SQRTSD instruction.  */
>    1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  1,                                   /* Reassoc max FMA chain length.  */
>    core_memcpy,
>    core_memset,
>    COSTS_N_INSNS (3),                   /* cond_taken_branch_cost.  */
> diff --git a/gcc/testsuite/gcc.target/i386/fma-chain.c b/gcc/testsuite/gcc.target/i386/fma-chain.c
> new file mode 100644
> index 00000000000..9de61f1b6ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/fma-chain.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=icelake-server -Wno-attributes " } */
> +
> +/* Test that the compiler properly optimizes multiply and add
> +   to generate more FMA instructions.  */
> +float
> +foo (float a, float b, float c, float d, float e, float f, float g, float h, float j)
> +{
> +   return a * b + c * d + e * f + g * h + j;
> +}
> +/* { dg-final { scan-assembler-times "vfm" 4 } } */
> --
> 2.25.1
>
  

Patch

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..67d35d89d91 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2684,6 +2684,8 @@  ix86_option_override_internal (bool main_args_p,
 		       ix86_tune_cost->l1_cache_size);
   SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
 		       ix86_tune_cost->l2_cache_size);
+  SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
+		       ix86_tune_cost->reassoc_max_chain_length_with_fma);
 
   /* 64B is the accepted value for these for all x86.  */
   SET_OPTION_IF_UNSET (&global_options, &global_options_set,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..c7fa7312a67 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -206,6 +206,9 @@  struct processor_costs {
 				   to number of instructions executed in
 				   parallel.  See also
 				   ix86_reassociation_width.  */
+  const int reassoc_max_chain_length_with_fma;
+				/* Specify max reassociation chain length with
+				   FMA.  */
   struct stringop_algs *memcpy, *memset;
   const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
 					  cost model.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4f7a67ca5c5..1f57a5ee2a7 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -127,6 +127,7 @@  struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   COSTS_N_BYTES (2),			/* cost of SQRTSS instruction.  */
   COSTS_N_BYTES (2),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   ix86_size_memcpy,
   ix86_size_memset,
   COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
@@ -238,6 +239,7 @@  struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (122),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   i386_memcpy,
   i386_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -350,6 +352,7 @@  struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (83),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   i486_memcpy,
   i486_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -460,6 +463,7 @@  struct processor_costs pentium_cost = {
   COSTS_N_INSNS (70),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -563,6 +567,7 @@  struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -681,6 +686,7 @@  struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (31),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -790,6 +796,7 @@  struct processor_costs geode_cost = {
   COSTS_N_INSNS (54),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (54),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   geode_memcpy,
   geode_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -902,6 +909,7 @@  struct processor_costs k6_cost = {
   COSTS_N_INSNS (56),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (56),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   k6_memcpy,
   k6_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1015,6 +1023,7 @@  struct processor_costs athlon_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (19),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   athlon_memcpy,
   athlon_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1137,6 +1146,7 @@  struct processor_costs k8_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   k8_memcpy,
   k8_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -1267,6 +1277,7 @@  struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (19),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   amdfam10_memcpy,
   amdfam10_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -1390,6 +1401,7 @@  const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (15),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   bdver_memcpy,
   bdver_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1545,6 +1557,7 @@  struct processor_costs znver1_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver1_memcpy,
   znver1_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1704,6 +1717,7 @@  struct processor_costs znver2_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1838,6 +1852,7 @@  struct processor_costs znver3_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -1974,6 +1989,7 @@  struct processor_costs znver4_cost = {
      plus/minus operations per cycle but only one multiply.  This is adjusted
      in ix86_reassociation_width.  */
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -2100,6 +2116,7 @@  struct processor_costs skylake_cost = {
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   skylake_memcpy,
   skylake_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2228,6 +2245,12 @@  struct processor_costs icelake_cost = {
   COSTS_N_INSNS (12),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  /* Icelake-server prefers keeping FMA chains over breaking dependencies
+     into mult + add, which reduces the number of instructions retired.
+     A value of 1 means the FMA chain is not kept.  With a value greater
+     than 1 an FMA chain is generated; a chain longer than this value is
+     split according to the reassociation width.  */
+  5,					/* Reassoc max FMA chain length.  */
   icelake_memcpy,
   icelake_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2350,6 +2373,7 @@  struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   alderlake_memcpy,
   alderlake_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -2465,6 +2489,7 @@  const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (48),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   btver1_memcpy,
   btver1_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -2577,6 +2602,7 @@  const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (16),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   btver2_memcpy,
   btver2_memset,
   COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
@@ -2688,6 +2714,7 @@  struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (23),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (38),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   pentium4_memcpy,
   pentium4_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2802,6 +2829,7 @@  struct processor_costs nocona_cost = {
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (41),			/* cost of SQRTSD instruction.  */
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   nocona_memcpy,
   nocona_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -2914,6 +2942,7 @@  struct processor_costs atom_cost = {
   COSTS_N_INSNS (31),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),			/* cost of SQRTSD instruction.  */
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   atom_memcpy,
   atom_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3026,6 +3055,7 @@  struct processor_costs slm_cost = {
   COSTS_N_INSNS (20),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),			/* cost of SQRTSD instruction.  */
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   slm_memcpy,
   slm_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3152,6 +3182,7 @@  struct processor_costs tremont_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   tremont_memcpy,
   tremont_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3264,6 +3295,7 @@  struct processor_costs intel_cost = {
   COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   intel_memcpy,
   intel_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
@@ -3381,6 +3413,7 @@  struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (32),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (60),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   lujiazui_memcpy,
   lujiazui_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3502,6 +3535,7 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),			/* cost of SQRTSD instruction.  */
   1, 4, 3, 3,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   generic_memcpy,
   generic_memset,
   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
@@ -3630,6 +3664,7 @@  struct processor_costs core_cost = {
   COSTS_N_INSNS (30),			/* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (58),			/* cost of SQRTSD instruction.  */
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
+  1,					/* Reassoc max FMA chain length.  */
   core_memcpy,
   core_memset,
   COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
diff --git a/gcc/testsuite/gcc.target/i386/fma-chain.c b/gcc/testsuite/gcc.target/i386/fma-chain.c
new file mode 100644
index 00000000000..9de61f1b6ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/fma-chain.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=icelake-server -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add
+   to generate more FMA instructions.  */
+float
+foo (float a, float b, float c, float d, float e, float f, float g, float h, float j)
+{
+   return a * b + c * d + e * f + g * h + j;
+}
+/* { dg-final { scan-assembler-times "vfm" 4 } } */