[v2] aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU

Message ID 20221114135324.19352-1-philipp.tomsich@vrull.eu
State Unresolved
Headers
Series [v2] aarch64: Add support for Ampere-1A (-mcpu=ampere1a) CPU |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Philipp Tomsich Nov. 14, 2022, 1:53 p.m. UTC
This patch adds support for the Ampere-1A CPU:
 - recognize the name of the core and provide detection for -mcpu=native,
 - provide updated extra_costs,
 - add a new fusion pair for A+B+1 and A-B-1.

Ampere-1A and Ampere-1 have more timing differences than the extra
costs indicate, but these don't propagate through to the headline
items in our extra costs (e.g. the change in latency for scalar sqrt
doesn't have a corresponding table entry).

gcc/ChangeLog:

	* config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
	* config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
	* config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
	Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
	registers and then +1/-1).
	* config/aarch64/aarch64-tune.md: Regenerate.
	* config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
	idiom-matcher for the new fusion pair.
	* doc/invoke.texi: Add ampere1a.

Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
---

Changes in v2:
- break line in fusion matcher to stay below 80 characters
- rename fusion pair to addsub_2reg_const1
- document 'ampere1a' in invoke.texi

 gcc/config/aarch64/aarch64-cores.def        |   1 +
 gcc/config/aarch64/aarch64-cost-tables.h    | 107 ++++++++++++++++++++
 gcc/config/aarch64/aarch64-fusion-pairs.def |   1 +
 gcc/config/aarch64/aarch64-tune.md          |   2 +-
 gcc/config/aarch64/aarch64.cc               |  64 ++++++++++++
 gcc/doc/invoke.texi                         |   2 +-
 6 files changed, 175 insertions(+), 2 deletions(-)
  

Comments

Philipp Tomsich Nov. 14, 2022, 1:54 p.m. UTC | #1
Applied to master as v2 with the requested changes (and the change to add
"ampere1a" in invoke.texi). Thanks!

Philipp.

On Mon, 14 Nov 2022 at 14:53, Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
>
> This patch adds support for Ampere-1A CPU:
>  - recognize the name of the core and provide detection for -mcpu=native,
>  - updated extra_costs,
>  - adds a new fusion pair for (A+B+1 and A-B-1).
>
> Ampere-1A and Ampere-1 have more timing difference than the extra
> costs indicate, but these don't propagate through to the headline
> items in our extra costs (e.g. the change in latency for scalar sqrt
> doesn't have a corresponding table entry).
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
>         * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
>         * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
>         Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
>         registers and then +1/-1).
>         * config/aarch64/aarch64-tune.md: Regenerate.
>         * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
>         idiom-matcher for the new fusion pair.
>         * doc/invoke.texi: Add ampere1a.
>
> Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> ---
>
> Changes in v2:
> - break line in fusion matcher to stay below 80 characters
> - rename fusion pair addsub_2reg_const1
> - document 'ampere1a' in invoke.texi
>
>  gcc/config/aarch64/aarch64-cores.def        |   1 +
>  gcc/config/aarch64/aarch64-cost-tables.h    | 107 ++++++++++++++++++++
>  gcc/config/aarch64/aarch64-fusion-pairs.def |   1 +
>  gcc/config/aarch64/aarch64-tune.md          |   2 +-
>  gcc/config/aarch64/aarch64.cc               |  64 ++++++++++++
>  gcc/doc/invoke.texi                         |   2 +-
>  6 files changed, 175 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
> index d2671778928..aead587cec1 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  V8A,  (CRC, CRYPTO), thu
>
>  /* Ampere Computing ('\xC0') cores. */
>  AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1)
> +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
>  /* Do not swap around "emag" and "xgene1",
>     this order is required to handle variant correctly. */
>  AARCH64_CORE("emag",        emag,      xgene1,    V8A,  (CRC, CRYPTO), emag, 0x50, 0x000, 3)
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
> index 760d7b30368..48522606fbe 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs =
>    }
>  };
>
> +const struct cpu_cost_table ampere1a_extra_costs =
> +{
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    COSTS_N_INSNS (1), /* shift_reg.  */
> +    0,                 /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    0,                 /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    0,                 /* extend.  */
> +    COSTS_N_INSNS (1), /* extend_arith.  */
> +    0,                 /* bfi.  */
> +    0,                 /* bfx.  */
> +    0,                 /* clz.  */
> +    0,                 /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      COSTS_N_INSNS (3),       /* flag_setting.  */
> +      COSTS_N_INSNS (3),       /* extend.  */
> +      COSTS_N_INSNS (4),       /* add.  */
> +      COSTS_N_INSNS (4),       /* extend_add.  */
> +      COSTS_N_INSNS (19)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (3),       /* extend.  */
> +      COSTS_N_INSNS (4),       /* add.  */
> +      COSTS_N_INSNS (4),       /* extend_add.  */
> +      COSTS_N_INSNS (35)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (4),         /* load.  */
> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
> +    0,                         /* ldrd (n/a).  */
> +    0,                         /* ldm_1st.  */
> +    0,                         /* ldm_regs_per_insn_1st.  */
> +    0,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (5),         /* loadf.  */
> +    COSTS_N_INSNS (5),         /* loadd.  */
> +    COSTS_N_INSNS (5),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    0,                         /* stm_regs_per_insn_1st.  */
> +    0,                         /* stm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (2),         /* storef.  */
> +    COSTS_N_INSNS (2),         /* stored.  */
> +    COSTS_N_INSNS (2),         /* store_unaligned.  */
> +    COSTS_N_INSNS (3),         /* loadv.  */
> +    COSTS_N_INSNS (3)          /* storev.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (25),      /* div.  */
> +      COSTS_N_INSNS (4),       /* mult.  */
> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
> +      COSTS_N_INSNS (4),       /* fma.  */
> +      COSTS_N_INSNS (4),       /* addsub.  */
> +      COSTS_N_INSNS (2),       /* fpconst.  */
> +      COSTS_N_INSNS (4),       /* neg.  */
> +      COSTS_N_INSNS (4),       /* compare.  */
> +      COSTS_N_INSNS (4),       /* widen.  */
> +      COSTS_N_INSNS (4),       /* narrow.  */
> +      COSTS_N_INSNS (4),       /* toint.  */
> +      COSTS_N_INSNS (4),       /* fromint.  */
> +      COSTS_N_INSNS (4)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (34),      /* div.  */
> +      COSTS_N_INSNS (5),       /* mult.  */
> +      COSTS_N_INSNS (5),       /* mult_addsub.  */
> +      COSTS_N_INSNS (5),       /* fma.  */
> +      COSTS_N_INSNS (5),       /* addsub.  */
> +      COSTS_N_INSNS (2),       /* fpconst.  */
> +      COSTS_N_INSNS (5),       /* neg.  */
> +      COSTS_N_INSNS (5),       /* compare.  */
> +      COSTS_N_INSNS (5),       /* widen.  */
> +      COSTS_N_INSNS (5),       /* narrow.  */
> +      COSTS_N_INSNS (6),       /* toint.  */
> +      COSTS_N_INSNS (6),       /* fromint.  */
> +      COSTS_N_INSNS (5)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (3),  /* alu.  */
> +    COSTS_N_INSNS (3),  /* mult.  */
> +    COSTS_N_INSNS (2),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
> +  }
> +};
> +
>  #endif
> diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
> index c064fb9b85d..d91f8a2babd 100644
> --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
> +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
> @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
>  AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
>  AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
>  AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
> +AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
>
>  #undef AARCH64_FUSION_PAIR
> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
> index 22ec1be5a4c..b7d6fc8cc88 100644
> --- a/gcc/config/aarch64/aarch64-tune.md
> +++ b/gcc/config/aarch64/aarch64-tune.md
> @@ -1,5 +1,5 @@
>  ;; -*- buffer-read-only: t -*-
>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>  (define_attr "tune"
> -       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
> +       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
>         (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d1f979ebcf8..a7f7c3c0121 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings =
>    &ampere1_prefetch_tune
>  };
>
> +static const struct tune_params ampere1a_tunings =
> +{
> +  &ampere1a_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &ampere1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> +   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
> +   AARCH64_FUSE_ADDSUB_2REG_CONST1),
> +  /* fusible_ops  */
> +  "32",                /* function_align.  */
> +  "4",         /* jump_align.  */
> +  "32:16",     /* loop_align.  */
> +  2,   /* int_reassoc_width.  */
> +  4,   /* fp_reassoc_width.  */
> +  2,   /* vec_reassoc_width.  */
> +  2,   /* min_div_recip_mul_sf.  */
> +  2,   /* min_div_recip_mul_df.  */
> +  0,   /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),           /* tune_flags.  */
> +  &ampere1_prefetch_tune
> +};
> +
>  static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
>  {
>    2, /* int_stmt_cost  */
> @@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
>         }
>      }
>
> +  /* Fuse A+B+1 and A-B-1 */
> +  if (simple_sets_p
> +      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
> +    {
> +      /* We're trying to match:
> +         prev == (set (r0) (plus (r0) (r1)))
> +         curr == (set (r0) (plus (r0) (const_int 1)))
> +       or:
> +         prev == (set (r0) (minus (r0) (r1)))
> +         curr == (set (r0) (plus (r0) (const_int -1))) */
> +
> +      rtx prev_src = SET_SRC (prev_set);
> +      rtx curr_src = SET_SRC (curr_set);
> +
> +      int polarity = 1;
> +      if (GET_CODE (prev_src) == MINUS)
> +       polarity = -1;
> +
> +      if (GET_CODE (curr_src) == PLUS
> +         && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
> +         && CONST_INT_P (XEXP (curr_src, 1))
> +         && INTVAL (XEXP (curr_src, 1)) == polarity
> +         && REG_P (XEXP (curr_src, 0))
> +         && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
> +       return true;
> +    }
> +
>    return false;
>  }
>
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 60e65f4eaa5..09c8b312ae7 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -19995,7 +19995,7 @@ performance of the code.  Permissible values for this option are:
>  @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
>  @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
>  @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1},
> -@samp{native}.
> +@samp{ampere1a}, and @samp{native}.
>
>  The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
>  @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
> --
> 2.34.1
>
  
Philipp Tomsich Nov. 14, 2022, 2:56 p.m. UTC | #2
Richard,

is this OK for backport to GCC-12 and GCC-11?

Thanks,
Philipp.

On Mon, 14 Nov 2022 at 14:53, Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
>
> This patch adds support for Ampere-1A CPU:
>  - recognize the name of the core and provide detection for -mcpu=native,
>  - updated extra_costs,
>  - adds a new fusion pair for (A+B+1 and A-B-1).
>
> Ampere-1A and Ampere-1 have more timing difference than the extra
> costs indicate, but these don't propagate through to the headline
> items in our extra costs (e.g. the change in latency for scalar sqrt
> doesn't have a corresponding table entry).
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
>         * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
>         * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
>         Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
>         registers and then +1/-1).
>         * config/aarch64/aarch64-tune.md: Regenerate.
>         * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
>         idiom-matcher for the new fusion pair.
>         * doc/invoke.texi: Add ampere1a.
>
> Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> ---
>
> Changes in v2:
> - break line in fusion matcher to stay below 80 characters
> - rename fusion pair addsub_2reg_const1
> - document 'ampere1a' in invoke.texi
>
>  gcc/config/aarch64/aarch64-cores.def        |   1 +
>  gcc/config/aarch64/aarch64-cost-tables.h    | 107 ++++++++++++++++++++
>  gcc/config/aarch64/aarch64-fusion-pairs.def |   1 +
>  gcc/config/aarch64/aarch64-tune.md          |   2 +-
>  gcc/config/aarch64/aarch64.cc               |  64 ++++++++++++
>  gcc/doc/invoke.texi                         |   2 +-
>  6 files changed, 175 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
> index d2671778928..aead587cec1 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  V8A,  (CRC, CRYPTO), thu
>
>  /* Ampere Computing ('\xC0') cores. */
>  AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1)
> +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
>  /* Do not swap around "emag" and "xgene1",
>     this order is required to handle variant correctly. */
>  AARCH64_CORE("emag",        emag,      xgene1,    V8A,  (CRC, CRYPTO), emag, 0x50, 0x000, 3)
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
> index 760d7b30368..48522606fbe 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs =
>    }
>  };
>
> +const struct cpu_cost_table ampere1a_extra_costs =
> +{
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    COSTS_N_INSNS (1), /* shift_reg.  */
> +    0,                 /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    0,                 /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    0,                 /* extend.  */
> +    COSTS_N_INSNS (1), /* extend_arith.  */
> +    0,                 /* bfi.  */
> +    0,                 /* bfx.  */
> +    0,                 /* clz.  */
> +    0,                 /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      COSTS_N_INSNS (3),       /* flag_setting.  */
> +      COSTS_N_INSNS (3),       /* extend.  */
> +      COSTS_N_INSNS (4),       /* add.  */
> +      COSTS_N_INSNS (4),       /* extend_add.  */
> +      COSTS_N_INSNS (19)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (3),       /* extend.  */
> +      COSTS_N_INSNS (4),       /* add.  */
> +      COSTS_N_INSNS (4),       /* extend_add.  */
> +      COSTS_N_INSNS (35)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (4),         /* load.  */
> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
> +    0,                         /* ldrd (n/a).  */
> +    0,                         /* ldm_1st.  */
> +    0,                         /* ldm_regs_per_insn_1st.  */
> +    0,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (5),         /* loadf.  */
> +    COSTS_N_INSNS (5),         /* loadd.  */
> +    COSTS_N_INSNS (5),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    0,                         /* stm_regs_per_insn_1st.  */
> +    0,                         /* stm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (2),         /* storef.  */
> +    COSTS_N_INSNS (2),         /* stored.  */
> +    COSTS_N_INSNS (2),         /* store_unaligned.  */
> +    COSTS_N_INSNS (3),         /* loadv.  */
> +    COSTS_N_INSNS (3)          /* storev.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (25),      /* div.  */
> +      COSTS_N_INSNS (4),       /* mult.  */
> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
> +      COSTS_N_INSNS (4),       /* fma.  */
> +      COSTS_N_INSNS (4),       /* addsub.  */
> +      COSTS_N_INSNS (2),       /* fpconst.  */
> +      COSTS_N_INSNS (4),       /* neg.  */
> +      COSTS_N_INSNS (4),       /* compare.  */
> +      COSTS_N_INSNS (4),       /* widen.  */
> +      COSTS_N_INSNS (4),       /* narrow.  */
> +      COSTS_N_INSNS (4),       /* toint.  */
> +      COSTS_N_INSNS (4),       /* fromint.  */
> +      COSTS_N_INSNS (4)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (34),      /* div.  */
> +      COSTS_N_INSNS (5),       /* mult.  */
> +      COSTS_N_INSNS (5),       /* mult_addsub.  */
> +      COSTS_N_INSNS (5),       /* fma.  */
> +      COSTS_N_INSNS (5),       /* addsub.  */
> +      COSTS_N_INSNS (2),       /* fpconst.  */
> +      COSTS_N_INSNS (5),       /* neg.  */
> +      COSTS_N_INSNS (5),       /* compare.  */
> +      COSTS_N_INSNS (5),       /* widen.  */
> +      COSTS_N_INSNS (5),       /* narrow.  */
> +      COSTS_N_INSNS (6),       /* toint.  */
> +      COSTS_N_INSNS (6),       /* fromint.  */
> +      COSTS_N_INSNS (5)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (3),  /* alu.  */
> +    COSTS_N_INSNS (3),  /* mult.  */
> +    COSTS_N_INSNS (2),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
> +  }
> +};
> +
>  #endif
> diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
> index c064fb9b85d..d91f8a2babd 100644
> --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
> +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
> @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
>  AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
>  AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
>  AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
> +AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
>
>  #undef AARCH64_FUSION_PAIR
> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
> index 22ec1be5a4c..b7d6fc8cc88 100644
> --- a/gcc/config/aarch64/aarch64-tune.md
> +++ b/gcc/config/aarch64/aarch64-tune.md
> @@ -1,5 +1,5 @@
>  ;; -*- buffer-read-only: t -*-
>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>  (define_attr "tune"
> -       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
> +       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
>         (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d1f979ebcf8..a7f7c3c0121 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings =
>    &ampere1_prefetch_tune
>  };
>
> +static const struct tune_params ampere1a_tunings =
> +{
> +  &ampere1a_extra_costs,
> +  &generic_addrcost_table,
> +  &generic_regmove_cost,
> +  &ampere1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    4, /* store_int.  */
> +    4, /* load_fp.  */
> +    4, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> +   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
> +   AARCH64_FUSE_ADDSUB_2REG_CONST1),
> +  /* fusible_ops  */
> +  "32",                /* function_align.  */
> +  "4",         /* jump_align.  */
> +  "32:16",     /* loop_align.  */
> +  2,   /* int_reassoc_width.  */
> +  4,   /* fp_reassoc_width.  */
> +  2,   /* vec_reassoc_width.  */
> +  2,   /* min_div_recip_mul_sf.  */
> +  2,   /* min_div_recip_mul_df.  */
> +  0,   /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_NONE),           /* tune_flags.  */
> +  &ampere1_prefetch_tune
> +};
> +
>  static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
>  {
>    2, /* int_stmt_cost  */
> @@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
>         }
>      }
>
> +  /* Fuse A+B+1 and A-B-1 */
> +  if (simple_sets_p
> +      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
> +    {
> +      /* We're trying to match:
> +         prev == (set (r0) (plus (r0) (r1)))
> +         curr == (set (r0) (plus (r0) (const_int 1)))
> +       or:
> +         prev == (set (r0) (minus (r0) (r1)))
> +         curr == (set (r0) (plus (r0) (const_int -1))) */
> +
> +      rtx prev_src = SET_SRC (prev_set);
> +      rtx curr_src = SET_SRC (curr_set);
> +
> +      int polarity = 1;
> +      if (GET_CODE (prev_src) == MINUS)
> +       polarity = -1;
> +
> +      if (GET_CODE (curr_src) == PLUS
> +         && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
> +         && CONST_INT_P (XEXP (curr_src, 1))
> +         && INTVAL (XEXP (curr_src, 1)) == polarity
> +         && REG_P (XEXP (curr_src, 0))
> +         && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
> +       return true;
> +    }
> +
>    return false;
>  }
>
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 60e65f4eaa5..09c8b312ae7 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -19995,7 +19995,7 @@ performance of the code.  Permissible values for this option are:
>  @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
>  @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
>  @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1},
> -@samp{native}.
> +@samp{ampere1a}, and @samp{native}.
>
>  The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
>  @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
> --
> 2.34.1
>
  
Richard Sandiford Nov. 15, 2022, 10:43 a.m. UTC | #3
Philipp Tomsich <philipp.tomsich@vrull.eu> writes:
> Richard,
>
> is this OK for backport to GCC-12 and GCC-11?

The fusion part seems potentially risky for a stable branch, but since
it's conditional on the new flag (and thus new CPU), I think it should
be OK.

So yeah, OK for both, thanks.

Richard

> Thanks,
> Philipp.
>
> On Mon, 14 Nov 2022 at 14:53, Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
>>
>> This patch adds support for Ampere-1A CPU:
>>  - recognize the name of the core and provide detection for -mcpu=native,
>>  - updated extra_costs,
>>  - adds a new fusion pair for (A+B+1 and A-B-1).
>>
>> Ampere-1A and Ampere-1 have more timing difference than the extra
>> costs indicate, but these don't propagate through to the headline
>> items in our extra costs (e.g. the change in latency for scalar sqrt
>> doesn't have a corresponding table entry).
>>
>> gcc/ChangeLog:
>>
>>         * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
>>         * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
>>         * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
>>         Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
>>         registers and then +1/-1).
>>         * config/aarch64/aarch64-tune.md: Regenerate.
>>         * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
>>         idiom-matcher for the new fusion pair.
>>         * doc/invoke.texi: Add ampere1a.
>>
>> Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
>> ---
>>
>> Changes in v2:
>> - break line in fusion matcher to stay below 80 characters
>> - rename fusion pair addsub_2reg_const1
>> - document 'ampere1a' in invoke.texi
>>
>>  gcc/config/aarch64/aarch64-cores.def        |   1 +
>>  gcc/config/aarch64/aarch64-cost-tables.h    | 107 ++++++++++++++++++++
>>  gcc/config/aarch64/aarch64-fusion-pairs.def |   1 +
>>  gcc/config/aarch64/aarch64-tune.md          |   2 +-
>>  gcc/config/aarch64/aarch64.cc               |  64 ++++++++++++
>>  gcc/doc/invoke.texi                         |   2 +-
>>  6 files changed, 175 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
>> index d2671778928..aead587cec1 100644
>> --- a/gcc/config/aarch64/aarch64-cores.def
>> +++ b/gcc/config/aarch64/aarch64-cores.def
>> @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  V8A,  (CRC, CRYPTO), thu
>>
>>  /* Ampere Computing ('\xC0') cores. */
>>  AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1)
>> +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
>>  /* Do not swap around "emag" and "xgene1",
>>     this order is required to handle variant correctly. */
>>  AARCH64_CORE("emag",        emag,      xgene1,    V8A,  (CRC, CRYPTO), emag, 0x50, 0x000, 3)
>> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
>> index 760d7b30368..48522606fbe 100644
>> --- a/gcc/config/aarch64/aarch64-cost-tables.h
>> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
>> @@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs =
>>    }
>>  };
>>
>> +const struct cpu_cost_table ampere1a_extra_costs =
>> +{
>> +  /* ALU */
>> +  {
>> +    0,                 /* arith.  */
>> +    0,                 /* logical.  */
>> +    0,                 /* shift.  */
>> +    COSTS_N_INSNS (1), /* shift_reg.  */
>> +    0,                 /* arith_shift.  */
>> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
>> +    0,                 /* log_shift.  */
>> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
>> +    0,                 /* extend.  */
>> +    COSTS_N_INSNS (1), /* extend_arith.  */
>> +    0,                 /* bfi.  */
>> +    0,                 /* bfx.  */
>> +    0,                 /* clz.  */
>> +    0,                 /* rev.  */
>> +    0,                 /* non_exec.  */
>> +    true               /* non_exec_costs_exec.  */
>> +  },
>> +  {
>> +    /* MULT SImode */
>> +    {
>> +      COSTS_N_INSNS (3),       /* simple.  */
>> +      COSTS_N_INSNS (3),       /* flag_setting.  */
>> +      COSTS_N_INSNS (3),       /* extend.  */
>> +      COSTS_N_INSNS (4),       /* add.  */
>> +      COSTS_N_INSNS (4),       /* extend_add.  */
>> +      COSTS_N_INSNS (19)       /* idiv.  */
>> +    },
>> +    /* MULT DImode */
>> +    {
>> +      COSTS_N_INSNS (3),       /* simple.  */
>> +      0,                       /* flag_setting (N/A).  */
>> +      COSTS_N_INSNS (3),       /* extend.  */
>> +      COSTS_N_INSNS (4),       /* add.  */
>> +      COSTS_N_INSNS (4),       /* extend_add.  */
>> +      COSTS_N_INSNS (35)       /* idiv.  */
>> +    }
>> +  },
>> +  /* LD/ST */
>> +  {
>> +    COSTS_N_INSNS (4),         /* load.  */
>> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
>> +    0,                         /* ldrd (n/a).  */
>> +    0,                         /* ldm_1st.  */
>> +    0,                         /* ldm_regs_per_insn_1st.  */
>> +    0,                         /* ldm_regs_per_insn_subsequent.  */
>> +    COSTS_N_INSNS (5),         /* loadf.  */
>> +    COSTS_N_INSNS (5),         /* loadd.  */
>> +    COSTS_N_INSNS (5),         /* load_unaligned.  */
>> +    0,                         /* store.  */
>> +    0,                         /* strd.  */
>> +    0,                         /* stm_1st.  */
>> +    0,                         /* stm_regs_per_insn_1st.  */
>> +    0,                         /* stm_regs_per_insn_subsequent.  */
>> +    COSTS_N_INSNS (2),         /* storef.  */
>> +    COSTS_N_INSNS (2),         /* stored.  */
>> +    COSTS_N_INSNS (2),         /* store_unaligned.  */
>> +    COSTS_N_INSNS (3),         /* loadv.  */
>> +    COSTS_N_INSNS (3)          /* storev.  */
>> +  },
>> +  {
>> +    /* FP SFmode */
>> +    {
>> +      COSTS_N_INSNS (25),      /* div.  */
>> +      COSTS_N_INSNS (4),       /* mult.  */
>> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
>> +      COSTS_N_INSNS (4),       /* fma.  */
>> +      COSTS_N_INSNS (4),       /* addsub.  */
>> +      COSTS_N_INSNS (2),       /* fpconst.  */
>> +      COSTS_N_INSNS (4),       /* neg.  */
>> +      COSTS_N_INSNS (4),       /* compare.  */
>> +      COSTS_N_INSNS (4),       /* widen.  */
>> +      COSTS_N_INSNS (4),       /* narrow.  */
>> +      COSTS_N_INSNS (4),       /* toint.  */
>> +      COSTS_N_INSNS (4),       /* fromint.  */
>> +      COSTS_N_INSNS (4)        /* roundint.  */
>> +    },
>> +    /* FP DFmode */
>> +    {
>> +      COSTS_N_INSNS (34),      /* div.  */
>> +      COSTS_N_INSNS (5),       /* mult.  */
>> +      COSTS_N_INSNS (5),       /* mult_addsub.  */
>> +      COSTS_N_INSNS (5),       /* fma.  */
>> +      COSTS_N_INSNS (5),       /* addsub.  */
>> +      COSTS_N_INSNS (2),       /* fpconst.  */
>> +      COSTS_N_INSNS (5),       /* neg.  */
>> +      COSTS_N_INSNS (5),       /* compare.  */
>> +      COSTS_N_INSNS (5),       /* widen.  */
>> +      COSTS_N_INSNS (5),       /* narrow.  */
>> +      COSTS_N_INSNS (6),       /* toint.  */
>> +      COSTS_N_INSNS (6),       /* fromint.  */
>> +      COSTS_N_INSNS (5)        /* roundint.  */
>> +    }
>> +  },
>> +  /* Vector */
>> +  {
>> +    COSTS_N_INSNS (3),  /* alu.  */
>> +    COSTS_N_INSNS (3),  /* mult.  */
>> +    COSTS_N_INSNS (2),  /* movi.  */
>> +    COSTS_N_INSNS (2),  /* dup.  */
>> +    COSTS_N_INSNS (2)   /* extract.  */
>> +  }
>> +};
>> +
>>  #endif
>> diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
>> index c064fb9b85d..d91f8a2babd 100644
>> --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
>> +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
>> @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
>>  AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
>>  AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
>>  AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
>> +AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
>>
>>  #undef AARCH64_FUSION_PAIR
>> diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
>> index 22ec1be5a4c..b7d6fc8cc88 100644
>> --- a/gcc/config/aarch64/aarch64-tune.md
>> +++ b/gcc/config/aarch64/aarch64-tune.md
>> @@ -1,5 +1,5 @@
>>  ;; -*- buffer-read-only: t -*-
>>  ;; Generated automatically by gentune.sh from aarch64-cores.def
>>  (define_attr "tune"
>> -       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
>> +       "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
>>         (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index d1f979ebcf8..a7f7c3c0121 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings =
>>    &ampere1_prefetch_tune
>>  };
>>
>> +static const struct tune_params ampere1a_tunings =
>> +{
>> +  &ampere1a_extra_costs,
>> +  &generic_addrcost_table,
>> +  &generic_regmove_cost,
>> +  &ampere1_vector_cost,
>> +  &generic_branch_cost,
>> +  &generic_approx_modes,
>> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
>> +  { 4, /* load_int.  */
>> +    4, /* store_int.  */
>> +    4, /* load_fp.  */
>> +    4, /* store_fp.  */
>> +    4, /* load_pred.  */
>> +    4 /* store_pred.  */
>> +  }, /* memmov_cost.  */
>> +  4, /* issue_rate  */
>> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
>> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
>> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
>> +   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
>> +   AARCH64_FUSE_ADDSUB_2REG_CONST1),
>> +  /* fusible_ops  */
>> +  "32",                /* function_align.  */
>> +  "4",         /* jump_align.  */
>> +  "32:16",     /* loop_align.  */
>> +  2,   /* int_reassoc_width.  */
>> +  4,   /* fp_reassoc_width.  */
>> +  2,   /* vec_reassoc_width.  */
>> +  2,   /* min_div_recip_mul_sf.  */
>> +  2,   /* min_div_recip_mul_df.  */
>> +  0,   /* max_case_values.  */
>> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
>> +  (AARCH64_EXTRA_TUNE_NONE),           /* tune_flags.  */
>> +  &ampere1_prefetch_tune
>> +};
>> +
>>  static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
>>  {
>>    2, /* int_stmt_cost  */
>> @@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
>>         }
>>      }
>>
>> +  /* Fuse A+B+1 and A-B-1 */
>> +  if (simple_sets_p
>> +      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
>> +    {
>> +      /* We're trying to match:
>> +         prev == (set (r0) (plus (r0) (r1)))
>> +         curr == (set (r0) (plus (r0) (const_int 1)))
>> +       or:
>> +         prev == (set (r0) (minus (r0) (r1)))
>> +         curr == (set (r0) (plus (r0) (const_int -1))) */
>> +
>> +      rtx prev_src = SET_SRC (prev_set);
>> +      rtx curr_src = SET_SRC (curr_set);
>> +
>> +      int polarity = 1;
>> +      if (GET_CODE (prev_src) == MINUS)
>> +       polarity = -1;
>> +
>> +      if (GET_CODE (curr_src) == PLUS
>> +         && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
>> +         && CONST_INT_P (XEXP (curr_src, 1))
>> +         && INTVAL (XEXP (curr_src, 1)) == polarity
>> +         && REG_P (XEXP (curr_src, 0))
>> +         && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
>> +       return true;
>> +    }
>> +
>>    return false;
>>  }
>>
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index 60e65f4eaa5..09c8b312ae7 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -19995,7 +19995,7 @@ performance of the code.  Permissible values for this option are:
>>  @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
>>  @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
>>  @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1},
>> -@samp{native}.
>> +@samp{ampere1a}, and @samp{native}.
>>
>>  The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
>>  @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
>> --
>> 2.34.1
>>
  
Philipp Tomsich Nov. 16, 2022, 10:47 p.m. UTC | #4
Backports applied to releases/gcc-11 and releases/gcc-12. Thanks!
Philipp.

On Tue, 15 Nov 2022 at 11:43, Richard Sandiford <richard.sandiford@arm.com>
wrote:

> Philipp Tomsich <philipp.tomsich@vrull.eu> writes:
> > Richard,
> >
> > is this OK for backport to GCC-12 and GCC-11?
>
> The fusion part seems potentially risky for a stable branch, but since
> it's conditional on the new flag (and thus new CPU), I think it should
> be OK.
>
> So yeah, OK for both, thanks.
>
> Richard
>
> > Thanks,
> > Philipp.
> >
> > On Mon, 14 Nov 2022 at 14:53, Philipp Tomsich <philipp.tomsich@vrull.eu> wrote:
> >>
> >> This patch adds support for Ampere-1A CPU:
> >>  - recognize the name of the core and provide detection for -mcpu=native,
> >>  - updated extra_costs,
> >>  - adds a new fusion pair for (A+B+1 and A-B-1).
> >>
> >> Ampere-1A and Ampere-1 have more timing difference than the extra
> >> costs indicate, but these don't propagate through to the headline
> >> items in our extra costs (e.g. the change in latency for scalar sqrt
> >> doesn't have a corresponding table entry).
> >>
> >> gcc/ChangeLog:
> >>
> >>         * config/aarch64/aarch64-cores.def (AARCH64_CORE): Add ampere1a.
> >>         * config/aarch64/aarch64-cost-tables.h: Add ampere1a_extra_costs.
> >>         * config/aarch64/aarch64-fusion-pairs.def (AARCH64_FUSION_PAIR):
> >>         Define a new fusion pair for A+B+1/A-B-1 (i.e., add/subtract two
> >>         registers and then +1/-1).
> >>         * config/aarch64/aarch64-tune.md: Regenerate.
> >>         * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
> >>         idiom-matcher for the new fusion pair.
> >>         * doc/invoke.texi: Add ampere1a.
> >>
> >> Signed-off-by: Philipp Tomsich <philipp.tomsich@vrull.eu>
> >> ---
> >>
> >> Changes in v2:
> >> - break line in fusion matcher to stay below 80 characters
> >> - rename fusion pair addsub_2reg_const1
> >> - document 'ampere1a' in invoke.texi
> >>
> >>  gcc/config/aarch64/aarch64-cores.def        |   1 +
> >>  gcc/config/aarch64/aarch64-cost-tables.h    | 107 ++++++++++++++++++++
> >>  gcc/config/aarch64/aarch64-fusion-pairs.def |   1 +
> >>  gcc/config/aarch64/aarch64-tune.md          |   2 +-
> >>  gcc/config/aarch64/aarch64.cc               |  64 ++++++++++++
> >>  gcc/doc/invoke.texi                         |   2 +-
> >>  6 files changed, 175 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/gcc/config/aarch64/aarch64-cores.def
> b/gcc/config/aarch64/aarch64-cores.def
> >> index d2671778928..aead587cec1 100644
> >> --- a/gcc/config/aarch64/aarch64-cores.def
> >> +++ b/gcc/config/aarch64/aarch64-cores.def
> >> @@ -70,6 +70,7 @@ AARCH64_CORE("thunderxt83",   thunderxt83,
>  thunderx,  V8A,  (CRC, CRYPTO), thu
> >>
> >>  /* Ampere Computing ('\xC0') cores. */
> >>  AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES,
> SHA3), ampere1, 0xC0, 0xac3, -1)
> >> +AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES,
> SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
> >>  /* Do not swap around "emag" and "xgene1",
> >>     this order is required to handle variant correctly. */
> >>  AARCH64_CORE("emag",        emag,      xgene1,    V8A,  (CRC, CRYPTO),
> emag, 0x50, 0x000, 3)
> >> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h
> b/gcc/config/aarch64/aarch64-cost-tables.h
> >> index 760d7b30368..48522606fbe 100644
> >> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> >> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> >> @@ -775,4 +775,111 @@ const struct cpu_cost_table ampere1_extra_costs =
> >>    }
> >>  };
> >>
> >> +const struct cpu_cost_table ampere1a_extra_costs =
> >> +{
> >> +  /* ALU */
> >> +  {
> >> +    0,                 /* arith.  */
> >> +    0,                 /* logical.  */
> >> +    0,                 /* shift.  */
> >> +    COSTS_N_INSNS (1), /* shift_reg.  */
> >> +    0,                 /* arith_shift.  */
> >> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> >> +    0,                 /* log_shift.  */
> >> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> >> +    0,                 /* extend.  */
> >> +    COSTS_N_INSNS (1), /* extend_arith.  */
> >> +    0,                 /* bfi.  */
> >> +    0,                 /* bfx.  */
> >> +    0,                 /* clz.  */
> >> +    0,                 /* rev.  */
> >> +    0,                 /* non_exec.  */
> >> +    true               /* non_exec_costs_exec.  */
> >> +  },
> >> +  {
> >> +    /* MULT SImode */
> >> +    {
> >> +      COSTS_N_INSNS (3),       /* simple.  */
> >> +      COSTS_N_INSNS (3),       /* flag_setting.  */
> >> +      COSTS_N_INSNS (3),       /* extend.  */
> >> +      COSTS_N_INSNS (4),       /* add.  */
> >> +      COSTS_N_INSNS (4),       /* extend_add.  */
> >> +      COSTS_N_INSNS (19)       /* idiv.  */
> >> +    },
> >> +    /* MULT DImode */
> >> +    {
> >> +      COSTS_N_INSNS (3),       /* simple.  */
> >> +      0,                       /* flag_setting (N/A).  */
> >> +      COSTS_N_INSNS (3),       /* extend.  */
> >> +      COSTS_N_INSNS (4),       /* add.  */
> >> +      COSTS_N_INSNS (4),       /* extend_add.  */
> >> +      COSTS_N_INSNS (35)       /* idiv.  */
> >> +    }
> >> +  },
> >> +  /* LD/ST */
> >> +  {
> >> +    COSTS_N_INSNS (4),         /* load.  */
> >> +    COSTS_N_INSNS (4),         /* load_sign_extend.  */
> >> +    0,                         /* ldrd (n/a).  */
> >> +    0,                         /* ldm_1st.  */
> >> +    0,                         /* ldm_regs_per_insn_1st.  */
> >> +    0,                         /* ldm_regs_per_insn_subsequent.  */
> >> +    COSTS_N_INSNS (5),         /* loadf.  */
> >> +    COSTS_N_INSNS (5),         /* loadd.  */
> >> +    COSTS_N_INSNS (5),         /* load_unaligned.  */
> >> +    0,                         /* store.  */
> >> +    0,                         /* strd.  */
> >> +    0,                         /* stm_1st.  */
> >> +    0,                         /* stm_regs_per_insn_1st.  */
> >> +    0,                         /* stm_regs_per_insn_subsequent.  */
> >> +    COSTS_N_INSNS (2),         /* storef.  */
> >> +    COSTS_N_INSNS (2),         /* stored.  */
> >> +    COSTS_N_INSNS (2),         /* store_unaligned.  */
> >> +    COSTS_N_INSNS (3),         /* loadv.  */
> >> +    COSTS_N_INSNS (3)          /* storev.  */
> >> +  },
> >> +  {
> >> +    /* FP SFmode */
> >> +    {
> >> +      COSTS_N_INSNS (25),      /* div.  */
> >> +      COSTS_N_INSNS (4),       /* mult.  */
> >> +      COSTS_N_INSNS (4),       /* mult_addsub.  */
> >> +      COSTS_N_INSNS (4),       /* fma.  */
> >> +      COSTS_N_INSNS (4),       /* addsub.  */
> >> +      COSTS_N_INSNS (2),       /* fpconst.  */
> >> +      COSTS_N_INSNS (4),       /* neg.  */
> >> +      COSTS_N_INSNS (4),       /* compare.  */
> >> +      COSTS_N_INSNS (4),       /* widen.  */
> >> +      COSTS_N_INSNS (4),       /* narrow.  */
> >> +      COSTS_N_INSNS (4),       /* toint.  */
> >> +      COSTS_N_INSNS (4),       /* fromint.  */
> >> +      COSTS_N_INSNS (4)        /* roundint.  */
> >> +    },
> >> +    /* FP DFmode */
> >> +    {
> >> +      COSTS_N_INSNS (34),      /* div.  */
> >> +      COSTS_N_INSNS (5),       /* mult.  */
> >> +      COSTS_N_INSNS (5),       /* mult_addsub.  */
> >> +      COSTS_N_INSNS (5),       /* fma.  */
> >> +      COSTS_N_INSNS (5),       /* addsub.  */
> >> +      COSTS_N_INSNS (2),       /* fpconst.  */
> >> +      COSTS_N_INSNS (5),       /* neg.  */
> >> +      COSTS_N_INSNS (5),       /* compare.  */
> >> +      COSTS_N_INSNS (5),       /* widen.  */
> >> +      COSTS_N_INSNS (5),       /* narrow.  */
> >> +      COSTS_N_INSNS (6),       /* toint.  */
> >> +      COSTS_N_INSNS (6),       /* fromint.  */
> >> +      COSTS_N_INSNS (5)        /* roundint.  */
> >> +    }
> >> +  },
> >> +  /* Vector */
> >> +  {
> >> +    COSTS_N_INSNS (3),  /* alu.  */
> >> +    COSTS_N_INSNS (3),  /* mult.  */
> >> +    COSTS_N_INSNS (2),  /* movi.  */
> >> +    COSTS_N_INSNS (2),  /* dup.  */
> >> +    COSTS_N_INSNS (2)   /* extract.  */
> >> +  }
> >> +};
> >> +
> >>  #endif
> >> diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def
> b/gcc/config/aarch64/aarch64-fusion-pairs.def
> >> index c064fb9b85d..d91f8a2babd 100644
> >> --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
> >> +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
> >> @@ -36,5 +36,6 @@ AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
> >>  AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
> >>  AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
> >>  AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
> >> +AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
> >>
> >>  #undef AARCH64_FUSION_PAIR
> >> diff --git a/gcc/config/aarch64/aarch64-tune.md
> b/gcc/config/aarch64/aarch64-tune.md
> >> index 22ec1be5a4c..b7d6fc8cc88 100644
> >> --- a/gcc/config/aarch64/aarch64-tune.md
> >> +++ b/gcc/config/aarch64/aarch64-tune.md
> >> @@ -1,5 +1,5 @@
> >>  ;; -*- buffer-read-only: t -*-
> >>  ;; Generated automatically by gentune.sh from aarch64-cores.def
> >>  (define_attr "tune"
> >> -
>  "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
> >> +
>  "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
> >>         (const (symbol_ref "((enum attr_tune) aarch64_tune)")))
> >> diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc
> >> index d1f979ebcf8..a7f7c3c0121 100644
> >> --- a/gcc/config/aarch64/aarch64.cc
> >> +++ b/gcc/config/aarch64/aarch64.cc
> >> @@ -1921,6 +1921,43 @@ static const struct tune_params ampere1_tunings =
> >>    &ampere1_prefetch_tune
> >>  };
> >>
> >> +static const struct tune_params ampere1a_tunings =
> >> +{
> >> +  &ampere1a_extra_costs,
> >> +  &generic_addrcost_table,
> >> +  &generic_regmove_cost,
> >> +  &ampere1_vector_cost,
> >> +  &generic_branch_cost,
> >> +  &generic_approx_modes,
> >> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> >> +  { 4, /* load_int.  */
> >> +    4, /* store_int.  */
> >> +    4, /* load_fp.  */
> >> +    4, /* store_fp.  */
> >> +    4, /* load_pred.  */
> >> +    4 /* store_pred.  */
> >> +  }, /* memmov_cost.  */
> >> +  4, /* issue_rate  */
> >> +  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
> >> +   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
> >> +   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
> >> +   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
> >> +   AARCH64_FUSE_ADDSUB_2REG_CONST1),
> >> +  /* fusible_ops  */
> >> +  "32",                /* function_align.  */
> >> +  "4",         /* jump_align.  */
> >> +  "32:16",     /* loop_align.  */
> >> +  2,   /* int_reassoc_width.  */
> >> +  4,   /* fp_reassoc_width.  */
> >> +  2,   /* vec_reassoc_width.  */
> >> +  2,   /* min_div_recip_mul_sf.  */
> >> +  2,   /* min_div_recip_mul_df.  */
> >> +  0,   /* max_case_values.  */
> >> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> >> +  (AARCH64_EXTRA_TUNE_NONE),           /* tune_flags.  */
> >> +  &ampere1_prefetch_tune
> >> +};
> >> +
> >>  static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
> >>  {
> >>    2, /* int_stmt_cost  */
> >> @@ -25539,6 +25576,33 @@ aarch_macro_fusion_pair_p (rtx_insn *prev,
> rtx_insn *curr)
> >>         }
> >>      }
> >>
> >> +  /* Fuse A+B+1 and A-B-1 */
> >> +  if (simple_sets_p
> >> +      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
> >> +    {
> >> +      /* We're trying to match:
> >> +         prev == (set (r0) (plus (r0) (r1)))
> >> +         curr == (set (r0) (plus (r0) (const_int 1)))
> >> +       or:
> >> +         prev == (set (r0) (minus (r0) (r1)))
> >> +         curr == (set (r0) (plus (r0) (const_int -1))) */
> >> +
> >> +      rtx prev_src = SET_SRC (prev_set);
> >> +      rtx curr_src = SET_SRC (curr_set);
> >> +
> >> +      int polarity = 1;
> >> +      if (GET_CODE (prev_src) == MINUS)
> >> +       polarity = -1;
> >> +
> >> +      if (GET_CODE (curr_src) == PLUS
> >> +         && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) ==
> MINUS)
> >> +         && CONST_INT_P (XEXP (curr_src, 1))
> >> +         && INTVAL (XEXP (curr_src, 1)) == polarity
> >> +         && REG_P (XEXP (curr_src, 0))
> >> +         && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
> >> +       return true;
> >> +    }
> >> +
> >>    return false;
> >>  }
> >>
> >> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> >> index 60e65f4eaa5..09c8b312ae7 100644
> >> --- a/gcc/doc/invoke.texi
> >> +++ b/gcc/doc/invoke.texi
> >> @@ -19995,7 +19995,7 @@ performance of the code.  Permissible values
> for this option are:
> >>  @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
> >>  @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c},
> @samp{cortex-x2},
> >>  @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715},
> @samp{ampere1},
> >> -@samp{native}.
> >> +@samp{ampere1a}, and @samp{native}.
> >>
> >>  The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
> >>  @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
> >> --
> >> 2.34.1
> >>
>
  

Patch

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index d2671778928..aead587cec1 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -70,6 +70,7 @@  AARCH64_CORE("thunderxt83",   thunderxt83,   thunderx,  V8A,  (CRC, CRYPTO), thu
 
 /* Ampere Computing ('\xC0') cores. */
 AARCH64_CORE("ampere1", ampere1, cortexa57, V8_6A, (F16, RNG, AES, SHA3), ampere1, 0xC0, 0xac3, -1)
+AARCH64_CORE("ampere1a", ampere1a, cortexa57, V8_6A, (F16, RNG, AES, SHA3, MEMTAG), ampere1a, 0xC0, 0xac4, -1)
 /* Do not swap around "emag" and "xgene1",
    this order is required to handle variant correctly. */
 AARCH64_CORE("emag",        emag,      xgene1,    V8A,  (CRC, CRYPTO), emag, 0x50, 0x000, 3)
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index 760d7b30368..48522606fbe 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -775,4 +775,111 @@  const struct cpu_cost_table ampere1_extra_costs =
   }
 };
 
+const struct cpu_cost_table ampere1a_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (1), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (4),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (4),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (4),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    0,                         /* ldrd (n/a).  */
+    0,                         /* ldm_1st.  */
+    0,                         /* ldm_regs_per_insn_1st.  */
+    0,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (5),         /* loadf.  */
+    COSTS_N_INSNS (5),         /* loadd.  */
+    COSTS_N_INSNS (5),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    0,                         /* stm_regs_per_insn_1st.  */
+    0,                         /* stm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (2),         /* storef.  */
+    COSTS_N_INSNS (2),         /* stored.  */
+    COSTS_N_INSNS (2),         /* store_unaligned.  */
+    COSTS_N_INSNS (3),         /* loadv.  */
+    COSTS_N_INSNS (3)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (25),      /* div.  */
+      COSTS_N_INSNS (4),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (4),       /* addsub.  */
+      COSTS_N_INSNS (2),       /* fpconst.  */
+      COSTS_N_INSNS (4),       /* neg.  */
+      COSTS_N_INSNS (4),       /* compare.  */
+      COSTS_N_INSNS (4),       /* widen.  */
+      COSTS_N_INSNS (4),       /* narrow.  */
+      COSTS_N_INSNS (4),       /* toint.  */
+      COSTS_N_INSNS (4),       /* fromint.  */
+      COSTS_N_INSNS (4)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (5),       /* mult.  */
+      COSTS_N_INSNS (5),       /* mult_addsub.  */
+      COSTS_N_INSNS (5),       /* fma.  */
+      COSTS_N_INSNS (5),       /* addsub.  */
+      COSTS_N_INSNS (2),       /* fpconst.  */
+      COSTS_N_INSNS (5),       /* neg.  */
+      COSTS_N_INSNS (5),       /* compare.  */
+      COSTS_N_INSNS (5),       /* widen.  */
+      COSTS_N_INSNS (5),       /* narrow.  */
+      COSTS_N_INSNS (6),       /* toint.  */
+      COSTS_N_INSNS (6),       /* fromint.  */
+      COSTS_N_INSNS (5)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (3),  /* alu.  */
+    COSTS_N_INSNS (3),  /* mult.  */
+    COSTS_N_INSNS (2),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
+  }
+};
+
 #endif
diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def
index c064fb9b85d..d91f8a2babd 100644
--- a/gcc/config/aarch64/aarch64-fusion-pairs.def
+++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
@@ -36,5 +36,6 @@  AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)
 AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
 AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
 AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
+AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
 
 #undef AARCH64_FUSION_PAIR
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index 22ec1be5a4c..b7d6fc8cc88 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@ 
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
+	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,cortexx1c,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexa715,cortexx2,neoversen2,demeter,neoversev2"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d1f979ebcf8..a7f7c3c0121 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1921,6 +1921,43 @@  static const struct tune_params ampere1_tunings =
   &ampere1_prefetch_tune
 };
 
+static const struct tune_params ampere1a_tunings =
+{
+  &ampere1a_extra_costs, /* Ampere-1A specific extra-cost table.  */
+  &generic_addrcost_table,
+  &generic_regmove_cost,
+  &ampere1_vector_cost, /* Reuses the Ampere-1 vector cost table.  */
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    4, /* store_int.  */
+    4, /* load_fp.  */
+    4, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
+   AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
+   AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
+   AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
+   AARCH64_FUSE_ADDSUB_2REG_CONST1),
+  /* fusible_ops; ADDSUB_2REG_CONST1 covers A+B+1 and A-B-1.  */
+  "32",		/* function_align.  */
+  "4",		/* jump_align.  */
+  "32:16",	/* loop_align.  */
+  2,	/* int_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  &ampere1_prefetch_tune /* Same prefetch tuning as ampere1_tunings.  */
+};
+
 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
 {
   2, /* int_stmt_cost  */
@@ -25539,6 +25576,33 @@  aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
 	}
     }
 
+  /* Fuse A+B+1 and A-B-1.  */
+  if (simple_sets_p
+      && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
+    {
+      /* We're trying to match:
+	  prev == (set (r0) (plus (r0) (r1)))
+	  curr == (set (r0) (plus (r0) (const_int 1)))
+	or:
+	  prev == (set (r0) (minus (r0) (r1)))
+	  curr == (set (r0) (plus (r0) (const_int -1))).  */
+
+      rtx prev_src = SET_SRC (prev_set);
+      rtx curr_src = SET_SRC (curr_set);
+      int polarity = (GET_CODE (prev_src) == MINUS) ? -1 : 1;
+
+      /* REGNO is only valid on a REG, so PREV's destination is checked
+	 with REG_P before it is compared with CURR's first operand.  */
+      if (GET_CODE (curr_src) == PLUS
+	  && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
+	  && CONST_INT_P (XEXP (curr_src, 1))
+	  && INTVAL (XEXP (curr_src, 1)) == polarity
+	  && REG_P (XEXP (curr_src, 0))
+	  && REG_P (SET_DEST (prev_set))
+	  && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
+	return true;
+    }
+
   return false;
 }
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 60e65f4eaa5..09c8b312ae7 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19995,7 +19995,7 @@  performance of the code.  Permissible values for this option are:
 @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55},
 @samp{cortex-r82}, @samp{cortex-x1}, @samp{cortex-x1c}, @samp{cortex-x2},
 @samp{cortex-a510}, @samp{cortex-a710}, @samp{cortex-a715}, @samp{ampere1},
-@samp{native}.
+@samp{ampere1a}, and @samp{native}.
 
 The values @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
 @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},