aarch64: Add the cost and scheduling models for Neoverse N1
Commit Message
This patch adds the cost and scheduling models for Neoverse N1, based on the information from the "Arm Neoverse N1 Software Optimization Guide".
Comments
Hi Evandro,
> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+kyrylo.tkachov=arm.com@gcc.gnu.org> On Behalf Of Evandro
> Menezes via Gcc-patches
> Sent: Friday, April 7, 2023 11:34 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Evandro Menezes <ebahapo@icloud.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [PATCH] aarch64: Add the cost and scheduling models for Neoverse
> N1
>
> This patch adds the cost and scheduling models for Neoverse N1, based on
> the information from the "Arm Neoverse N1 Software Optimization Guide".
>
Thank you for working on this. It is true that we haven't added any scheduling models for big cores from Arm for quite a while.
How has this patch been tested and benchmarked?
Using numbers from the Software Optimization Guide is certainly the way to go, but we need to ensure that the way GCC uses them actually results in better performance in practice.
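For a first static check, the schedules the model produces can be inspected directly with stock GCC dump flags (shown only as a suggestion, on any test case of interest):

    gcc -O2 -mcpu=neoverse-n1 -S -fverbose-asm -fdump-rtl-sched2 test.c

Comparing the sched2 dumps before and after the patch, alongside actual benchmark runs, would make the effect of the model easier to evaluate.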
> --
> Evandro Menezes ◊ evandro@yahoo.com
>
> [PATCH] aarch64: Add the cost and scheduling models for Neoverse N1
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-cores.def:
> Use the Neoverse N1 scheduling and cost models, but only for itself.
> * config/aarch64/aarch64.cc
> (cortexa76_tunings): Rename variable.
> (neoversen1_addrcost_table): New variable.
> (neoversen1_vector_cost): Likewise.
> (neoversen1_regmove_cost): Likewise.
> (neoversen1_advsimd_vector_cost): Likewise.
> (neoversen1_scalar_issue_info): Likewise.
> (neoversen1_advsimd_issue_info): Likewise.
> (neoversen1_vec_issue_info): Likewise.
> (neoversen1_vector_cost): Likewise.
> (neoversen1_tunings): Likewise.
> * config/aarch64/aarch64.md: Include `neoverse-n1.md`.
> * config/aarch64/neoverse-n1.md: New file.
> * gcc/config/arm/aarch-cost-tables.h
> (neoversen1_extra_costs): New variable.
>
> Signed-off-by: Evandro Menezes <evandro@gcc.gnu.org>
>
> ---
> gcc/config/aarch64/aarch64-cores.def | 22 +-
> gcc/config/aarch64/aarch64.cc | 155 +++++-
> gcc/config/aarch64/aarch64.md | 1 +
> gcc/config/aarch64/neoverse-n1.md | 716 +++++++++++++++++++++++++++
> gcc/config/arm/aarch-cost-tables.h | 107 ++++
> 5 files changed, 977 insertions(+), 24 deletions(-)
> create mode 100644 gcc/config/aarch64/neoverse-n1.md
>
> diff --git a/gcc/config/aarch64/aarch64-cores.def
> b/gcc/config/aarch64/aarch64-cores.def
> index 2ec88c98400..cc842c4e22c 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -105,18 +105,18 @@ AARCH64_CORE("thunderx2t99", thunderx2t99,
> thunderx2t99, V8_1A, (CRYPTO), thu
> /* ARM ('A') cores. */
> AARCH64_CORE("cortex-a55", cortexa55, cortexa53, V8_2A, (F16, RCPC,
> DOTPROD), cortexa53, 0x41, 0xd05, -1)
> AARCH64_CORE("cortex-a75", cortexa75, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD), cortexa73, 0x41, 0xd0a, -1)
> -AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD), neoversen1, 0x41, 0xd0b, -1)
> -AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16,
> RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
> -AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1)
> -AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
> -AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16,
> RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
> -AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
> +AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD), cortexa76, 0x41, 0xd0b, -1)
> +AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16,
> RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
> +AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
> +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
> +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16,
> RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
> +AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
> AARCH64_CORE("cortex-a65", cortexa65, cortexa53, V8_2A, (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1)
> AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, V8_2A, (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
> -AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
> -AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
> -AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD,
> PROFILE), neoversen1, 0x41, 0xd0c, -1)
> -AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
> +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
> +AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC,
> DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
> +AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD,
> PROFILE), cortexa76, 0x41, 0xd0c, -1)
> +AARCH64_CORE("neoverse-n1", neoversen1, neoversen1, V8_2A, (F16,
> RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
> AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, V8_2A, (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
>
> /* Cavium ('C') cores. */
> @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53",
> cortexa73cortexa53, cortexa53, V8A, (CRC
> /* ARM DynamIQ big.LITTLE configurations. */
>
> AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53,
> V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE
> (0xd0a, 0xd05), -1)
> -AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53,
> V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE
> (0xd0b, 0xd05), -1)
> +AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53,
> V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE
> (0xd0b, 0xd05), -1)
I would expect that whatever tuning decisions GCC makes for Neoverse N1 would also be best for all the cores you've touched in this hunk.
That is, we shouldn't have separate cortexa76 and neoversen1 tunings. Let's use neoversen1 tunings for all of these.
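Concretely (a sketch of the direction only, untested): each of these entries would keep neoversen1 in the sixth field, which selects the tune_params, e.g.:

    AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1)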
>
> /* Armv8-R Architecture Processors. */
> AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41,
> 0xd15, -1)
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 42617ced73a..071318c49f4 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1867,7 +1867,7 @@ static const struct tune_params
> thunderx3t110_tunings =
> &thunderx3t110_prefetch_tune
> };
>
> -static const struct tune_params neoversen1_tunings =
> +static const struct tune_params cortexa76_tunings =
> {
> &cortexa76_extra_costs,
> &generic_addrcost_table,
> @@ -1885,18 +1885,18 @@ static const struct tune_params
> neoversen1_tunings =
> }, /* memmov_cost. */
> 3, /* issue_rate */
> (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /*
> fusible_ops */
> - "32:16", /* function_align. */
> - "4", /* jump_align. */
> - "32:16", /* loop_align. */
> - 2, /* int_reassoc_width. */
> - 4, /* fp_reassoc_width. */
> - 1, /* fma_reassoc_width. */
> - 2, /* vec_reassoc_width. */
> - 2, /* min_div_recip_mul_sf. */
> - 2, /* min_div_recip_mul_df. */
> - 0, /* max_case_values. */
> - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
> + "32:16", /* function_align. */
> + "4", /* jump_align. */
> + "32:16", /* loop_align. */
> + 2, /* int_reassoc_width. */
> + 4, /* fp_reassoc_width. */
> + 1, /* fma_reassoc_width. */
> + 2, /* vec_reassoc_width. */
> + 2, /* min_div_recip_mul_sf. */
> + 2, /* min_div_recip_mul_df. */
> + 0, /* max_case_values. */
> + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
> &generic_prefetch_tune
> };
>
> @@ -2293,6 +2293,135 @@ static const struct tune_params
> neoverse512tvb_tunings =
> &generic_prefetch_tune
> };
>
> +static const struct cpu_addrcost_table neoversen1_addrcost_table =
> +{
> + {
> + 0, /* hi */
> + 0, /* si */
> + 0, /* di */
> + 1, /* ti */
> + },
> + 0, /* pre_modify */
> + 0, /* post_modify */
> + 1, /* post_modify_ld3_st3 */
> + 1, /* post_modify_ld4_st4 */
> + 0, /* register_offset */
> + 0, /* register_sextend */
> + 0, /* register_zextend */
> + 0 /* imm_offset */
> +};
> +
> +static const struct cpu_regmove_cost neoversen1_regmove_cost =
> +{
> + 1, /* GP2GP */
> + /* Avoid the use of slow int<->fp moves for spilling by setting
> + their cost higher than memmov_cost. */
> + 3, /* GP2FP */
> + 2, /* FP2GP */
> + 2 /* FP2FP */
> +};
> +
> +static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
> +{
> + 2, /* int_stmt_cost */
> + 2, /* fp_stmt_cost */
> + 0, /* ld2_st2_permute_cost */
> + 0, /* ld3_st3_permute_cost */
> + 0, /* ld4_st4_permute_cost */
> + 3, /* permute_cost */
> + 6, /* reduc_i8_cost */
> + 5, /* reduc_i16_cost */
> + 3, /* reduc_i32_cost */
> + 3, /* reduc_i64_cost */
> + 8, /* reduc_f16_cost */
> + 5, /* reduc_f32_cost */
> + 5, /* reduc_f64_cost */
> + 0, /* store_elt_extra_cost */
> + 2, /* vec_to_scalar_cost */
> + 2, /* scalar_to_vec_cost */
> + 4, /* align_load_cost */
> + 4, /* unalign_load_cost */
> + 1, /* unalign_store_cost */
> + 1 /* store_cost */
> +};
> +
> +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info =
> +{
> + 2, /* loads_stores_per_cycle */
> + 2, /* stores_per_cycle */
> + 2, /* general_ops_per_cycle */
> + 0, /* fp_simd_load_general_ops */
> + 1 /* fp_simd_store_general_ops */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info
> neoversen1_advsimd_issue_info =
> +{
> + {
> + 2, /* loads_stores_per_cycle */
> + 2, /* stores_per_cycle */
> + 2, /* general_ops_per_cycle */
> + 0, /* fp_simd_load_general_ops */
> + 1 /* fp_simd_store_general_ops */
> + },
> + 3, /* ld2_st2_general_ops */
> + 5, /* ld3_st3_general_ops */
> + 11 /* ld4_st4_general_ops */
> +};
> +
> +static const aarch64_vec_issue_info neoversen1_vec_issue_info =
> +{
> + &neoversen1_scalar_issue_info, /* scalar */
> + &neoversen1_advsimd_issue_info, /* advsimd */
> + nullptr /* sve */
> +};
> +
> +
> +static const struct cpu_vector_cost neoversen1_vector_cost =
> +{
> + 1, /* scalar_int_stmt_cost */
> + 1, /* scalar_fp_stmt_cost */
> + 4, /* scalar_load_cost */
> + 1, /* scalar_store_cost */
> + 1, /* cond_taken_branch_cost */
> + 1, /* cond_not_taken_branch_cost */
> + &neoversen1_advsimd_vector_cost, /* advsimd */
> + nullptr, /* sve */
> + &neoversen1_vec_issue_info /* issue_info */
> +};
> +
> +static const struct tune_params neoversen1_tunings =
> +{
> + &neoversen1_extra_costs,
> + &neoversen1_addrcost_table,
> + &neoversen1_regmove_cost,
> + &neoversen1_vector_cost,
> + &generic_branch_cost,
> + &generic_approx_modes,
> + SVE_NOT_IMPLEMENTED, /* sve_width */
> + { 4, /* load_int. */
> + 2, /* store_int. */
> + 5, /* load_fp. */
> + 2, /* store_fp. */
> + 4, /* load_pred. */
> + 4 /* store_pred. */
> + }, /* memmov_cost. */
> + 4, /* issue_rate */
> + AARCH64_FUSE_AES_AESMC, /* fusible_ops */
> + "32", /* function_align. */
I guess this number worries me somewhat. Previously we used "32:16" here as using "32" would bloat code size too much by adding too many alignment nops around small functions.
Do you have performance numbers to recommend this change?
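For reference, the two spellings differ exactly on small functions (illustrated here with the equivalent command-line options):

    -falign-functions=32:16   # align to 32 bytes only when fewer than 16 padding bytes are needed
    -falign-functions=32      # always pad out to the next 32-byte boundary

so the unconditional "32" pays the padding cost even around the smallest functions.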
More generally, the tuning structure changes and the scheduling model are two logically distinct and major changes and so should be split into separate patches.
That would make it easier to evaluate the performance benefits of each individually.
> + "4", /* jump_align. */
> + "32:16", /* loop_align. */
> + 2, /* int_reassoc_width. */
> + 4, /* fp_reassoc_width. */
> + 1, /* fma_reassoc_width. */
> + 2, /* vec_reassoc_width. */
> + 2, /* min_div_recip_mul_sf. */
> + 2, /* min_div_recip_mul_df. */
> + 0, /* max_case_values. */
> + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
> + AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags. */
> + &generic_prefetch_tune
> +};
> +
> static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
> {
> 2, /* int_stmt_cost */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 022eef80bc1..6cb9e31259b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -471,6 +471,7 @@
> (include "../arm/cortex-a57.md")
> (include "../arm/exynos-m1.md")
> (include "falkor.md")
> +(include "neoverse-n1.md")
> (include "saphira.md")
> (include "thunderx.md")
> (include "../arm/xgene1.md")
> diff --git a/gcc/config/aarch64/neoverse-n1.md
> b/gcc/config/aarch64/neoverse-n1.md
For CPUs that support both aarch32 and aarch64 modes we usually put the pipeline description in config/arm/, but I appreciate that the vast majority of Neoverse N1 users mostly care about running aarch64 code, so I don't object to having it in config/aarch64/.
> new file mode 100644
> index 00000000000..e13c826d494
> --- /dev/null
> +++ b/gcc/config/aarch64/neoverse-n1.md
> @@ -0,0 +1,716 @@
> +;; ARM Neoverse N1 pipeline description
Please use "Arm Neoverse N1" for the name.
> +;; (Based on the "Arm® Neoverse™ N1 Software Optimization Guide")
> +;;
> +;; Copyright (C) 2014-2023 Free Software Foundation, Inc.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it
> +;; under the terms of the GNU General Public License as published by
> +;; the Free Software Foundation; either version 3, or (at your option)
> +;; any later version.
> +;;
> +;; GCC is distributed in the hope that it will be useful, but
> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;; General Public License for more details.
> +;;
> +;; You should have received a copy of the GNU General Public License
> +;; along with GCC; see the file COPYING3. If not see
> +;; <http://www.gnu.org/licenses/>.
> +
> +;; The Neoverse N1 core is modelled as a multiple issue pipeline that has
> +;; the following functional units.
> +
> +(define_automaton "neoverse_n1")
> +
> +;; 1 - Two pipelines for integer operations: SX1, SX2.
> +
> +(define_cpu_unit "neon1_sx1_issue" "neoverse_n1")
> +(define_reservation "neon1_sx1" "neon1_sx1_issue")
> +
> +(define_cpu_unit "neon1_sx2_issue" "neoverse_n1")
> +(define_reservation "neon1_sx2" "neon1_sx2_issue")
> +
> +;; 2 - One pipeline for complex integer operations: MX.
> +
> +(define_cpu_unit "neon1_mx_issue"
> + "neoverse_n1")
> +(define_reservation "neon1_mx" "neon1_mx_issue")
> +(define_reservation "neon1_m_block" "neon1_mx_issue")
> +
> +;; 3 - Two asymmetric pipelines for Neon and FP operations: CX1, CX2.
> +(define_automaton "neoverse_n1_cx")
> +
> +(define_cpu_unit "neon1_cx1_issue"
> + "neoverse_n1_cx")
> +(define_cpu_unit "neon1_cx2_issue"
> + "neoverse_n1_cx")
> +
> +(define_reservation "neon1_cx1" "neon1_cx1_issue")
> +(define_reservation "neon1_cx2" "neon1_cx2_issue")
> +(define_reservation "neon1_v0_block" "neon1_cx1_issue")
> +
> +;; 4 - One pipeline for branch operations: BX.
> +
> +(define_cpu_unit "neon1_bx_issue" "neoverse_n1")
> +(define_reservation "neon1_bx" "neon1_bx_issue")
> +
> +;; 5 - Two pipelines for load and store operations: LS1, LS2.
> +
> +(define_cpu_unit "neon1_ls1_issue" "neoverse_n1")
> +(define_reservation "neon1_ls1" "neon1_ls1_issue")
> +
> +(define_cpu_unit "neon1_ls2_issue" "neoverse_n1")
> +(define_reservation "neon1_ls2" "neon1_ls2_issue")
> +
> +;; Block all issue queues.
> +
> +(define_reservation "neon1_block" "neon1_sx1_issue + neon1_sx2_issue
> + + neon1_mx_issue
> + + neon1_cx1_issue + neon1_cx2_issue
> + + neon1_ls1_issue + neon1_ls2_issue")
> +
> +;; Issue groups.
> +
> +(define_reservation "neon1_b" "neon1_bx")
> +(define_reservation "neon1_i" "(neon1_sx1 | neon1_sx2 | neon1_mx)")
> +(define_reservation "neon1_m" "neon1_mx")
> +(define_reservation "neon1_d" "(neon1_sx2 | neon1_mx)")
> +(define_reservation "neon1_l" "(neon1_ls1 | neon1_ls2)")
> +(define_reservation "neon1_v" "(neon1_cx1 | neon1_cx2)")
> +(define_reservation "neon1_v0" "neon1_cx1")
> +(define_reservation "neon1_v1" "neon1_cx2")
> +
> +;; Instruction resources.
> +
> +;; Block.
> +(define_insn_reservation "neoverse_n1_block" 1
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "block"))
> + "neon1_block")
> +
> +;; Branches
> +;; No latency as there is no result.
> +(define_insn_reservation "neoverse_n1_branch" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "branch"))
> + "neon1_b")
> +
> +;; Calls
> +;; No latency as there is no result.
> +(define_insn_reservation "neoverse_n1_call" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "call"))
> + "neon1_i + neon1_b")
> +
> +;; ALU with no or simple shift.
> +;; TODO: there should also be "alus_shift_imm_lsl_1to4".
> +(define_insn_reservation "neoverse_n1_alu" 1
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "alu_imm, alu_shift_imm_lsl_1to4, alu_sreg, \
> + alus_imm, alus_sreg, \
> + csel, \
> + logic_imm, logic_reg, logic_shift_imm, \
> + logics_imm, logics_reg, \
> + mov_reg"))
> + "neon1_i")
> +
> +;; ALU with extension or complex shift.
> +;; TODO: there should also be "alus_shift_imm_other".
> +(define_insn_reservation "neoverse_n1_alu_shift" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "alu_ext, alu_shift_imm_other, alu_shift_reg, \
> + alus_shift_imm, alus_shift_reg, \
> + logic_shift_reg, logics_shift_imm, logics_shift_reg, \
> + crc"))
> + "neon1_m")
> +
> +;; Miscellaneous ALU.
> +;; TODO: model 2-register "extr", "bfi", variable shifts.
> +(define_insn_reservation "neoverse_n1_alu_misc" 1
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "adr, rotate_imm, bfm, clz, mov_imm, rbit, rev"))
> + "neon1_i")
> +
> +;; Integer divide.
> +;; Divisions are not pipelined.
> +(define_insn_reservation "neoverse_n1_div" 12
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "udiv, sdiv"))
> + "neon1_m, (neon1_m_block * 12)")
> +
> +;; Narrow multiply.
> +(define_insn_reservation "neoverse_n1_mul" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "mla, mul"))
> + "neon1_m")
> +
> +;; Wide multiply.
> +;; TODO: model multiply high.
> +(define_insn_reservation "neoverse_n1_mull" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "smull, umull"))
> + "neon1_m")
> +
> +;; Integer load.
> +;; TODO: model load pairs fully.
> +(define_insn_reservation "neoverse_n1_ld" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "load_byte, load_4, load_8"))
> + "neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_ld16" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "load_16"))
> + "neon1_l * 2")
> +
> +;; Integer store.
> +;; TODO: model store pairs fully.
> +(define_insn_reservation "neoverse_n1_st" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "store_4, store_8"))
> + "neon1_d, neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_stp" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "store_16"))
> + "neon1_i, (neon1_l * 2)")
> +
> +;; FP arithmetic.
> +(define_insn_reservation "neoverse_n1_fp_alu" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_minmaxd, f_minmaxs, \
> + faddd, fadds, \
> + fconstd, fconsts, \
> + fcsel, \
> + ffarithd, ffariths, \
> + fmov"))
> + "neon1_v")
> +
> +;; FP compare.
> +(define_insn_reservation "neoverse_n1_fp_cmp" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fcmpd, fcmps, fccmpd, fccmps"))
> + "neon1_v0")
> +
> +;; FP round.
> +(define_insn_reservation "neoverse_n1_fp_rint" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_rintd, f_rints"))
> + "neon1_v0")
> +
> +;; FP divide & square-root.
> +;; Divisions are not pipelined.
> +;; TODO: model half-precision.
> +(define_insn_reservation "neoverse_n1_fp_divd" 15
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fdivd, fsqrtd"))
> + "neon1_v0, (neon1_v0_block * 15)")
> +
> +(define_insn_reservation "neoverse_n1_fp_divs" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fdivs, fsqrts"))
> + "neon1_v0, (neon1_v0_block * 10)")
> +
> +;; FP multiply.
> +(define_insn_reservation "neoverse_n1_fp_mul" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fmuld, fmuls"))
> + "neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_fp_mac" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fmacd, fmacs"))
> + "neon1_v")
> +
> +;; FP convert.
> +(define_insn_reservation "neoverse_n1_fp_cvt" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_cvt"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_fp_cvti2f" 6
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_cvti2f"))
> + "neon1_m + neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_fp_cvtf2i" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_cvtf2i"))
> + "neon1_v0 + neon1_v1")
> +
> +;; FP move.
> +(define_insn_reservation "neoverse_n1_fp_mov" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "fconstd, fconsts, \
> + fmov"))
> + "neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_fp_movi2f" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_mcr"))
> + "neon1_m")
> +
> +(define_insn_reservation "neoverse_n1_fp_movf2i" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_mrc, \
> + neon_to_gp, neon_to_gp_q"))
> + "neon1_v1")
> +
> +;; FP load.
> +(define_insn_reservation "neoverse_n1_fp_ld" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_loadd, f_loads"))
> + "neon1_i, neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_fp_ldp" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_ldp"))
> + "neon1_i, (neon1_l * 2)")
> +
> +(define_insn_reservation "neoverse_n1_fp_ldp_q" 7
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_ldp_q"))
> + "neon1_i, (neon1_l * 2)")
> +
> +;; FP store.
> +(define_insn_reservation "neoverse_n1_fp_st" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "f_stored, f_stores"))
> + "neon1_i, neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_fp_stp" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_stp"))
> + "neon1_l + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_fp_stp_q" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_stp_q"))
> + "(neon1_l * 2) + neon1_v")
> +
> +;; ASIMD arithmetic.
> +(define_insn_reservation "neoverse_n1_asimd_abd_long" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_abd_long"))
> + "neon1_v1")
> +
> +(define_insn_reservation "neoverse_n1_asimd_alu" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_abd, neon_abd_q, \
> + neon_abs, neon_abs_q, \
> + neon_add, neon_add_q, \
> + neon_add_halve, neon_add_halve_q, \
> + neon_add_halve_narrow_q, \
> + neon_add_long, neon_add_widen, \
> + neon_bsl, neon_bsl_q, \
> + neon_cls, neon_cls_q, \
> + neon_compare, neon_compare_q, \
> + neon_compare_zero, neon_compare_zero_q, \
> + neon_dot, neon_dot_q, \
> + neon_dup, neon_dup_q, \
> + neon_ext, neon_ext_q, \
> + neon_ins, neon_ins_q, \
> + neon_logic, neon_logic_q, \
> + neon_minmax, neon_minmax_q, \
> + neon_move, neon_move_q, \
> + neon_move_narrow_q, \
> + neon_neg, neon_neg_q, \
> + neon_permute, neon_permute_q, \
> + neon_qabs, neon_qabs_q, \
> + neon_qadd, neon_qadd_q, \
> + neon_qneg, neon_qneg_q, \
> + neon_qsub, neon_qsub_q, \
> + neon_rbit, neon_rbit_q, \
> + neon_reduc_add, neon_reduc_add_q, \
> + neon_rev, neon_rev_q, \
> + neon_sub, neon_sub_q, \
> + neon_sub_halve, neon_sub_halve_q, \
> + neon_sub_halve_narrow_q, \
> + neon_sub_widen, neon_sub_long, \
> + neon_tbl1, neon_tbl1_q, \
> + neon_tbl2, neon_tbl2_q"))
> + "neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_arith_acc" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_arith_acc"))
> + "neon1_v1")
> +
> +(define_insn_reservation "neoverse_n1_asimd_shift_acc_q" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_shift_acc_q"))
> + "neon1_v1")
> +
> +(define_insn_reservation "neoverse_n1_asimd_reduc" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_reduc_add_long, \
> + neon_reduc_minmax, neon_reduc_minmax_q"))
> + "neon1_v1")
> +
> +
> +;; ASIMD multiply.
> +(define_insn_reservation "neoverse_n1_asimd_mla" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_mla_b, neon_mla_b_long, \
> + neon_mla_h, neon_mla_h_long, \
> + neon_mla_h_scalar, neon_mla_h_scalar_long, \
> + neon_mla_s, neon_mla_s_long, \
> + neon_mla_s_scalar, neon_mla_s_scalar_long"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_asimd_mla_q" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_mla_b_q,
> + neon_mla_h_q, neon_mla_h_scalar_q, \
> + neon_mla_s_q, neon_mla_s_scalar_q"))
> + "neon1_v0 * 2")
> +
> +(define_insn_reservation "neoverse_n1_asimd_mul" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_mul_b, neon_mul_b_long, \
> + neon_mul_h, neon_mul_h_long, \
> + neon_mul_s, neon_mul_s_long,
> + neon_sat_mul_b, neon_sat_mul_b_long,
> + neon_sat_mul_h, neon_sat_mul_h_long, \
> + neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_long,
> + neon_sat_mul_s, neon_sat_mul_s_long, \
> + neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_long"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_asimd_mul_q" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q, \
> + neon_sat_mul_b_q, \
> + neon_sat_mul_h_q, neon_sat_mul_h_scalar_q, \
> + neon_sat_mul_s_q, neon_sat_mul_s_scalar_q"))
> + "neon1_v0 * 2")
> +
> +(define_insn_reservation "neoverse_n1_asimd_sat_mla" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_sat_mla_b_long, \
> + neon_sat_mla_h_long, neon_sat_mla_h_scalar_long, \
> + neon_sat_mla_s_long, neon_sat_mla_s_scalar_long"))
> + "neon1_v0")
> +
> +;; ASIMD shift.
> +(define_insn_reservation "neoverse_n1_asimd_shift" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,
> neon_shift_imm_long, \
> + neon_shift_reg, neon_shift_reg_q"))
> + "neon1_v1")
> +
> +(define_insn_reservation "neoverse_n1_asimd_shift_q" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q, \
> + neon_sat_shift_imm_narrow_q, \
> + neon_sat_shift_reg, neon_sat_shift_reg_q, \
> + neon_shift_imm_narrow_q"))
> + "neon1_v1")
> +
> +;; ASIMD FP arithmetic.
> +(define_insn_reservation "neoverse_n1_asimd_fp_alu" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_abd_d, neon_fp_abd_d_q, \
> + neon_fp_abd_s, neon_fp_abd_s_q, \
> + neon_fp_abs_d, neon_fp_abs_d_q, \
> + neon_fp_abs_s, neon_fp_abs_s_q, \
> + neon_fp_addsub_d, neon_fp_addsub_d_q, \
> + neon_fp_addsub_s, neon_fp_addsub_s_q, \
> + neon_fp_compare_d, neon_fp_compare_d_q, \
> + neon_fp_compare_s, neon_fp_compare_s_q, \
> + neon_fp_minmax_d, neon_fp_minmax_d_q, \
> + neon_fp_minmax_s, neon_fp_minmax_s_q, \
> + neon_fp_neg_d, neon_fp_neg_d_q, \
> + neon_fp_neg_s, neon_fp_neg_s_q, \
> + neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, \
> + neon_fp_reduc_add_s, neon_fp_reduc_add_s_q"))
> + "neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_reduc" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_reduc_minmax_d,
> neon_fp_reduc_minmax_d_q, \
> + neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q"))
> + "neon1_v")
> +
> +;; ASIMD FP convert.
> +(define_insn_reservation "neoverse_n1_asimd_cvt" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_cvt_narrow_d_q, \
> + neon_fp_cvt_widen_s, \
> + neon_fp_to_int_d, neon_fp_to_int_d_q, \
> + neon_fp_to_int_s, \
> + neon_int_to_fp_d, neon_int_to_fp_d_q, \
> + neon_int_to_fp_s, \
> + neon_fp_recpe_d, neon_fp_recpe_s, \
> + neon_fp_recpx_d, neon_fp_recpx_s, \
> + neon_fp_round_d, neon_fp_round_d_q, \
> + neon_fp_round_s"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_asimd_cvt_q" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_cvt_narrow_s_q, \
> + neon_fp_cvt_widen_h, \
> + neon_fp_to_int_s_q, \
> + neon_int_to_fp_s_q, \
> + neon_fp_recpe_d_q, neon_fp_recpe_s_q, \
> + neon_fp_recpx_d_q, neon_fp_recpx_s_q, \
> + neon_fp_round_s_q"))
> + "neon1_v0 * 2")
> +
> +;; ASIMD FP divide & square-root.
> +;; Divisions are not pipelined.
> +;; TODO: model half-precision.
> +(define_insn_reservation "neoverse_n1_asimd_fp_divd_q" 15
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_div_d_q"))
> + "neon1_v0, (neon1_v0_block * 14)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_divs" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_div_s"))
> + "neon1_v0, (neon1_v0_block * 5)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_divs_q" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_div_s_q"))
> + "neon1_v0, (neon1_v0_block * 9)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrtd_q" 17
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_sqrt_d_q"))
> + "neon1_v0, (neon1_v0_block * 16)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrts" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_sqrt_s"))
> + "neon1_v0, (neon1_v0_block * 5)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrts_q" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_sqrt_s_q"))
> + "neon1_v0, (neon1_v0_block * 9)")
> +
> +;; ASIMD FP multiply.
> +(define_insn_reservation "neoverse_n1_asimd_fp_mul" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_mul_d, neon_fp_mul_d_q,
> neon_fp_mul_d_scalar_q, \
> + neon_fp_mul_s, neon_fp_mul_s_q,
> neon_fp_mul_s_scalar_q"))
> + "neon1_v")
> +
> +;; TODO: model the long form.
> +(define_insn_reservation "neoverse_n1_asimd_fp_mla" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_fp_mla_d, neon_fp_mla_d_q,
> neon_fp_mla_d_scalar_q, \
> + neon_fp_mla_s, neon_fp_mla_s_q, neon_fp_mla_s_scalar_q, \
> + neon_fp_recps_d, neon_fp_recps_d_q, \
> + neon_fp_recps_s, neon_fp_recps_s_q"))
> + "neon1_v")
> +
> +;; ASIMD miscellaneous.
> +(define_insn_reservation "neoverse_n1_asimd_gp_fp" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_from_gp, neon_from_gp_q"))
> + "neon1_m")
> +
> +;; TODO: model "tbx" fully.
> +(define_insn_reservation "neoverse_n1_asimd_tbl_3" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_tbl3, neon_tbl3_q"))
> + "neon1_v * 4")
> +
> +(define_insn_reservation "neoverse_n1_asimd_tbl_4" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_tbl4, neon_tbl4_q"))
> + "neon1_v * 6")
> +
> +;; ASIMD load.
> +(define_insn_reservation "neoverse_n1_asimd_ld_a" 5
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q, \
> + neon_load1_2reg, neon_load1_2reg_q"))
> + "neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_b" 6
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q"))
> + "neon1_l * 3")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_c" 6
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q"))
> + "neon1_l * 4")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_d" 7
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q, \
> + neon_load1_one_lane, neon_load1_one_lane_q"))
> + "neon1_l + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_e" 7
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q, \
> + neon_load2_all_lanes, neon_load2_all_lanes_q, \
> + neon_load2_one_lane, neon_load2_one_lane_q"))
> + "(neon1_l * 2) + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_f" 8
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q, \
> + neon_load4_all_lanes, neon_load4_all_lanes_q, \
> + neon_load4_one_lane, neon_load4_one_lane_q"))
> + "(neon1_l * 4) + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_g" 7
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q, \
> + neon_load3_one_lane, neon_load3_one_lane_q"))
> + "(neon1_l * 4) + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_h" 8
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load4_4reg"))
> + "(neon1_l * 7) + neon1_v")
> +
> +(define_insn_reservation "neoverse_n1_asimd_ld_i" 10
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_load4_4reg_q"))
> + "(neon1_l * 10) + neon1_v")
> +
> +;; ASIMD store.
> +(define_insn_reservation "neoverse_n1_asimd_st_a" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q, \
> + neon_store1_2reg"))
> + "neon1_v + neon1_l")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_b" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_1reg_q, \
> + neon_store1_2reg"))
> + "neon1_v + (neon1_l * 2)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_c" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_2reg_q, \
> + neon_store1_4reg"))
> + "neon1_v + (neon1_l * 4)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_d" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_3reg"))
> + "neon1_v + (neon1_l * 3)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_e" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_3reg_q"))
> + "neon1_v + (neon1_l * 6)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_f" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_4reg_q"))
> + "neon1_v + (neon1_l * 8)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_g" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q, \
> + neon_store2_2reg, \
> + neon_store2_one_lane, neon_store2_one_lane_q"))
> + "neon1_v + (neon1_l * 2)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_h" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store2_2reg_q, \
> + neon_store3_3reg, \
> + neon_store3_one_lane_q"))
> + "neon1_v + (neon1_l * 4)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_i" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store3_3reg_q"))
> + "neon1_v + (neon1_l * 6)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_j" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store3_one_lane"))
> + "neon1_v + (neon1_l * 4)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_k" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store4_4reg"))
> + "neon1_v + (neon1_l * 6)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_l" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store4_4reg_q"))
> + "neon1_v + (neon1_l * 12)")
> +
> +(define_insn_reservation "neoverse_n1_asimd_st_m" 0
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q"))
> + "neon1_v + (neon1_l * 3)")
> +
> +;; ASIMD crypto.
> +;; TODO: model different widths.
> +(define_insn_reservation "neoverse_n1_asimd_aese" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crypto_aese"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_asimd_aesmc" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crypto_aesmc"))
> + "neon1_v0")
> +
> +;; FIXME: "sha256u1" should be "crypto_sha256_fast".
> +(define_insn_reservation "neoverse_n1_asimd_sha_fast" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crypto_sha1_fast, crypto_sha1_xor,
> + crypto_sha256_fast"))
> + "neon1_v0")
> +
> +(define_insn_reservation "neoverse_n1_asimd_sha_slow" 4
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
> + "neon1_v0")
> +
> +;; FIXME: "pmull" sometimes is also
> "neon_mul_{b,h,s}(_scalar)?(_(q|long))?"
> +(define_insn_reservation "neoverse_n1_asimd_poly" 3
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crypto_pmull"))
> + "neon1_v0")
> +
> +;; CRC
> +(define_insn_reservation "neoverse_n1_crc" 2
> + (and (eq_attr "tune" "neoversen1")
> + (eq_attr "type" "crc"))
> + "neon1_m")
> +
> +;; Bypasses.
> +;; TODO: Model region forwarding.
> +
> +;; Integer multiply.
> +;; TODO: model the X and high forms.
> +(define_bypass 1 "neoverse_n1_mul, neoverse_n1_mull"
> + "neoverse_n1_mul, neoverse_n1_mull")
> +
> +;; FP multiply.
> +(define_bypass 2 "neoverse_n1_fp_mul" "neoverse_n1_fp_mul")
> +(define_bypass 2 "neoverse_n1_fp_mac" "neoverse_n1_fp_mac")
> +
> +;; ASIMD arithmetic.
> +(define_bypass 1 "neoverse_n1_asimd_arith_acc"
> "neoverse_n1_asimd_arith_acc")
> +(define_bypass 1 "neoverse_n1_asimd_shift_acc_q"
> "neoverse_n1_asimd_shift_acc_q")
> +
> +;; ASIMD multiply.
> +(define_bypass 1 "neoverse_n1_asimd_mla" "neoverse_n1_asimd_mla")
> +(define_bypass 2 "neoverse_n1_asimd_mla_q"
> "neoverse_n1_asimd_mla_q")
> +
> +;; ASIMD FP multiply.
> +(define_bypass 2 "neoverse_n1_asimd_fp_mul"
> "neoverse_n1_asimd_fp_mul")
> +(define_bypass 2 "neoverse_n1_asimd_fp_mla"
> "neoverse_n1_asimd_fp_mla")
> +
> +;; CRC
> +(define_bypass 1 "neoverse_n1_crc" "neoverse_n1_*")
I haven't gone through the model in great detail, but I've noticed that there are quite a few TODO: markers.
Are there plans to address those? I'd be hesitant to take a model with important gaps in it as the quality of the schedule would be degraded if the compiler cannot reason about significant parts of the ISA.
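As one concrete example of what closing a gap could look like (a sketch only — the exact latency and pipe would need checking against the SOG): the variable shifts mentioned in one of the TODOs could get a reservation in the same style as the rest of the file:

    (define_insn_reservation "neoverse_n1_shift_reg" 1
      (and (eq_attr "tune" "neoversen1")
           (eq_attr "type" "shift_imm, shift_reg"))
      "neon1_i")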
Thanks,
Kyrill
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-
> tables.h
> index e3848214728..fce6da6bbcc 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs
> =
> }
> };
>
> +const struct cpu_cost_table neoversen1_extra_costs =
> +{
> + /* ALU */
> + {
> + 0, /* arith. */
> + 0, /* logical. */
> + 0, /* shift. */
> + 0, /* shift_reg. */
> + COSTS_N_INSNS (1), /* arith_shift. */
> + COSTS_N_INSNS (1), /* arith_shift_reg. */
> + 0, /* log_shift. */
> + COSTS_N_INSNS (1), /* log_shift_reg. */
> + 0, /* extend. */
> + COSTS_N_INSNS (1), /* extend_arith. */
> + COSTS_N_INSNS (1), /* bfi. */
> + 0, /* bfx. */
> + 0, /* clz. */
> + 0, /* rev. */
> + 0, /* non_exec. */
> + true /* non_exec_costs_exec. */
> + },
> + {
> + /* MULT SImode */
> + {
> + COSTS_N_INSNS (1), /* simple. */
> + COSTS_N_INSNS (2), /* flag_setting. */
> + COSTS_N_INSNS (1), /* extend. */
> + COSTS_N_INSNS (1), /* add. */
> + COSTS_N_INSNS (1), /* extend_add. */
> + COSTS_N_INSNS (11) /* idiv. */
> + },
> + /* MULT DImode */
> + {
> + COSTS_N_INSNS (3), /* simple. */
> + 0, /* flag_setting (N/A). */
> + COSTS_N_INSNS (1), /* extend. */
> + COSTS_N_INSNS (3), /* add. */
> + COSTS_N_INSNS (1), /* extend_add. */
> + COSTS_N_INSNS (19) /* idiv. */
> + }
> + },
> + /* LD/ST */
> + {
> + COSTS_N_INSNS (3), /* load. */
> + COSTS_N_INSNS (3), /* load_sign_extend. */
> + COSTS_N_INSNS (3), /* ldrd. */
> + COSTS_N_INSNS (2), /* ldm_1st. */
> + 1, /* ldm_regs_per_insn_1st. */
> + 2, /* ldm_regs_per_insn_subsequent. */
> + COSTS_N_INSNS (4), /* loadf. */
> + COSTS_N_INSNS (4), /* loadd. */
> + COSTS_N_INSNS (3), /* load_unaligned. */
> + 0, /* store. */
> + 0, /* strd. */
> + 0, /* stm_1st. */
> + 1, /* stm_regs_per_insn_1st. */
> + 2, /* stm_regs_per_insn_subsequent. */
> + 0, /* storef. */
> + 0, /* stored. */
> + COSTS_N_INSNS (1), /* store_unaligned. */
> + COSTS_N_INSNS (1), /* loadv. */
> + COSTS_N_INSNS (1) /* storev. */
> + },
> + {
> + /* FP SFmode */
> + {
> + COSTS_N_INSNS (9), /* div. */
> + COSTS_N_INSNS (2), /* mult. */
> + COSTS_N_INSNS (3), /* mult_addsub. */
> + COSTS_N_INSNS (3), /* fma. */
> + COSTS_N_INSNS (1), /* addsub. */
> + COSTS_N_INSNS (1), /* fpconst. */
> + 0, /* neg. */
> + 0, /* compare. */
> + COSTS_N_INSNS (1), /* widen. */
> + COSTS_N_INSNS (1), /* narrow. */
> + COSTS_N_INSNS (1), /* toint. */
> + COSTS_N_INSNS (1), /* fromint. */
> + COSTS_N_INSNS (1) /* roundint. */
> + },
> + /* FP DFmode */
> + {
> + COSTS_N_INSNS (14), /* div. */
> + COSTS_N_INSNS (2), /* mult. */
> + COSTS_N_INSNS (3), /* mult_addsub. */
> + COSTS_N_INSNS (3), /* fma. */
> + COSTS_N_INSNS (1), /* addsub. */
> + COSTS_N_INSNS (1), /* fpconst. */
> + 0, /* neg. */
> + 0, /* compare. */
> + COSTS_N_INSNS (1), /* widen. */
> + COSTS_N_INSNS (1), /* narrow. */
> + COSTS_N_INSNS (1), /* toint. */
> + COSTS_N_INSNS (1), /* fromint. */
> + COSTS_N_INSNS (1) /* roundint. */
> + }
> + },
> + /* Vector */
> + {
> + COSTS_N_INSNS (1), /* alu. */
> + COSTS_N_INSNS (4), /* mult. */
> + COSTS_N_INSNS (1), /* movi. */
> + COSTS_N_INSNS (1), /* dup. */
> + COSTS_N_INSNS (1) /* extract. */
> + }
> +};
> +
> const struct cpu_cost_table exynosm1_extra_costs =
> {
> /* ALU */
> --
> 2.39.2 (Apple Git-143)
Hi, Kyrylo.
> On Apr 11, 2023, at 04:41, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
>> -----Original Message-----
>> From: Gcc-patches <gcc-patches-
>> bounces+kyrylo.tkachov=arm.com@gcc.gnu.org> On Behalf Of Evandro
>> Menezes via Gcc-patches
>> Sent: Friday, April 7, 2023 11:34 PM
>> To: gcc-patches@gcc.gnu.org
>> Cc: Evandro Menezes <ebahapo@icloud.com>; Richard Sandiford
>> <Richard.Sandiford@arm.com>
>> Subject: [PATCH] aarch64: Add the cost and scheduling models for Neoverse
>> N1
>>
>> This patch adds the cost and scheduling models for Neoverse N1, based on
>> the information from the "Arm Neoverse N1 Software Optimization Guide".
>>
>
> Thank you for working on this. It is true that we haven't added any scheduling models for big cores from Arm for quite a while.
Could you share what motivated y’all not to?
> How has this patch been tested and benchmarked?
I’ve tested it with some small and large benchmarks, for both static and dynamic analysis.
> Using numbers from the Software Optimization Guide is certainly the way to go, but we need to ensure that the way GCC uses them actually results in better performance in practice.
Of course.
>> [PATCH] aarch64: Add the cost and scheduling models for Neoverse N1
>>
>> gcc/ChangeLog:
>>
>> * config/aarch64/aarch64-cores.def:
>> Use the Neoverse N1 scheduling and cost models, but only for itself.
>> * config/aarch64/aarch64.cc
>> (cortexa76_tunings): Rename variable.
>> (neoversen1_addrcost_table): New variable.
>> (neoversen1_vector_cost): Likewise.
>> (neoversen1_regmove_cost): Likewise.
>> (neoversen1_advsimd_vector_cost): Likewise.
>> (neoversen1_scalar_issue_info): Likewise.
>> (neoversen1_advsimd_issue_info): Likewise.
>> (neoversen1_vec_issue_info): Likewise.
>> (neoversen1_vector_cost): Likewise.
>> (neoversen1_tunings): Likewise.
>> * config/aarch64/aarch64.md: Include `neoverse-n1.md`.
>> * config/aarch64/neoverse-n1.md: New file.
>> * gcc/config/arm/aarch-cost-tables.h
>> (neoversen1_extra_costs): New variable.
>>
>> Signed-off-by: Evandro Menezes <evandro@gcc.gnu.org>
>>
>> ---
>> gcc/config/aarch64/aarch64-cores.def | 22 +-
>> gcc/config/aarch64/aarch64.cc | 155 +++++-
>> gcc/config/aarch64/aarch64.md | 1 +
>> gcc/config/aarch64/neoverse-n1.md | 716 +++++++++++++++++++++++++++
>> gcc/config/arm/aarch-cost-tables.h | 107 ++++
>> 5 files changed, 977 insertions(+), 24 deletions(-)
>> create mode 100644 gcc/config/aarch64/neoverse-n1.md
>>
>> diff --git a/gcc/config/aarch64/aarch64-cores.def
>> b/gcc/config/aarch64/aarch64-cores.def
>> index 2ec88c98400..cc842c4e22c 100644
>> --- a/gcc/config/aarch64/aarch64-cores.def
>> +++ b/gcc/config/aarch64/aarch64-cores.def
>> @@ -105,18 +105,18 @@ AARCH64_CORE("thunderx2t99", thunderx2t99,
>> thunderx2t99, V8_1A, (CRYPTO), thu
>> /* ARM ('A') cores. */
>> AARCH64_CORE("cortex-a55", cortexa55, cortexa53, V8_2A, (F16, RCPC,
>> DOTPROD), cortexa53, 0x41, 0xd05, -1)
>> AARCH64_CORE("cortex-a75", cortexa75, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD), cortexa73, 0x41, 0xd0a, -1)
>> -AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD), neoversen1, 0x41, 0xd0b, -1)
>> -AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16,
>> RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
>> -AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1)
>> -AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
>> -AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16,
>> RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
>> -AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
>> +AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD), cortexa76, 0x41, 0xd0b, -1)
>> +AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16,
>> RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
>> +AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
>> +AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
>> +AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16,
>> RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
>> +AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
>> AARCH64_CORE("cortex-a65", cortexa65, cortexa53, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1)
>> AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
>> -AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
>> -AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
>> -AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD,
>> PROFILE), neoversen1, 0x41, 0xd0c, -1)
>> -AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
>> +AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
>> +AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
>> +AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD,
>> PROFILE), cortexa76, 0x41, 0xd0c, -1)
>> +AARCH64_CORE("neoverse-n1", neoversen1, neoversen1, V8_2A, (F16,
>> RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
>> AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, V8_2A, (F16, RCPC,
>> DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
>>
>> /* Cavium ('C') cores. */
>> @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53",
>> cortexa73cortexa53, cortexa53, V8A, (CRC
>> /* ARM DynamIQ big.LITTLE configurations. */
>>
>> AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53,
>> V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE
>> (0xd0a, 0xd05), -1)
>> -AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53,
>> V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE
>> (0xd0b, 0xd05), -1)
>> +AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53,
>> V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE
>> (0xd0b, 0xd05), -1)
>
> I would expect that whatever tuning decisions GCC makes for Neoverse N1 would also be best for all the cores you've touched in this hunk.
> That is, we shouldn't have separate cortexa76 and neoversen1 tunings. Let's use neoversen1 tunings for all of these.
As you may notice, I renamed the old N1 tunings used by several cores to A76 and applied the new N1-specific tunings only to the N1 itself. Since I cannot test the new N1 tunings on the other cores, I thought it sensible to leave them on the tunings they already use.
>> /* Armv8-R Architecture Processors. */
>> AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41,
>> 0xd15, -1)
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 42617ced73a..071318c49f4 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -1867,7 +1867,7 @@ static const struct tune_params
>> thunderx3t110_tunings =
>> &thunderx3t110_prefetch_tune
>> };
>>
>> -static const struct tune_params neoversen1_tunings =
>> +static const struct tune_params cortexa76_tunings =
>> {
>> &cortexa76_extra_costs,
>> &generic_addrcost_table,
>> @@ -1885,18 +1885,18 @@ static const struct tune_params
>> neoversen1_tunings =
>> }, /* memmov_cost. */
>> 3, /* issue_rate */
>> (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /*
>> fusible_ops */
>> - "32:16", /* function_align. */
>> - "4", /* jump_align. */
>> - "32:16", /* loop_align. */
>> - 2, /* int_reassoc_width. */
>> - 4, /* fp_reassoc_width. */
>> - 1, /* fma_reassoc_width. */
>> - 2, /* vec_reassoc_width. */
>> - 2, /* min_div_recip_mul_sf. */
>> - 2, /* min_div_recip_mul_df. */
>> - 0, /* max_case_values. */
>> - tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
>> - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
>> + "32:16", /* function_align. */
>> + "4", /* jump_align. */
>> + "32:16", /* loop_align. */
>> + 2, /* int_reassoc_width. */
>> + 4, /* fp_reassoc_width. */
>> + 1, /* fma_reassoc_width. */
>> + 2, /* vec_reassoc_width. */
>> + 2, /* min_div_recip_mul_sf. */
>> + 2, /* min_div_recip_mul_df. */
>> + 0, /* max_case_values. */
>> + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
>> + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
>> &generic_prefetch_tune
>> };
>>
>> @@ -2293,6 +2293,135 @@ static const struct tune_params
>> neoverse512tvb_tunings =
>> &generic_prefetch_tune
>> };
>>
>> +static const struct cpu_addrcost_table neoversen1_addrcost_table =
>> +{
>> + {
>> + 0, /* hi */
>> + 0, /* si */
>> + 0, /* di */
>> + 1, /* ti */
>> + },
>> + 0, /* pre_modify */
>> + 0, /* post_modify */
>> + 1, /* post_modify_ld3_st3 */
>> + 1, /* post_modify_ld4_st4 */
>> + 0, /* register_offset */
>> + 0, /* register_sextend */
>> + 0, /* register_zextend */
>> + 0 /* imm_offset */
>> +};
>> +
>> +static const struct cpu_regmove_cost neoversen1_regmove_cost =
>> +{
>> + 1, /* GP2GP */
>> + /* Avoid the use of slow int<->fp moves for spilling by setting
>> + their cost higher than memmov_cost. */
>> + 3, /* GP2FP */
>> + 2, /* FP2GP */
>> + 2 /* FP2FP */
>> +};
>> +
>> +static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
>> +{
>> + 2, /* int_stmt_cost */
>> + 2, /* fp_stmt_cost */
>> + 0, /* ld2_st2_permute_cost */
>> + 0, /* ld3_st3_permute_cost */
>> + 0, /* ld4_st4_permute_cost */
>> + 3, /* permute_cost */
>> + 6, /* reduc_i8_cost */
>> + 5, /* reduc_i16_cost */
>> + 3, /* reduc_i32_cost */
>> + 3, /* reduc_i64_cost */
>> + 8, /* reduc_f16_cost */
>> + 5, /* reduc_f32_cost */
>> + 5, /* reduc_f64_cost */
>> + 0, /* store_elt_extra_cost */
>> + 2, /* vec_to_scalar_cost */
>> + 2, /* scalar_to_vec_cost */
>> + 4, /* align_load_cost */
>> + 4, /* unalign_load_cost */
>> + 1, /* unalign_store_cost */
>> + 1 /* store_cost */
>> +};
>> +
>> +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info =
>> +{
>> + 2, /* loads_stores_per_cycle */
>> + 2, /* stores_per_cycle */
>> + 2, /* general_ops_per_cycle */
>> + 0, /* fp_simd_load_general_ops */
>> + 1 /* fp_simd_store_general_ops */
>> +};
>> +
>> +static const aarch64_advsimd_vec_issue_info
>> neoversen1_advsimd_issue_info =
>> +{
>> + {
>> + 2, /* loads_stores_per_cycle */
>> + 2, /* stores_per_cycle */
>> + 2, /* general_ops_per_cycle */
>> + 0, /* fp_simd_load_general_ops */
>> + 1 /* fp_simd_store_general_ops */
>> + },
>> + 3, /* ld2_st2_general_ops */
>> + 5, /* ld3_st3_general_ops */
>> + 11 /* ld4_st4_general_ops */
>> +};
>> +
>> +static const aarch64_vec_issue_info neoversen1_vec_issue_info =
>> +{
>> + &neoversen1_scalar_issue_info, /* scalar */
>> + &neoversen1_advsimd_issue_info, /* advsimd */
>> + nullptr /* sve */
>> +};
>> +
>> +
>> +static const struct cpu_vector_cost neoversen1_vector_cost =
>> +{
>> + 1, /* scalar_int_stmt_cost */
>> + 1, /* scalar_fp_stmt_cost */
>> + 4, /* scalar_load_cost */
>> + 1, /* scalar_store_cost */
>> + 1, /* cond_taken_branch_cost */
>> + 1, /* cond_not_taken_branch_cost */
>> + &neoversen1_advsimd_vector_cost, /* advsimd */
>> + nullptr, /* sve */
>> + &neoversen1_vec_issue_info /* issue_info */
>> +};
>> +
>> +static const struct tune_params neoversen1_tunings =
>> +{
>> + &neoversen1_extra_costs,
>> + &neoversen1_addrcost_table,
>> + &neoversen1_regmove_cost,
>> + &neoversen1_vector_cost,
>> + &generic_branch_cost,
>> + &generic_approx_modes,
>> + SVE_NOT_IMPLEMENTED, /* sve_width */
>> + { 4, /* load_int. */
>> + 2, /* store_int. */
>> + 5, /* load_fp. */
>> + 2, /* store_fp. */
>> + 4, /* load_pred. */
>> + 4 /* store_pred. */
>> + }, /* memmov_cost. */
>> + 4, /* issue_rate */
>> + AARCH64_FUSE_AES_AESMC, /* fusible_ops */
>> + "32", /* function_align. */
>
> I guess this number worries me somewhat. Previously we used "32:16" here as using "32" would bloat code size too much by adding too many alignment nops around small functions.
> Do you have performance numbers to recommend this change?
At first, I did this to minimize run-time noise between comparisons caused by lucky alignments under the previous values. In the end, I measured, for instance, a 2% penalty in SPEC CPU2017. However, I do not have performance numbers for this change in isolation.
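For what it's worth, the effect of this field by itself can also be approximated without rebuilding the compiler, by overriding the tuning default on the command line (standard flags, sketched here for comparison runs):

    gcc -O2 -mcpu=neoverse-n1 -falign-functions=32:16 ...   # previous default
    gcc -O2 -mcpu=neoverse-n1 -falign-functions=32 ...      # value in this patch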
> More generally, the tuning structure changes and the scheduling model are two logically distinct and major changes and so should be split into separate patches.
> That would make it easier to evaluate the performance benefits of each individually.
Will do.
>> + "4", /* jump_align. */
>> + "32:16", /* loop_align. */
>> + 2, /* int_reassoc_width. */
>> + 4, /* fp_reassoc_width. */
>> + 1, /* fma_reassoc_width. */
>> + 2, /* vec_reassoc_width. */
>> + 2, /* min_div_recip_mul_sf. */
>> + 2, /* min_div_recip_mul_df. */
>> + 0, /* max_case_values. */
>> + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
>> + AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags. */
>> + &generic_prefetch_tune
>> +};
>> +
>> static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
>> {
>> 2, /* int_stmt_cost */
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 022eef80bc1..6cb9e31259b 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -471,6 +471,7 @@
>> (include "../arm/cortex-a57.md")
>> (include "../arm/exynos-m1.md")
>> (include "falkor.md")
>> +(include "neoverse-n1.md")
>> (include "saphira.md")
>> (include "thunderx.md")
>> (include "../arm/xgene1.md")
>> diff --git a/gcc/config/aarch64/neoverse-n1.md
>> b/gcc/config/aarch64/neoverse-n1.md
>
> For CPUs that support both aarch32 and aarch64 modes we usually put the pipeline description in config/arm/, but I appreciate that the vast majority of Neoverse N1 users mostly care about running aarch64 code, so I don't object to having it in config/aarch64/.
Thank you.
>> new file mode 100644
>> index 00000000000..e13c826d494
>> --- /dev/null
>> +++ b/gcc/config/aarch64/neoverse-n1.md
>> @@ -0,0 +1,716 @@
>> +;; ARM Neoverse N1 pipeline description
>
> Please use "Arm Neoverse N1" for the name.
Will do.
>> +;; (Based on the "Arm® Neoverse™ N1 Software Optimization Guide")
>> +;;
>> +;; Copyright (C) 2014-2023 Free Software Foundation, Inc.
>> +;;
>> +;; This file is part of GCC.
>> +;;
>> +;; GCC is free software; you can redistribute it and/or modify it
>> +;; under the terms of the GNU General Public License as published by
>> +;; the Free Software Foundation; either version 3, or (at your option)
>> +;; any later version.
>> +;;
>> +;; GCC is distributed in the hope that it will be useful, but
>> +;; WITHOUT ANY WARRANTY; without even the implied warranty of
>> +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> +;; General Public License for more details.
>> +;;
>> +;; You should have received a copy of the GNU General Public License
>> +;; along with GCC; see the file COPYING3. If not see
>> +;; <http://www.gnu.org/licenses/>.
>> +
>> +;; The Neoverse N1 core is modelled as a multiple issue pipeline that has
>> +;; the following functional units.
>> +
>> +(define_automaton "neoverse_n1")
>> +
>> +;; 1 - Two pipelines for integer operations: SX1, SX2.
>> +
>> +(define_cpu_unit "neon1_sx1_issue" "neoverse_n1")
>> +(define_reservation "neon1_sx1" "neon1_sx1_issue")
>> +
>> +(define_cpu_unit "neon1_sx2_issue" "neoverse_n1")
>> +(define_reservation "neon1_sx2" "neon1_sx2_issue")
>> +
>> +;; 2 - One pipeline for complex integer operations: MX.
>> +
>> +(define_cpu_unit "neon1_mx_issue"
>> + "neoverse_n1")
>> +(define_reservation "neon1_mx" "neon1_mx_issue")
>> +(define_reservation "neon1_m_block" "neon1_mx_issue")
>> +
>> +;; 3 - Two asymmetric pipelines for Neon and FP operations: CX1, CX2.
>> +(define_automaton "neoverse_n1_cx")
>> +
>> +(define_cpu_unit "neon1_cx1_issue"
>> + "neoverse_n1_cx")
>> +(define_cpu_unit "neon1_cx2_issue"
>> + "neoverse_n1_cx")
>> +
>> +(define_reservation "neon1_cx1" "neon1_cx1_issue")
>> +(define_reservation "neon1_cx2" "neon1_cx2_issue")
>> +(define_reservation "neon1_v0_block" "neon1_cx1_issue")
>> +
>> +;; 4 - One pipeline for branch operations: BX.
>> +
>> +(define_cpu_unit "neon1_bx_issue" "neoverse_n1")
>> +(define_reservation "neon1_bx" "neon1_bx_issue")
>> +
>> +;; 5 - Two pipelines for load and store operations: LS1, LS2.
>> +
>> +(define_cpu_unit "neon1_ls1_issue" "neoverse_n1")
>> +(define_reservation "neon1_ls1" "neon1_ls1_issue")
>> +
>> +(define_cpu_unit "neon1_ls2_issue" "neoverse_n1")
>> +(define_reservation "neon1_ls2" "neon1_ls2_issue")
>> +
>> +;; Block all issue queues.
>> +
>> +(define_reservation "neon1_block" "neon1_sx1_issue + neon1_sx2_issue
>> + + neon1_mx_issue
>> + + neon1_cx1_issue + neon1_cx2_issue
>> + + neon1_ls1_issue + neon1_ls2_issue")
>> +
>> +;; Issue groups.
>> +
>> +(define_reservation "neon1_b" "neon1_bx")
>> +(define_reservation "neon1_i" "(neon1_sx1 | neon1_sx2 | neon1_mx)")
>> +(define_reservation "neon1_m" "neon1_mx")
>> +(define_reservation "neon1_d" "(neon1_sx2 | neon1_mx)")
>> +(define_reservation "neon1_l" "(neon1_ls1 | neon1_ls2)")
>> +(define_reservation "neon1_v" "(neon1_cx1 | neon1_cx2)")
>> +(define_reservation "neon1_v0" "neon1_cx1")
>> +(define_reservation "neon1_v1" "neon1_cx2")
>> +
>> +;; Instruction resources.
>> +
>> +;; Block.
>> +(define_insn_reservation "neoverse_n1_block" 1
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "block"))
>> + "neon1_block")
>> +
>> +;; Branches
>> +;; No latency as there is no result.
>> +(define_insn_reservation "neoverse_n1_branch" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "branch"))
>> + "neon1_b")
>> +
>> +;; Calls
>> +;; No latency as there is no result.
>> +(define_insn_reservation "neoverse_n1_call" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "call"))
>> + "neon1_i + neon1_b")
>> +
>> +;; ALU with no or simple shift.
>> +;; TODO: there should also be "alus_shift_imm_lsl_1to4".
>> +(define_insn_reservation "neoverse_n1_alu" 1
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "alu_imm, alu_shift_imm_lsl_1to4, alu_sreg, \
>> + alus_imm, alus_sreg, \
>> + csel, \
>> + logic_imm, logic_reg, logic_shift_imm, \
>> + logics_imm, logics_reg, \
>> + mov_reg"))
>> + "neon1_i")
>> +
>> +;; ALU with extension or complex shift.
>> +;; TODO: there should also be "alus_shift_imm_other".
>> +(define_insn_reservation "neoverse_n1_alu_shift" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "alu_ext, alu_shift_imm_other, alu_shift_reg, \
>> + alus_shift_imm, alus_shift_reg, \
>> + logic_shift_reg, logics_shift_imm, logics_shift_reg, \
>> + crc"))
>> + "neon1_m")
>> +
>> +;; Miscellaneous ALU.
>> +;; TODO: model 2-register "extr", "bfi", variable shifts.
>> +(define_insn_reservation "neoverse_n1_alu_misc" 1
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "adr, rotate_imm, bfm, clz, mov_imm, rbit, rev"))
>> + "neon1_i")
>> +
>> +;; Integer divide.
>> +;; Divisions are not pipelined.
>> +(define_insn_reservation "neoverse_n1_div" 12
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "udiv, sdiv"))
>> + "neon1_m, (neon1_m_block * 12)")
>> +
>> +;; Narrow multiply.
>> +(define_insn_reservation "neoverse_n1_mul" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "mla, mul"))
>> + "neon1_m")
>> +
>> +;; Wide multiply.
>> +;; TODO: model multiply high.
>> +(define_insn_reservation "neoverse_n1_mull" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "smull, umull"))
>> + "neon1_m")
>> +
>> +;; Integer load.
>> +;; TODO: model load pairs fully.
>> +(define_insn_reservation "neoverse_n1_ld" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "load_byte, load_4, load_8"))
>> + "neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_ld16" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "load_16"))
>> + "neon1_l * 2")
>> +
>> +;; Integer store.
>> +;; TODO: model store pairs fully.
>> +(define_insn_reservation "neoverse_n1_st" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "store_4, store_8"))
>> + "neon1_d, neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_stp" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "store_16"))
>> + "neon1_i, (neon1_l * 2)")
>> +
>> +;; FP arithmetic.
>> +(define_insn_reservation "neoverse_n1_fp_alu" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_minmaxd, f_minmaxs, \
>> + faddd, fadds, \
>> + fconstd, fconsts, \
>> + fcsel, \
>> + ffarithd, ffariths, \
>> + fmov"))
>> + "neon1_v")
>> +
>> +;; FP compare.
>> +(define_insn_reservation "neoverse_n1_fp_cmp" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fcmpd, fcmps, fccmpd, fccmps"))
>> + "neon1_v0")
>> +
>> +;; FP round.
>> +(define_insn_reservation "neoverse_n1_fp_rint" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_rintd, f_rints"))
>> + "neon1_v0")
>> +
>> +;; FP divide & square-root.
>> +;; Divisions are not pipelined.
>> +;; TODO: model half-precision.
>> +(define_insn_reservation "neoverse_n1_fp_divd" 15
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fdivd, fsqrtd"))
>> + "neon1_v0, (neon1_v0_block * 15)")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_divs" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fdivs, fsqrts"))
>> + "neon1_v0, (neon1_v0_block * 10)")
>> +
>> +;; FP multiply.
>> +(define_insn_reservation "neoverse_n1_fp_mul" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fmuld, fmuls"))
>> + "neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_mac" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fmacd, fmacs"))
>> + "neon1_v")
>> +
>> +;; FP convert.
>> +(define_insn_reservation "neoverse_n1_fp_cvt" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_cvt"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_cvti2f" 6
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_cvti2f"))
>> + "neon1_m + neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_cvtf2i" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_cvtf2i"))
>> + "neon1_v0 + neon1_v1")
>> +
>> +;; FP move.
>> +(define_insn_reservation "neoverse_n1_fp_mov" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "fconstd, fconsts, \
>> + fmov"))
>> + "neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_movi2f" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_mcr"))
>> + "neon1_m")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_movf2i" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_mrc, \
>> + neon_to_gp, neon_to_gp_q"))
>> + "neon1_v1")
>> +
>> +;; FP load.
>> +(define_insn_reservation "neoverse_n1_fp_ld" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_loadd, f_loads"))
>> + "neon1_i, neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_ldp" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_ldp"))
>> + "neon1_i, (neon1_l * 2)")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_ldp_q" 7
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_ldp_q"))
>> + "neon1_i, (neon1_l * 2)")
>> +
>> +;; FP store.
>> +(define_insn_reservation "neoverse_n1_fp_st" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "f_stored, f_stores"))
>> + "neon1_i, neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_stp" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_stp"))
>> + "neon1_l + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_fp_stp_q" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_stp_q"))
>> + "(neon1_l * 2) + neon1_v")
>> +
>> +;; ASIMD arithmetic.
>> +(define_insn_reservation "neoverse_n1_asimd_abd_long" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_abd_long"))
>> + "neon1_v1")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_alu" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_abd, neon_abd_q, \
>> + neon_abs, neon_abs_q, \
>> + neon_add, neon_add_q, \
>> + neon_add_halve, neon_add_halve_q, \
>> + neon_add_halve_narrow_q, \
>> + neon_add_long, neon_add_widen, \
>> + neon_bsl, neon_bsl_q, \
>> + neon_cls, neon_cls_q, \
>> + neon_compare, neon_compare_q, \
>> + neon_compare_zero, neon_compare_zero_q, \
>> + neon_dot, neon_dot_q, \
>> + neon_dup, neon_dup_q, \
>> + neon_ext, neon_ext_q, \
>> + neon_ins, neon_ins_q, \
>> + neon_logic, neon_logic_q, \
>> + neon_minmax, neon_minmax_q, \
>> + neon_move, neon_move_q, \
>> + neon_move_narrow_q, \
>> + neon_neg, neon_neg_q, \
>> + neon_permute, neon_permute_q, \
>> + neon_qabs, neon_qabs_q, \
>> + neon_qadd, neon_qadd_q, \
>> + neon_qneg, neon_qneg_q, \
>> + neon_qsub, neon_qsub_q, \
>> + neon_rbit, neon_rbit_q, \
>> + neon_reduc_add, neon_reduc_add_q, \
>> + neon_rev, neon_rev_q, \
>> + neon_sub, neon_sub_q, \
>> + neon_sub_halve, neon_sub_halve_q, \
>> + neon_sub_halve_narrow_q, \
>> + neon_sub_widen, neon_sub_long, \
>> + neon_tbl1, neon_tbl1_q, \
>> + neon_tbl2, neon_tbl2_q"))
>> + "neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_arith_acc" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_arith_acc"))
>> + "neon1_v1")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_shift_acc_q" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_shift_acc_q"))
>> + "neon1_v1")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_reduc" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_reduc_add_long, \
>> + neon_reduc_minmax, neon_reduc_minmax_q"))
>> + "neon1_v1")
>> +
>> +;; ASIMD multiply.
>> +(define_insn_reservation "neoverse_n1_asimd_mla" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_mla_b, neon_mla_b_long, \
>> + neon_mla_h, neon_mla_h_long, \
>> + neon_mla_h_scalar, neon_mla_h_scalar_long, \
>> + neon_mla_s, neon_mla_s_long, \
>> + neon_mla_s_scalar, neon_mla_s_scalar_long"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_mla_q" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_mla_b_q,
>> + neon_mla_h_q, neon_mla_h_scalar_q, \
>> + neon_mla_s_q, neon_mla_s_scalar_q"))
>> + "neon1_v0 * 2")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_mul" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_mul_b, neon_mul_b_long, \
>> + neon_mul_h, neon_mul_h_long, \
>> + neon_mul_s, neon_mul_s_long, \
>> + neon_sat_mul_b, neon_sat_mul_b_long, \
>> + neon_sat_mul_h, neon_sat_mul_h_long, \
>> + neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_long, \
>> + neon_sat_mul_s, neon_sat_mul_s_long, \
>> + neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_long"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_mul_q" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_mul_b_q, neon_mul_h_q, neon_mul_s_q, \
>> + neon_sat_mul_b_q, \
>> + neon_sat_mul_h_q, neon_sat_mul_h_scalar_q, \
>> + neon_sat_mul_s_q, neon_sat_mul_s_scalar_q"))
>> + "neon1_v0 * 2")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_sat_mla" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_sat_mla_b_long, \
>> + neon_sat_mla_h_long, neon_sat_mla_h_scalar_long, \
>> + neon_sat_mla_s_long, neon_sat_mla_s_scalar_long"))
>> + "neon1_v0")
>> +
>> +;; ASIMD shift.
>> +(define_insn_reservation "neoverse_n1_asimd_shift" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,
>> neon_shift_imm_long, \
>> + neon_shift_reg, neon_shift_reg_q"))
>> + "neon1_v1")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_shift_q" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q, \
>> + neon_sat_shift_imm_narrow_q, \
>> + neon_sat_shift_reg, neon_sat_shift_reg_q, \
>> + neon_shift_imm_narrow_q"))
>> + "neon1_v1")
>> +
>> +;; ASIMD FP arithmetic.
>> +(define_insn_reservation "neoverse_n1_asimd_fp_alu" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_abd_d, neon_fp_abd_d_q, \
>> + neon_fp_abd_s, neon_fp_abd_s_q, \
>> + neon_fp_abs_d, neon_fp_abs_d_q, \
>> + neon_fp_abs_s, neon_fp_abs_s_q, \
>> + neon_fp_addsub_d, neon_fp_addsub_d_q, \
>> + neon_fp_addsub_s, neon_fp_addsub_s_q, \
>> + neon_fp_compare_d, neon_fp_compare_d_q, \
>> + neon_fp_compare_s, neon_fp_compare_s_q, \
>> + neon_fp_minmax_d, neon_fp_minmax_d_q, \
>> + neon_fp_minmax_s, neon_fp_minmax_s_q, \
>> + neon_fp_neg_d, neon_fp_neg_d_q, \
>> + neon_fp_neg_s, neon_fp_neg_s_q, \
>> + neon_fp_reduc_add_d, neon_fp_reduc_add_d_q, \
>> + neon_fp_reduc_add_s, neon_fp_reduc_add_s_q"))
>> + "neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_reduc" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_reduc_minmax_d,
>> neon_fp_reduc_minmax_d_q, \
>> + neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q"))
>> + "neon1_v")
>> +
>> +;; ASIMD FP convert.
>> +(define_insn_reservation "neoverse_n1_asimd_cvt" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_cvt_narrow_d_q, \
>> + neon_fp_cvt_widen_s, \
>> + neon_fp_to_int_d, neon_fp_to_int_d_q, \
>> + neon_fp_to_int_s, \
>> + neon_int_to_fp_d, neon_int_to_fp_d_q, \
>> + neon_int_to_fp_s, \
>> + neon_fp_recpe_d, neon_fp_recpe_s, \
>> + neon_fp_recpx_d, neon_fp_recpx_s, \
>> + neon_fp_round_d, neon_fp_round_d_q, \
>> + neon_fp_round_s"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_cvt_q" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_cvt_narrow_s_q, \
>> + neon_fp_cvt_widen_h, \
>> + neon_fp_to_int_s_q, \
>> + neon_int_to_fp_s_q, \
>> + neon_fp_recpe_d_q, neon_fp_recpe_s_q, \
>> + neon_fp_recpx_d_q, neon_fp_recpx_s_q, \
>> + neon_fp_round_s_q"))
>> + "neon1_v0 * 2")
>> +
>> +;; ASIMD FP divide & square-root.
>> +;; Divisions are not pipelined.
>> +;; TODO: model half-precision.
>> +(define_insn_reservation "neoverse_n1_asimd_fp_divd_q" 15
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_div_d_q"))
>> + "neon1_v0, (neon1_v0_block * 14)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_divs" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_div_s"))
>> + "neon1_v0, (neon1_v0_block * 5)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_divs_q" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_div_s_q"))
>> + "neon1_v0, (neon1_v0_block * 9)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrtd_q" 17
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_sqrt_d_q"))
>> + "neon1_v0, (neon1_v0_block * 16)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrts" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_sqrt_s"))
>> + "neon1_v0, (neon1_v0_block * 5)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_fp_sqrts_q" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_sqrt_s_q"))
>> + "neon1_v0, (neon1_v0_block * 9)")
>> +
>> +;; ASIMD FP multiply.
>> +(define_insn_reservation "neoverse_n1_asimd_fp_mul" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_mul_d, neon_fp_mul_d_q,
>> neon_fp_mul_d_scalar_q, \
>> + neon_fp_mul_s, neon_fp_mul_s_q,
>> neon_fp_mul_s_scalar_q"))
>> + "neon1_v")
>> +
>> +;; TODO: model the long form.
>> +(define_insn_reservation "neoverse_n1_asimd_fp_mla" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_fp_mla_d, neon_fp_mla_d_q,
>> neon_fp_mla_d_scalar_q, \
>> + neon_fp_mla_s, neon_fp_mla_s_q, neon_fp_mla_s_scalar_q, \
>> + neon_fp_recps_d, neon_fp_recps_d_q, \
>> + neon_fp_recps_s, neon_fp_recps_s_q"))
>> + "neon1_v")
>> +
>> +;; ASIMD miscellaneous.
>> +(define_insn_reservation "neoverse_n1_asimd_gp_fp" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_from_gp, neon_from_gp_q"))
>> + "neon1_m")
>> +
>> +;; TODO: model "tbx" fully.
>> +(define_insn_reservation "neoverse_n1_asimd_tbl_3" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_tbl3, neon_tbl3_q"))
>> + "neon1_v * 4")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_tbl_4" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_tbl4, neon_tbl4_q"))
>> + "neon1_v * 6")
>> +
>> +;; ASIMD load.
>> +(define_insn_reservation "neoverse_n1_asimd_ld_a" 5
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q, \
>> + neon_load1_2reg, neon_load1_2reg_q"))
>> + "neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_b" 6
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q"))
>> + "neon1_l * 3")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_c" 6
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q"))
>> + "neon1_l * 4")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_d" 7
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q, \
>> + neon_load1_one_lane, neon_load1_one_lane_q"))
>> + "neon1_l + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_e" 7
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q, \
>> + neon_load2_all_lanes, neon_load2_all_lanes_q, \
>> + neon_load2_one_lane, neon_load2_one_lane_q"))
>> + "(neon1_l * 2) + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_f" 8
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q, \
>> + neon_load4_all_lanes, neon_load4_all_lanes_q, \
>> + neon_load4_one_lane, neon_load4_one_lane_q"))
>> + "(neon1_l * 4) + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_g" 7
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q, \
>> + neon_load3_one_lane, neon_load3_one_lane_q"))
>> + "(neon1_l * 4) + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_h" 8
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load4_4reg"))
>> + "(neon1_l * 7) + neon1_v")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_ld_i" 10
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_load4_4reg_q"))
>> + "(neon1_l * 10) + neon1_v")
>> +
>> +;; ASIMD store.
>> +(define_insn_reservation "neoverse_n1_asimd_st_a" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q, \
>> + neon_store1_2reg"))
>> + "neon1_v + neon1_l")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_b" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_1reg_q, \
>> + neon_store1_2reg"))
>> + "neon1_v + (neon1_l * 2)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_c" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_2reg_q, \
>> + neon_store1_4reg"))
>> + "neon1_v + (neon1_l * 4)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_d" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_3reg"))
>> + "neon1_v + (neon1_l * 3)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_e" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_3reg_q"))
>> + "neon1_v + (neon1_l * 6)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_f" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_4reg_q"))
>> + "neon1_v + (neon1_l * 8)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_g" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q, \
>> + neon_store2_2reg, \
>> + neon_store2_one_lane, neon_store2_one_lane_q"))
>> + "neon1_v + (neon1_l * 2)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_h" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store2_2reg_q, \
>> + neon_store3_3reg, \
>> + neon_store3_one_lane_q"))
>> + "neon1_v + (neon1_l * 4)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_i" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store3_3reg_q"))
>> + "neon1_v + (neon1_l * 6)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_j" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store3_one_lane"))
>> + "neon1_v + (neon1_l * 4)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_k" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store4_4reg"))
>> + "neon1_v + (neon1_l * 6)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_l" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store4_4reg_q"))
>> + "neon1_v + (neon1_l * 12)")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_st_m" 0
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q"))
>> + "neon1_v + (neon1_l * 3)")
>> +
>> +;; ASIMD crypto.
>> +;; TODO: model different widths.
>> +(define_insn_reservation "neoverse_n1_asimd_aese" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crypto_aese"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_aesmc" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crypto_aesmc"))
>> + "neon1_v0")
>> +
>> +;; FIXME: "sha256u1" should be "crypto_sha256_fast".
>> +(define_insn_reservation "neoverse_n1_asimd_sha_fast" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crypto_sha1_fast, crypto_sha1_xor,
>> + crypto_sha256_fast"))
>> + "neon1_v0")
>> +
>> +(define_insn_reservation "neoverse_n1_asimd_sha_slow" 4
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
>> + "neon1_v0")
>> +
>> +;; FIXME: "pmull" sometimes is also
>> "neon_mul_{b,h,s}(_scalar)?(_(q|long))?"
>> +(define_insn_reservation "neoverse_n1_asimd_poly" 3
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crypto_pmull"))
>> + "neon1_v0")
>> +
>> +;; CRC
>> +(define_insn_reservation "neoverse_n1_crc" 2
>> + (and (eq_attr "tune" "neoversen1")
>> + (eq_attr "type" "crc"))
>> + "neon1_m")
>> +
>> +;; Bypasses.
>> +;; TODO: Model region forwarding.
>> +
>> +;; Integer multiply.
>> +;; TODO: model the X and high forms.
>> +(define_bypass 1 "neoverse_n1_mul, neoverse_n1_mull"
>> + "neoverse_n1_mul, neoverse_n1_mull")
>> +
>> +;; FP multiply.
>> +(define_bypass 2 "neoverse_n1_fp_mul" "neoverse_n1_fp_mul")
>> +(define_bypass 2 "neoverse_n1_fp_mac" "neoverse_n1_fp_mac")
>> +
>> +;; ASIMD arithmetic.
>> +(define_bypass 1 "neoverse_n1_asimd_arith_acc"
>> "neoverse_n1_asimd_arith_acc")
>> +(define_bypass 1 "neoverse_n1_asimd_shift_acc_q"
>> "neoverse_n1_asimd_shift_acc_q")
>> +
>> +;; ASIMD multiply.
>> +(define_bypass 1 "neoverse_n1_asimd_mla" "neoverse_n1_asimd_mla")
>> +(define_bypass 2 "neoverse_n1_asimd_mla_q"
>> "neoverse_n1_asimd_mla_q")
>> +
>> +;; ASIMD FP multiply.
>> +(define_bypass 2 "neoverse_n1_asimd_fp_mul"
>> "neoverse_n1_asimd_fp_mul")
>> +(define_bypass 2 "neoverse_n1_asimd_fp_mla"
>> "neoverse_n1_asimd_fp_mla")
>> +
>> +;; CRC
>> +(define_bypass 1 "neoverse_n1_crc" "neoverse_n1_*")
>
> I haven't gone through the model in great detail, but I've noticed that there are quite a few TODO: markers.
> Are there plans to address those? I'd be hesitant to take a model with important gaps in it, as the quality of the schedule would be degraded if the compiler cannot reason about significant parts of the ISA.
They are mostly notes for future work. Some require more granular instruction types; others require new costs in the tuning structures to handle specific data types or particular operations, much as is already done for SI or DI modes in some cases.
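For example, the "TODO: model multiply high" note would need a dedicated "type" attribute value before a reservation can be written for it. A hypothetical sketch of what such a follow-up could look like (the "mul_high" type and the numbers below are placeholders, not figures from the optimization guide):

  ;; Assumes a new "mul_high" value were added to the "type" attribute
  ;; and set on the smulh/umulh patterns; the latency and the extra
  ;; cycle on the M pipeline here are illustrative only.
  (define_insn_reservation "neoverse_n1_mul_high" 5
    (and (eq_attr "tune" "neoversen1")
         (eq_attr "type" "mul_high"))
    "neon1_m * 2")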
@@ -105,18 +105,18 @@ AARCH64_CORE("thunderx2t99", thunderx2t99, thunderx2t99, V8_1A, (CRYPTO), thu
/* ARM ('A') cores. */
AARCH64_CORE("cortex-a55", cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa53, 0x41, 0xd05, -1)
AARCH64_CORE("cortex-a75", cortexa75, cortexa57, V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, 0xd0a, -1)
-AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1)
-AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
-AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1)
-AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
-AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
-AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
+AARCH64_CORE("cortex-a76", cortexa76, cortexa57, V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, 0xd0b, -1)
+AARCH64_CORE("cortex-a76ae", cortexa76ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
+AARCH64_CORE("cortex-a77", cortexa77, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
+AARCH64_CORE("cortex-a78", cortexa78, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
+AARCH64_CORE("cortex-a78ae", cortexa78ae, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
+AARCH64_CORE("cortex-a78c", cortexa78c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
AARCH64_CORE("cortex-a65", cortexa65, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1)
AARCH64_CORE("cortex-a65ae", cortexa65ae, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
-AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
-AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
-AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
-AARCH64_CORE("neoverse-n1", neoversen1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
+AARCH64_CORE("cortex-x1", cortexx1, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
+AARCH64_CORE("cortex-x1c", cortexx1c, cortexa57, V8_2A, (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
+AARCH64_CORE("ares", ares, cortexa57, V8_2A, (F16, RCPC, DOTPROD, PROFILE), cortexa76, 0x41, 0xd0c, -1)
+AARCH64_CORE("neoverse-n1", neoversen1, neoversen1, V8_2A, (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
AARCH64_CORE("neoverse-e1", neoversee1, cortexa53, V8_2A, (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
/* Cavium ('C') cores. */
@@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53", cortexa73cortexa53, cortexa53, V8A, (CRC
/* ARM DynamIQ big.LITTLE configurations. */
AARCH64_CORE("cortex-a75.cortex-a55", cortexa75cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1)
-AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
+AARCH64_CORE("cortex-a76.cortex-a55", cortexa76cortexa55, cortexa53, V8_2A, (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
/* Armv8-R Architecture Processors. */
AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, 0xd15, -1)
@@ -1867,7 +1867,7 @@ static const struct tune_params thunderx3t110_tunings =
&thunderx3t110_prefetch_tune
};
-static const struct tune_params neoversen1_tunings =
+static const struct tune_params cortexa76_tunings =
{
&cortexa76_extra_costs,
&generic_addrcost_table,
@@ -1885,18 +1885,18 @@ static const struct tune_params neoversen1_tunings =
}, /* memmov_cost. */
3, /* issue_rate */
(AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
- "32:16", /* function_align. */
- "4", /* jump_align. */
- "32:16", /* loop_align. */
- 2, /* int_reassoc_width. */
- 4, /* fp_reassoc_width. */
- 1, /* fma_reassoc_width. */
- 2, /* vec_reassoc_width. */
- 2, /* min_div_recip_mul_sf. */
- 2, /* min_div_recip_mul_df. */
- 0, /* max_case_values. */
- tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
+ "32:16", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
&generic_prefetch_tune
};
@@ -2293,6 +2293,135 @@ static const struct tune_params neoverse512tvb_tunings =
&generic_prefetch_tune
};
+static const struct cpu_addrcost_table neoversen1_addrcost_table =
+{
+ {
+ 0, /* hi */
+ 0, /* si */
+ 0, /* di */
+ 1, /* ti */
+ },
+ 0, /* pre_modify */
+ 0, /* post_modify */
+ 1, /* post_modify_ld3_st3 */
+ 1, /* post_modify_ld4_st4 */
+ 0, /* register_offset */
+ 0, /* register_sextend */
+ 0, /* register_zextend */
+ 0 /* imm_offset */
+};
+
+static const struct cpu_regmove_cost neoversen1_regmove_cost =
+{
+ 1, /* GP2GP */
+ /* Avoid the use of slow int<->fp moves for spilling by setting
+ their cost higher than memmov_cost. */
+ 3, /* GP2FP */
+ 2, /* FP2GP */
+ 2 /* FP2FP */
+};
+
+static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
+{
+ 2, /* int_stmt_cost */
+ 2, /* fp_stmt_cost */
+ 0, /* ld2_st2_permute_cost */
+ 0, /* ld3_st3_permute_cost */
+ 0, /* ld4_st4_permute_cost */
+ 3, /* permute_cost */
+ 6, /* reduc_i8_cost */
+ 5, /* reduc_i16_cost */
+ 3, /* reduc_i32_cost */
+ 3, /* reduc_i64_cost */
+ 8, /* reduc_f16_cost */
+ 5, /* reduc_f32_cost */
+ 5, /* reduc_f64_cost */
+ 0, /* store_elt_extra_cost */
+ 2, /* vec_to_scalar_cost */
+ 2, /* scalar_to_vec_cost */
+ 4, /* align_load_cost */
+ 4, /* unalign_load_cost */
+ 1, /* unalign_store_cost */
+ 1 /* store_cost */
+};
+
+static const aarch64_base_vec_issue_info neoversen1_scalar_issue_info =
+{
+ 2, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+};
+
+static const aarch64_simd_vec_issue_info neoversen1_advsimd_issue_info =
+{
+ {
+ 2, /* loads_stores_per_cycle */
+ 2, /* stores_per_cycle */
+ 2, /* general_ops_per_cycle */
+ 0, /* fp_simd_load_general_ops */
+ 1 /* fp_simd_store_general_ops */
+ },
+ 3, /* ld2_st2_general_ops */
+ 5, /* ld3_st3_general_ops */
+ 11 /* ld4_st4_general_ops */
+};
+
+static const aarch64_vec_issue_info neoversen1_vec_issue_info =
+{
+ &neoversen1_scalar_issue_info, /* scalar */
+ &neoversen1_advsimd_issue_info, /* advsimd */
+ nullptr /* sve */
+};
+
+static const struct cpu_vector_cost neoversen1_vector_cost =
+{
+ 1, /* scalar_int_stmt_cost */
+ 1, /* scalar_fp_stmt_cost */
+ 4, /* scalar_load_cost */
+ 1, /* scalar_store_cost */
+ 1, /* cond_taken_branch_cost */
+ 1, /* cond_not_taken_branch_cost */
+ &neoversen1_advsimd_vector_cost, /* advsimd */
+ nullptr, /* sve */
+ &neoversen1_vec_issue_info /* issue_info */
+};
+
+static const struct tune_params neoversen1_tunings =
+{
+ &neoversen1_extra_costs,
+ &neoversen1_addrcost_table,
+ &neoversen1_regmove_cost,
+ &neoversen1_vector_cost,
+ &generic_branch_cost,
+ &generic_approx_modes,
+ SVE_NOT_IMPLEMENTED, /* sve_width */
+ { 4, /* load_int. */
+ 2, /* store_int. */
+ 5, /* load_fp. */
+ 2, /* store_fp. */
+ 4, /* load_pred. */
+ 4 /* store_pred. */
+ }, /* memmov_cost. */
+ 4, /* issue_rate */
+ AARCH64_FUSE_AES_AESMC, /* fusible_ops */
+ "32", /* function_align. */
+ "4", /* jump_align. */
+ "32:16", /* loop_align. */
+ 2, /* int_reassoc_width. */
+ 4, /* fp_reassoc_width. */
+ 1, /* fma_reassoc_width. */
+ 2, /* vec_reassoc_width. */
+ 2, /* min_div_recip_mul_sf. */
+ 2, /* min_div_recip_mul_df. */
+ 0, /* max_case_values. */
+ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
+ AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags. */
+ &generic_prefetch_tune
+};
+
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
2, /* int_stmt_cost */
@@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs =
}
};
+const struct cpu_cost_table neoversen1_extra_costs =
+{
+ /* ALU */
+ {
+ 0, /* arith. */
+ 0, /* logical. */
+ 0, /* shift. */
+ 0, /* shift_reg. */
+ COSTS_N_INSNS (1), /* arith_shift. */
+ COSTS_N_INSNS (1), /* arith_shift_reg. */
+ 0, /* log_shift. */
+ COSTS_N_INSNS (1), /* log_shift_reg. */
+ 0, /* extend. */
+ COSTS_N_INSNS (1), /* extend_arith. */
+ COSTS_N_INSNS (1), /* bfi. */
+ 0, /* bfx. */
+ 0, /* clz. */
+ 0, /* rev. */
+ 0, /* non_exec. */
+ true /* non_exec_costs_exec. */
+ },
+ {
+ /* MULT SImode */
+ {
+ COSTS_N_INSNS (1), /* simple. */
+ COSTS_N_INSNS (2), /* flag_setting. */
+ COSTS_N_INSNS (1), /* extend. */
+ COSTS_N_INSNS (1), /* add. */
+ COSTS_N_INSNS (1), /* extend_add. */
+ COSTS_N_INSNS (11) /* idiv. */
+ },
+ /* MULT DImode */
+ {
+ COSTS_N_INSNS (3), /* simple. */
+ 0, /* flag_setting (N/A). */
+ COSTS_N_INSNS (1), /* extend. */
+ COSTS_N_INSNS (3), /* add. */
+ COSTS_N_INSNS (1), /* extend_add. */
+ COSTS_N_INSNS (19) /* idiv. */
+ }
+ },
+ /* LD/ST */
+ {
+ COSTS_N_INSNS (3), /* load. */
+ COSTS_N_INSNS (3), /* load_sign_extend. */
+ COSTS_N_INSNS (3), /* ldrd. */
+ COSTS_N_INSNS (2), /* ldm_1st. */
+ 1, /* ldm_regs_per_insn_1st. */
+ 2, /* ldm_regs_per_insn_subsequent. */
+ COSTS_N_INSNS (4), /* loadf. */
+ COSTS_N_INSNS (4), /* loadd. */
+ COSTS_N_INSNS (3), /* load_unaligned. */
+ 0, /* store. */
+ 0, /* strd. */
+ 0, /* stm_1st. */
+ 1, /* stm_regs_per_insn_1st. */
+ 2, /* stm_regs_per_insn_subsequent. */
+ 0, /* storef. */
+ 0, /* stored. */
+ COSTS_N_INSNS (1), /* store_unaligned. */
+ COSTS_N_INSNS (1), /* loadv. */
+ COSTS_N_INSNS (1) /* storev. */
+ },
+ {
+ /* FP SFmode */
+ {
+ COSTS_N_INSNS (9), /* div. */
+ COSTS_N_INSNS (2), /* mult. */
+ COSTS_N_INSNS (3), /* mult_addsub. */
+ COSTS_N_INSNS (3), /* fma. */
+ COSTS_N_INSNS (1), /* addsub. */
+ COSTS_N_INSNS (1), /* fpconst. */
+ 0, /* neg. */
+ 0, /* compare. */
+ COSTS_N_INSNS (1), /* widen. */
+ COSTS_N_INSNS (1), /* narrow. */
+ COSTS_N_INSNS (1), /* toint. */
+ COSTS_N_INSNS (1), /* fromint. */
+ COSTS_N_INSNS (1) /* roundint. */
+ },
+ /* FP DFmode */
+ {
+ COSTS_N_INSNS (14), /* div. */
+ COSTS_N_INSNS (2), /* mult. */
+ COSTS_N_INSNS (3), /* mult_addsub. */
+ COSTS_N_INSNS (3), /* fma. */
+ COSTS_N_INSNS (1), /* addsub. */
+ COSTS_N_INSNS (1), /* fpconst. */
+ 0, /* neg. */
+ 0, /* compare. */
+ COSTS_N_INSNS (1), /* widen. */
+ COSTS_N_INSNS (1), /* narrow. */
+ COSTS_N_INSNS (1), /* toint. */
+ COSTS_N_INSNS (1), /* fromint. */
+ COSTS_N_INSNS (1) /* roundint. */
+ }
+ },
+ /* Vector */
+ {
+ COSTS_N_INSNS (1), /* alu. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (1), /* dup. */
+ COSTS_N_INSNS (1) /* extract. */
+ }
+};
+
const struct cpu_cost_table exynosm1_extra_costs =
{
/* ALU */