aarch64: Add the cost model for Neoverse N1

Message ID 6A93A02F-3719-4751-9055-C774F8FC1D78@icloud.com
State Corrupt patch
Headers
Series aarch64: Add the cost model for Neoverse N1 |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Evandro Menezes April 18, 2023, 9:41 p.m. UTC
  This patch adds the cost model for Neoverse N1, based on the information from the "Arm Neoverse N1 Software Optimization Guide".
  

Comments

Tamar Christina April 24, 2023, 5:37 p.m. UTC | #1
Hi Evandro,

I wanted to give this patch a try, but the diff seems corrupt: the whitespace at the start of the context lines seems to have gone missing.

Could you try resending it?

Thanks,
Tamar

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Evandro
> Menezes via Gcc-patches
> Sent: Tuesday, April 18, 2023 10:42 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Evandro Menezes <ebahapo@icloud.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH] aarch64: Add the cost model for Neoverse N1
> 
> This patch adds the cost model for Neoverse N1, based on the information
> from the "Arm Neoverse N1 Software Optimization Guide".
> 
> --
> Evandro Menezes
> 
> ===================================================================
> =============
> 
> gcc/ChangeLog:
> 
>        * config/aarch64/aarch64-cores.def: Use the Neoverse N1 cost model.
>        * config/aarch64/aarch64.cc
>        (cortexa76_tunings): Rename from neoversen1_tunings.
>        (neoversen1_addrcost_table): New variable.
>        (neoversen1_vector_cost): Likewise.
>        (neoversen1_regmove_cost): Likewise.
>        (neoversen1_advsimd_vector_cost): Likewise.
>        (neoversen1_scalar_issue_info): Likewise.
>        (neoversen1_advsimd_issue_info): Likewise.
>        (neoversen1_vec_issue_info): Likewise.
>        (neoversen1_vector_cost): Likewise.
>        (neoversen1_tunings): Likewise.
>        * config/arm/aarch-cost-tables.h
>        (neoversen1_extra_costs): New variable.
> 
> Signed-off-by: Evandro Menezes <evandro@gcc.gnu.org>
> ---
> gcc/config/aarch64/aarch64-cores.def |  20 ++--
> gcc/config/aarch64/aarch64.cc        | 155 ++++++++++++++++++++++++---
> gcc/config/arm/aarch-cost-tables.h   | 107 ++++++++++++++++++
> 3 files changed, 259 insertions(+), 23 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-cores.def
> b/gcc/config/aarch64/aarch64-cores.def
> index 2ec88c98400..e352e4077b1 100644
> --- a/gcc/config/aarch64/aarch64-cores.def
> +++ b/gcc/config/aarch64/aarch64-cores.def
> @@ -105,17 +105,17 @@ AARCH64_CORE("thunderx2t99",  thunderx2t99,
> thunderx2t99, V8_1A,  (CRYPTO), thu
> /* ARM ('A') cores. */
> AARCH64_CORE("cortex-a55",  cortexa55, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD), cortexa53, 0x41, 0xd05, -1) AARCH64_CORE("cortex-a75",
> cortexa75, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41,
> 0xd0a, -1) -AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1) -
> AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16,
> RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1) -
> AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1) -AARCH64_CORE("cortex-
> a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE),
> neoversen1, 0x41, 0xd41, -1) -AARCH64_CORE("cortex-a78ae",  cortexa78ae,
> cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41,
> 0xd42, -1) -AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41,
> 0xd4b, -1)
> +AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD), cortexa76, 0x41, 0xd0b, -1) AARCH64_CORE("cortex-a76ae",
> +cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76,
> +0x41, 0xd0e, -1) AARCH64_CORE("cortex-a77",  cortexa77, cortexa57,
> +V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
> +AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
> +AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16,
> +RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
> +AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
> AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1) AARCH64_CORE("cortex-
> a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS),
> cortexa73, 0x41, 0xd43, -1) -AARCH64_CORE("cortex-x1",  cortexx1,
> cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41,
> 0xd44, -1) -AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,
> (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
> -AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD,
> PROFILE), neoversen1, 0x41, 0xd0c, -1)
> +AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
> +AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC,
> +DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
> +AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD,
> +PROFILE), cortexa76, 0x41, 0xd0c, -1)
> AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC,
> DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
> AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC,
> DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)
> 
> @@ -160,7 +160,7 @@ AARCH64_CORE("cortex-a73.cortex-a53",
> cortexa73cortexa53, cortexa53, V8A,  (CRC
> /* ARM DynamIQ big.LITTLE configurations.  */
> 
> AARCH64_CORE("cortex-a75.cortex-a55",  cortexa75cortexa55, cortexa53,
> V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE
> (0xd0a, 0xd05), -1) -AARCH64_CORE("cortex-a76.cortex-a55",
> cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), neoversen1,
> 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
> +AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53,
> +V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE
> +(0xd0b, 0xd05), -1)
> 
> /* Armv8-R Architecture Processors.  */
> AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41,
> 0xd15, -1) diff --git a/gcc/config/aarch64/aarch64.cc
> b/gcc/config/aarch64/aarch64.cc index 42617ced73a..46710490a39
> 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1867,7 +1867,7 @@ static const struct tune_params
> thunderx3t110_tunings =
>   &thunderx3t110_prefetch_tune
> };
> 
> -static const struct tune_params neoversen1_tunings =
> +static const struct tune_params cortexa76_tunings =
> {
>   &cortexa76_extra_costs,
>   &generic_addrcost_table,
> @@ -1885,18 +1885,18 @@ static const struct tune_params
> neoversen1_tunings =
>   }, /* memmov_cost.  */
>   3, /* issue_rate  */
>   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /*
> fusible_ops  */
> -  "32:16", /* function_align.  */
> -  "4", /* jump_align.  */
> -  "32:16", /* loop_align.  */
> -  2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> -  1, /* fma_reassoc_width.  */
> -  2, /* vec_reassoc_width.  */
> -  2, /* min_div_recip_mul_sf.  */
> -  2, /* min_div_recip_mul_df.  */
> -  0, /* max_case_values.  */
> -  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> -  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
> +  "32:16",     /* function_align.  */
> +  "4",         /* jump_align.  */
> +  "32:16",     /* loop_align.  */
> +  2,   /* int_reassoc_width.  */
> +  4,   /* fp_reassoc_width.  */
> +  1,   /* fma_reassoc_width.  */
> +  2,   /* vec_reassoc_width.  */
> +  2,   /* min_div_recip_mul_sf.  */
> +  2,   /* min_div_recip_mul_df.  */
> +  0,   /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
> +  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
>   &generic_prefetch_tune
> };
> 
> @@ -2293,6 +2293,135 @@ static const struct tune_params
> neoverse512tvb_tunings =
>   &generic_prefetch_tune
> };
> 
> +static const struct cpu_addrcost_table neoversen1_addrcost_table = {
> +    {
> +      0, /* hi  */
> +      0, /* si  */
> +      0, /* di  */
> +      1, /* ti  */
> +    },
> +  0, /* pre_modify  */
> +  0, /* post_modify  */
> +  1, /* post_modify_ld3_st3  */
> +  1, /* post_modify_ld4_st4  */
> +  0, /* register_offset  */
> +  0, /* register_sextend  */
> +  0, /* register_zextend  */
> +  0 /* imm_offset  */
> +};
> +
> +static const struct cpu_regmove_cost neoversen1_regmove_cost = {
> +  1, /* GP2GP  */
> +  /* Avoid the use of slow int<->fp moves for spilling by setting
> +     their cost higher than memmov_cost.  */
> +  3, /* GP2FP  */
> +  2, /* FP2GP  */
> +  2 /* FP2FP  */
> +};
> +
> +static const advsimd_vec_cost neoversen1_advsimd_vector_cost = {
> +  2, /* int_stmt_cost  */
> +  2, /* fp_stmt_cost  */
> +  0, /* ld2_st2_permute_cost  */
> +  0, /* ld3_st3_permute_cost  */
> +  0, /* ld4_st4_permute_cost  */
> +  3, /* permute_cost  */
> +  6, /* reduc_i8_cost  */
> +  5, /* reduc_i16_cost  */
> +  3, /* reduc_i32_cost  */
> +  3, /* reduc_i64_cost  */
> +  8, /* reduc_f16_cost  */
> +  5, /* reduc_f32_cost  */
> +  5, /* reduc_f64_cost  */
> +  0, /* store_elt_extra_cost  */
> +  2, /* vec_to_scalar_cost  */
> +  2, /* scalar_to_vec_cost  */
> +  4, /* align_load_cost  */
> +  4, /* unalign_load_cost  */
> +  1, /* unalign_store_cost  */
> +  1  /* store_cost  */
> +};
> +
> +static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info
> += {
> +  2, /* loads_stores_per_cycle  */
> +  2, /* stores_per_cycle  */
> +  2, /* general_ops_per_cycle  */
> +  0, /* fp_simd_load_general_ops  */
> +  1 /* fp_simd_store_general_ops  */
> +};
> +
> +static const aarch64_advsimd_vec_issue_info
> +neoversen1_advsimd_issue_info = {
> +  {
> +    2, /* loads_stores_per_cycle  */
> +    2, /* stores_per_cycle  */
> +    2, /* general_ops_per_cycle  */
> +    0, /* fp_simd_load_general_ops  */
> +    1 /* fp_simd_store_general_ops  */
> +  },
> +  3, /* ld2_st2_general_ops  */
> +  5, /* ld3_st3_general_ops  */
> +  11 /* ld4_st4_general_ops  */
> +};
> +
> +static const aarch64_vec_issue_info neoversen1_vec_issue_info = {
> +  &neoversen1_scalar_issue_info, /* scalar  */
> +  &neoversen1_advsimd_issue_info, /* advsimd  */
> +  nullptr /* sve  */
> +};
> +
> +
> +static const struct cpu_vector_cost neoversen1_vector_cost = {
> +  1, /* scalar_int_stmt_cost  */
> +  1, /* scalar_fp_stmt_cost  */
> +  4, /* scalar_load_cost  */
> +  1, /* scalar_store_cost  */
> +  1, /* cond_taken_branch_cost  */
> +  1, /* cond_not_taken_branch_cost  */
> +  &neoversen1_advsimd_vector_cost, /* advsimd  */
> +  nullptr, /* sve  */
> +  &neoversen1_vec_issue_info /* issue_info  */ };
> +
> +static const struct tune_params neoversen1_tunings = {
> +  &neoversen1_extra_costs,
> +  &neoversen1_addrcost_table,
> +  &neoversen1_regmove_cost,
> +  &neoversen1_vector_cost,
> +  &generic_branch_cost,
> +  &generic_approx_modes,
> +  SVE_NOT_IMPLEMENTED, /* sve_width  */
> +  { 4, /* load_int.  */
> +    2, /* store_int.  */
> +    5, /* load_fp.  */
> +    2, /* store_fp.  */
> +    4, /* load_pred.  */
> +    4 /* store_pred.  */
> +  }, /* memmov_cost.  */
> +  4, /* issue_rate  */
> +  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
> +  "32:16", /* function_align.  */
> +  "4", /* jump_align.  */
> +  "32:16", /* loop_align.  */
> +  2, /* int_reassoc_width.  */
> +  4, /* fp_reassoc_width.  */
> +  1, /* fma_reassoc_width.  */
> +  2, /* vec_reassoc_width.  */
> +  2, /* min_div_recip_mul_sf.  */
> +  2, /* min_div_recip_mul_df.  */
> +  0, /* max_case_values.  */
> +  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
> +  AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags.  */
> +  &generic_prefetch_tune
> +};
> +
> static const advsimd_vec_cost neoversen2_advsimd_vector_cost = {
>   2, /* int_stmt_cost  */
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-
> tables.h
> index e3848214728..fce6da6bbcc 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -450,6 +450,113 @@ const struct cpu_cost_table cortexa76_extra_costs
> =
>   }
> };
> 
> +const struct cpu_cost_table neoversen1_extra_costs = {
> +  /* ALU */
> +  {
> +    0,                 /* arith.  */
> +    0,                 /* logical.  */
> +    0,                 /* shift.  */
> +    0,                 /* shift_reg.  */
> +    COSTS_N_INSNS (1), /* arith_shift.  */
> +    COSTS_N_INSNS (1), /* arith_shift_reg.  */
> +    0,       /* log_shift.  */
> +    COSTS_N_INSNS (1), /* log_shift_reg.  */
> +    0,                 /* extend.  */
> +    COSTS_N_INSNS (1), /* extend_arith.  */
> +    COSTS_N_INSNS (1), /* bfi.  */
> +    0,                 /* bfx.  */
> +    0,                 /* clz.  */
> +    0,                 /* rev.  */
> +    0,                 /* non_exec.  */
> +    true               /* non_exec_costs_exec.  */
> +  },
> +  {
> +    /* MULT SImode */
> +    {
> +      COSTS_N_INSNS (1),       /* simple.  */
> +      COSTS_N_INSNS (2),       /* flag_setting.  */
> +      COSTS_N_INSNS (1),       /* extend.  */
> +      COSTS_N_INSNS (1),       /* add.  */
> +      COSTS_N_INSNS (1),       /* extend_add.  */
> +      COSTS_N_INSNS (11)       /* idiv.  */
> +    },
> +    /* MULT DImode */
> +    {
> +      COSTS_N_INSNS (3),       /* simple.  */
> +      0,                       /* flag_setting (N/A).  */
> +      COSTS_N_INSNS (1),       /* extend.  */
> +      COSTS_N_INSNS (3),       /* add.  */
> +      COSTS_N_INSNS (1),       /* extend_add.  */
> +      COSTS_N_INSNS (19)       /* idiv.  */
> +    }
> +  },
> +  /* LD/ST */
> +  {
> +    COSTS_N_INSNS (3),         /* load.  */
> +    COSTS_N_INSNS (3),         /* load_sign_extend.  */
> +    COSTS_N_INSNS (3),         /* ldrd.  */
> +    COSTS_N_INSNS (2),         /* ldm_1st.  */
> +    1,                         /* ldm_regs_per_insn_1st.  */
> +    2,                         /* ldm_regs_per_insn_subsequent.  */
> +    COSTS_N_INSNS (4),         /* loadf.  */
> +    COSTS_N_INSNS (4),         /* loadd.  */
> +    COSTS_N_INSNS (3),         /* load_unaligned.  */
> +    0,                         /* store.  */
> +    0,                         /* strd.  */
> +    0,                         /* stm_1st.  */
> +    1,                         /* stm_regs_per_insn_1st.  */
> +    2,                         /* stm_regs_per_insn_subsequent.  */
> +    0,                         /* storef.  */
> +    0,                         /* stored.  */
> +    COSTS_N_INSNS (1),         /* store_unaligned.  */
> +    COSTS_N_INSNS (1),         /* loadv.  */
> +    COSTS_N_INSNS (1)          /* storev.  */
> +  },
> +  {
> +    /* FP SFmode */
> +    {
> +      COSTS_N_INSNS (9),       /* div.  */
> +      COSTS_N_INSNS (2),       /* mult.  */
> +      COSTS_N_INSNS (3),       /* mult_addsub.  */
> +      COSTS_N_INSNS (3),       /* fma.  */
> +      COSTS_N_INSNS (1),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      0,                       /* neg.  */
> +      0,                       /* compare.  */
> +      COSTS_N_INSNS (1),       /* widen.  */
> +      COSTS_N_INSNS (1),       /* narrow.  */
> +      COSTS_N_INSNS (1),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (1)        /* roundint.  */
> +    },
> +    /* FP DFmode */
> +    {
> +      COSTS_N_INSNS (14),      /* div.  */
> +      COSTS_N_INSNS (2),       /* mult.  */
> +      COSTS_N_INSNS (3),       /* mult_addsub.  */
> +      COSTS_N_INSNS (3),       /* fma.  */
> +      COSTS_N_INSNS (1),       /* addsub.  */
> +      COSTS_N_INSNS (1),       /* fpconst.  */
> +      0,                       /* neg.  */
> +      0,                       /* compare.  */
> +      COSTS_N_INSNS (1),       /* widen.  */
> +      COSTS_N_INSNS (1),       /* narrow.  */
> +      COSTS_N_INSNS (1),       /* toint.  */
> +      COSTS_N_INSNS (1),       /* fromint.  */
> +      COSTS_N_INSNS (1)        /* roundint.  */
> +    }
> +  },
> +  /* Vector */
> +  {
> +    COSTS_N_INSNS (1),  /* alu.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (1),  /* dup.  */
> +    COSTS_N_INSNS (1)   /* extract.  */
> +  }
> +};
> +
> const struct cpu_cost_table exynosm1_extra_costs = {
>   /* ALU */
> --
> 2.39.2 (Apple Git-143)
> 
> 
> 
> 
> --
> Evandro Menezes ◊ evandro@yahoo.com ◊ Austin, TX
> Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus
  
Evandro Menezes April 24, 2023, 10:48 p.m. UTC | #2
Hi, Tamar.

Does this work?

Thank you,
  
Evandro Menezes April 24, 2023, 10:51 p.m. UTC | #3
Sorry, but it seems that, before sending, the email client is stripping leading spaces.  I’m attaching the file here.
  
Tamar Christina April 25, 2023, 10:03 a.m. UTC | #4
Thanks Evandro,

That one works.  I’ll run the new cost model and sched modules through a number of workloads and come back with the results.

Cheers,
Tamar

From: Evandro Menezes <ebahapo@icloud.com>
Sent: Monday, April 24, 2023 11:52 PM
To: Evandro Menezes <ebahapo@icloud.com>
Cc: Tamar Christina <Tamar.Christina@arm.com>; evandro+gcc-patches@gcc.gnu.org; gcc-patches@gcc.gnu.org; Richard Sandiford <Richard.Sandiford@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: Re: [PATCH] aarch64: Add the cost model for Neoverse N1

Sorry, but it seems that, before sending, the email client is stripping leading spaces.  I’m attaching the file here.

--
Evandro Menezes ◊ evandro@yahoo.com<mailto:evandro@yahoo.com> ◊ Austin, TX
Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus


Em 24 de abr. de 2023, à(s) 17:48, Evandro Menezes <ebahapo@icloud.com<mailto:ebahapo@icloud.com>> escreveu:

Hi, Tamar.

Does this work?

Thank you,

--
Evandro Menezes ◊ evandro@yahoo.com<mailto:evandro@yahoo.com> ◊ Austin, TX
Άγιος ο Θεός ⁂ ܩܕܝܫܐ ܐܢ̱ܬ ܠܐ ܡܝܘܬܐ ⁂ Sanctus Deus


Em 24 de abr. de 2023, à(s) 12:37, Tamar Christina <tamar.christina@arm.com<mailto:tamar.christina@arm.com>> escreveu:

Hi Evandro,

I wanted to give this patch a try, but the diff seems corrupt: the whitespace at the start of the context lines seems to have gone missing.

Could you try resending it?

Thanks,
Tamar
  

Patch

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 2ec88c98400..e352e4077b1 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -105,17 +105,17 @@  AARCH64_CORE("thunderx2t99",  thunderx2t99,  thunderx2t99, V8_1A,  (CRYPTO), thu
/* ARM ('A') cores. */
AARCH64_CORE("cortex-a55",  cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa53, 0x41, 0xd05, -1)
AARCH64_CORE("cortex-a75",  cortexa75, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, 0xd0a, -1)
-AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), neoversen1, 0x41, 0xd0b, -1)
-AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0e, -1)
-AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), neoversen1, 0x41, 0xd0d, -1)
-AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd41, -1)
-AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd42, -1)
-AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), neoversen1, 0x41, 0xd4b, -1)
+AARCH64_CORE("cortex-a76",  cortexa76, cortexa57, V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, 0xd0b, -1)
+AARCH64_CORE("cortex-a76ae",  cortexa76ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0e, -1)
+AARCH64_CORE("cortex-a77",  cortexa77, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa76, 0x41, 0xd0d, -1)
+AARCH64_CORE("cortex-a78",  cortexa78, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd41, -1)
+AARCH64_CORE("cortex-a78ae",  cortexa78ae, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd42, -1)
+AARCH64_CORE("cortex-a78c",  cortexa78c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, FLAGM, PAUTH), cortexa76, 0x41, 0xd4b, -1)
AARCH64_CORE("cortex-a65",  cortexa65, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd06, -1)
AARCH64_CORE("cortex-a65ae",  cortexa65ae, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd43, -1)
-AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), neoversen1, 0x41, 0xd44, -1)
-AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), neoversen1, 0x41, 0xd4c, -1)
-AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
+AARCH64_CORE("cortex-x1",  cortexx1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE), cortexa76, 0x41, 0xd44, -1)
+AARCH64_CORE("cortex-x1c",  cortexx1c, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, SSBS, PROFILE, PAUTH), cortexa76, 0x41, 0xd4c, -1)
+AARCH64_CORE("ares",  ares, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), cortexa76, 0x41, 0xd0c, -1)
AARCH64_CORE("neoverse-n1",  neoversen1, cortexa57, V8_2A,  (F16, RCPC, DOTPROD, PROFILE), neoversen1, 0x41, 0xd0c, -1)
AARCH64_CORE("neoverse-e1",  neoversee1, cortexa53, V8_2A,  (F16, RCPC, DOTPROD, SSBS), cortexa73, 0x41, 0xd4a, -1)

@@ -160,7 +160,7 @@  AARCH64_CORE("cortex-a73.cortex-a53",  cortexa73cortexa53, cortexa53, V8A,  (CRC
/* ARM DynamIQ big.LITTLE configurations.  */

AARCH64_CORE("cortex-a75.cortex-a55",  cortexa75cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa73, 0x41, AARCH64_BIG_LITTLE (0xd0a, 0xd05), -1)
-AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), neoversen1, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)
+AARCH64_CORE("cortex-a76.cortex-a55",  cortexa76cortexa55, cortexa53, V8_2A,  (F16, RCPC, DOTPROD), cortexa76, 0x41, AARCH64_BIG_LITTLE (0xd0b, 0xd05), -1)

/* Armv8-R Architecture Processors.  */
AARCH64_CORE("cortex-r82", cortexr82, cortexa53, V8R, (), cortexa53, 0x41, 0xd15, -1)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 42617ced73a..46710490a39 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1867,7 +1867,7 @@  static const struct tune_params thunderx3t110_tunings =
  &thunderx3t110_prefetch_tune
};

-static const struct tune_params neoversen1_tunings =
+static const struct tune_params cortexa76_tunings =
{
  &cortexa76_extra_costs,
  &generic_addrcost_table,
@@ -1885,18 +1885,18 @@  static const struct tune_params neoversen1_tunings =
  }, /* memmov_cost.  */
  3, /* issue_rate  */
  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
-  "32:16", /* function_align.  */
-  "4", /* jump_align.  */
-  "32:16", /* loop_align.  */
-  2, /* int_reassoc_width.  */
-  4, /* fp_reassoc_width.  */
-  1, /* fma_reassoc_width.  */
-  2, /* vec_reassoc_width.  */
-  2, /* min_div_recip_mul_sf.  */
-  2, /* min_div_recip_mul_df.  */
-  0, /* max_case_values.  */
-  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags.  */
+  "32:16",     /* function_align.  */
+  "4",         /* jump_align.  */
+  "32:16",     /* loop_align.  */
+  2,   /* int_reassoc_width.  */
+  4,   /* fp_reassoc_width.  */
+  1,   /* fma_reassoc_width.  */
+  2,   /* vec_reassoc_width.  */
+  2,   /* min_div_recip_mul_sf.  */
+  2,   /* min_div_recip_mul_df.  */
+  0,   /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),     /* tune_flags.  */
  &generic_prefetch_tune
};

@@ -2293,6 +2293,135 @@  static const struct tune_params neoverse512tvb_tunings =
  &generic_prefetch_tune
};

+static const struct cpu_addrcost_table neoversen1_addrcost_table =
+{
+    {
+      0, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  1, /* post_modify_ld3_st3  */
+  1, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0 /* imm_offset  */
+};
+
+static const struct cpu_regmove_cost neoversen1_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  3, /* GP2FP  */
+  2, /* FP2GP  */
+  2 /* FP2FP  */
+};
+
+static const advsimd_vec_cost neoversen1_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  0, /* ld2_st2_permute_cost  */
+  0, /* ld3_st3_permute_cost  */
+  0, /* ld4_st4_permute_cost  */
+  3, /* permute_cost  */
+  6, /* reduc_i8_cost  */
+  5, /* reduc_i16_cost  */
+  3, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  8, /* reduc_f16_cost  */
+  5, /* reduc_f32_cost  */
+  5, /* reduc_f64_cost  */
+  0, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  2, /* scalar_to_vec_cost  */
+  4, /* align_load_cost  */
+  4, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info neoversen1_scalar_issue_info =
+{
+  2, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  2, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info neoversen1_advsimd_issue_info =
+{
+  {
+    2, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    2, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  3, /* ld2_st2_general_ops  */
+  5, /* ld3_st3_general_ops  */
+  11 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_vec_issue_info neoversen1_vec_issue_info =
+{
+  &neoversen1_scalar_issue_info, /* scalar  */
+  &neoversen1_advsimd_issue_info, /* advsimd  */
+  nullptr /* sve  */
+};
+
+
+static const struct cpu_vector_cost neoversen1_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &neoversen1_advsimd_vector_cost, /* advsimd  */
+  nullptr, /* sve  */
+  &neoversen1_vec_issue_info /* issue_info  */
+};
+
+static const struct tune_params neoversen1_tunings =
+{
+  &neoversen1_extra_costs,
+  &neoversen1_addrcost_table,
+  &neoversen1_regmove_cost,
+  &neoversen1_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  { 4, /* load_int.  */
+    2, /* store_int.  */
+    5, /* load_fp.  */
+    2, /* store_fp.  */
+    4, /* load_pred.  */
+    4 /* store_pred.  */
+  }, /* memmov_cost.  */
+  4, /* issue_rate  */
+  AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
+  "32:16", /* function_align.  */
+  "4", /* jump_align.  */
+  "32:16", /* loop_align.  */
+  2, /* int_reassoc_width.  */
+  4, /* fp_reassoc_width.  */
+  1, /* fma_reassoc_width.  */
+  2, /* vec_reassoc_width.  */
+  2, /* min_div_recip_mul_sf.  */
+  2, /* min_div_recip_mul_df.  */
+  0, /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
+  AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, /* tune_flags.  */
+  &generic_prefetch_tune
+};
+
static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
{
  2, /* int_stmt_cost  */
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index e3848214728..fce6da6bbcc 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -450,6 +450,113 @@  const struct cpu_cost_table cortexa76_extra_costs =
  }
};

+const struct cpu_cost_table neoversen1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    COSTS_N_INSNS (1), /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,       /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    COSTS_N_INSNS (1), /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (1),       /* simple.  */
+      COSTS_N_INSNS (2),       /* flag_setting.  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (1),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (11)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (1),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (1),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (3),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (3),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (9),       /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (14),      /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (3),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      0,                       /* compare.  */
+      COSTS_N_INSNS (1),       /* widen.  */
+      COSTS_N_INSNS (1),       /* narrow.  */
+      COSTS_N_INSNS (1),       /* toint.  */
+      COSTS_N_INSNS (1),       /* fromint.  */
+      COSTS_N_INSNS (1)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1),  /* alu.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
+  }
+};
+
const struct cpu_cost_table exynosm1_extra_costs =
{
  /* ALU */