[05/07] RISC-V: Add auto-vectorization support

Message ID d2107aec-938f-0581-244c-4c08ee08190e@rivosinc.com
State Not Applicable
Headers
Series RISC-V: Add auto-vectorization support |

Checks

Context Check Description
snail/gcc-patch-check fail Git am fail log

Commit Message

Michael Collison March 3, 2023, 4:53 a.m. UTC
  This patch registers the target hooks needed for basic
auto-vectorization support and adds basic tuning information for the
vector extension.

gcc/ChangeLog:

     * config/riscv/riscv-cores.def (RISCV_TUNE):
     Add VECTOR_TUNE_INFO parameter and pass generic_rvv_tune_info
     for every existing tune.
     * common/config/riscv/riscv-common.cc (RISCV_TUNE):
     Add VECTOR_TUNE_INFO parameter.
     * config/riscv/riscv.cc (riscv_vector_tune_param):
     New struct for vector tuning information.
     (riscv_tune_info): Add vector_tune_param.
     (vector_tune_param): New static variable.
     (riscv_vectorization_factor): New variable.
     (generic_rvv_insn_scale_table): New struct.
     (generic_rvv_stmt_scale_table): New struct.
     (generic_rvv_insn_cost_table): New vector insn cost table.
     (generic_rvv_stmt_cost_table): New vector statement cost table.
     (generic_rvv_tune_info): New rvv tuning table.
     (RISCV_TUNE): Add VECTOR_TUNE_INFO parameter.
     (riscv_rtx_costs): Return vector estimate if vector mode.
     (riscv_option_override): Set vector_tune_param.
     (riscv_option_override): Set riscv_vectorization_factor.
     (riscv_estimated_poly_value): Implement
     TARGET_ESTIMATED_POLY_VALUE.
     (riscv_preferred_simd_mode): Implement
     TARGET_VECTORIZE_PREFERRED_SIMD_MODE.
     (riscv_autovectorize_vector_modes): Implement
     TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES.
     (riscv_get_mask_mode): Implement TARGET_VECTORIZE_GET_MASK_MODE.
     (riscv_empty_mask_is_expensive): Implement
     TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.
     (riscv_builtin_vectorization_cost): Implement
     TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.
     (riscv_vectorize_create_costs): Implement
     TARGET_VECTORIZE_CREATE_COSTS.
     (TARGET_ESTIMATED_POLY_VALUE): Register target macro.
     (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): Ditto.
     (TARGET_VECTORIZE_PREFERRED_SIMD_MODE): Ditto.
     (TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES): Ditto.
     (TARGET_VECTORIZE_GET_MASK_MODE): Ditto.
     (TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE): Ditto.
     (TARGET_VECTORIZE_LOOP_LEN_OVERRIDE_MASK): Ditto.
     (TARGET_VECTORIZE_CREATE_COSTS): Ditto.

---
  gcc/common/config/riscv/riscv-common.cc |   2 +-
  gcc/config/riscv/riscv-cores.def        |  14 +-
  gcc/config/riscv/riscv.cc               | 321 +++++++++++++++++++++++-
  3 files changed, 325 insertions(+), 12 deletions(-)

  static tree riscv_handle_type_attribute (tree *, tree, tree, int, bool *);

@@ -403,8 +469,8 @@ static const unsigned gpr_save_reg_order[] = {

  /* A table describing all the processors GCC knows about.  */
  static const struct riscv_tune_info riscv_tune_info_table[] = {
-#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO)    \
-  { TUNE_NAME, PIPELINE_MODEL, & TUNE_INFO},
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO, 
VECTOR_TUNE_INFO)    \
+  { TUNE_NAME, PIPELINE_MODEL, & TUNE_INFO, &VECTOR_TUNE_INFO},
  #include "riscv-cores.def"
  };

@@ -2237,8 +2303,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
       Cost Model need to be well analyzed and supported in the future. */
    if (riscv_v_ext_vector_mode_p (mode))
      {
-      *total = COSTS_N_INSNS (1);
-      return true;
+      return vector_tune_param->rvv_insn_costs_table->get_cost (x, 
mode, total, speed);
      }

    bool float_mode_p = FLOAT_MODE_P (mode);
@@ -6079,6 +6144,7 @@ riscv_option_override (void)
                 RISCV_TUNE_STRING_DEFAULT));
    riscv_microarchitecture = cpu->microarchitecture;
    tune_param = optimize_size ? &optimize_size_tune_info : cpu->tune_param;
+  vector_tune_param = cpu->vector_tune_param;

    /* Use -mtune's setting for slow_unaligned_access, even when optimizing
       for size.  For architectures that trap and emulate unaligned 
accesses,
@@ -6198,6 +6264,10 @@ riscv_option_override (void)

    /* Convert -march to a chunks count.  */
    riscv_vector_chunks = riscv_convert_vector_bits ();
+
+  if (TARGET_VECTOR)
+    riscv_vectorization_factor = riscv_vector_lmul;
+
  }

  /* Implement TARGET_CONDITIONAL_REGISTER_USAGE.  */
@@ -6892,6 +6962,218 @@ riscv_dwarf_poly_indeterminate_value (unsigned 
int i, unsigned int *factor,
    return RISCV_DWARF_VLENB;
  }

+/* Implement TARGET_ESTIMATED_POLY_VALUE.
+   Derive the estimate from BITS_PER_RISCV_VECTOR.
+   KIND specifies the type of requested estimate: min, max or likely.
+   When the RVV width is a single known value all three estimates are the
+   same.  When it is scalable we want to distinguish the maximum estimate
+   from the minimum and likely ones.
+   The likely estimate is the same as the minimum in that case to give a
+   conservative behavior of auto-vectorizing with RVV when it is a win
+   even for 128-bit RVV.
+   When RVV width information is available VAL.coeffs[1] is multiplied by
+   the number of 128-bit chunks beyond the first 128 bits.  */
+
+static HOST_WIDE_INT
+riscv_estimated_poly_value (poly_int64 val,
+                            poly_value_estimate_kind kind = 
POLY_VALUE_LIKELY)
+{
+  unsigned int width_source =
+      BITS_PER_RISCV_VECTOR.is_constant ()
+          ? (unsigned int)BITS_PER_RISCV_VECTOR.to_constant ()
+          : (unsigned int)RVV_SCALABLE;
+
+  /* If there is no core-specific information then the minimum and likely
+     values are based on 128-bit vectors and the maximum is based on
+     an assumed upper bound of 2048 bits (sixteen 128-bit chunks).  */
+  if (width_source == RVV_SCALABLE)
+    switch (kind)
+      {
+      case POLY_VALUE_MIN:
+      case POLY_VALUE_LIKELY:
+        return val.coeffs[0];
+
+      case POLY_VALUE_MAX:
+        return val.coeffs[0] + val.coeffs[1] * 15;
+      }
+
+  /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different vector lengths,
+     treating the lowest as likely.  This could be made more general if
+     future -mtune options need it to be.  */
+  if (kind == POLY_VALUE_MAX)
+    width_source = 1 << floor_log2 (width_source);
+  else
+    width_source = least_bit_hwi (width_source);
+
+  /* If the core provides width information, use that.  */
+  HOST_WIDE_INT over_128 = width_source - 128;
+  return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
+}
+
+/* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE.  */
+
+static machine_mode
+riscv_preferred_simd_mode (scalar_mode mode)
+{
+  machine_mode vmode =
+    riscv_vector::riscv_vector_preferred_simd_mode (mode, 
riscv_vectorization_factor);
+  if (VECTOR_MODE_P (vmode))
+    return vmode;
+
+  return word_mode;
+}
+
+/* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES for RVV.  */
+static unsigned int
+riscv_autovectorize_vector_modes (vector_modes *modes, bool)
+{
+  if (!TARGET_VECTOR)
+    return 0;
+
+  if (riscv_vectorization_factor == RVV_LMUL1)
+    {
+      modes->safe_push (VNx16QImode);
+      modes->safe_push (VNx8QImode);
+      modes->safe_push (VNx4QImode);
+      modes->safe_push (VNx2QImode);
+    }
+  else if (riscv_vectorization_factor == RVV_LMUL2)
+    {
+      modes->safe_push (VNx32QImode);
+      modes->safe_push (VNx16QImode);
+      modes->safe_push (VNx8QImode);
+      modes->safe_push (VNx4QImode);
+    }
+  else if (riscv_vectorization_factor == RVV_LMUL4)
+    {
+      modes->safe_push (VNx64QImode);
+      modes->safe_push (VNx32QImode);
+      modes->safe_push (VNx16QImode);
+      modes->safe_push (VNx8QImode);
+    }
+  else
+    {
+      modes->safe_push (VNx64QImode);
+      modes->safe_push (VNx32QImode);
+      modes->safe_push (VNx16QImode);
+    }
+
+  return 0;
+}
+
+/* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
+
+static opt_machine_mode
+riscv_get_mask_mode (machine_mode mode)
+{
+  machine_mode mask_mode = VOIDmode;
+  if (TARGET_VECTOR &&
+      riscv_vector::riscv_vector_get_mask_mode (mode).exists (&mask_mode))
+    return mask_mode;
+
+  return default_get_mask_mode (mode);
+}
+
+/* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
+   it isn't worth branching around empty masked ops (including masked
+   stores).  */
+
+static bool
+riscv_empty_mask_is_expensive (unsigned)
+{
+  return false;
+}
+
+/* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST.  */
+int
+riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+                                  tree vectype, int misalign 
ATTRIBUTE_UNUSED)
+{
+  unsigned elements;
+  bool fp = false;
+  rtx x = NULL_RTX;
+  machine_mode mode = VOIDmode;
+
+  if (vectype != NULL)
+    {
+      fp = FLOAT_TYPE_P (vectype);
+      mode = TYPE_MODE (vectype);
+    }
+
+  switch (type_of_cost)
+    {
+    case scalar_stmt:
+      return fp ? 
vector_tune_param->rvv_stmt_costs_table->scalar_fp->cost (
+                      x, mode)
+                : 
vector_tune_param->rvv_stmt_costs_table->scalar_int->cost (
+                      x, mode);
+
+    case scalar_load:
+      return vector_tune_param->rvv_stmt_costs_table->scalar_load->cost (x,
+ mode);
+
+    case scalar_store:
+      return 
vector_tune_param->rvv_stmt_costs_table->scalar_store->cost (x,
+ mode);
+
+    case vector_stmt:
+      return fp ? vector_tune_param->rvv_stmt_costs_table->vec_fp->cost (x,
+ mode)
+                : 
vector_tune_param->rvv_stmt_costs_table->vec_int->cost (x,
+ mode);
+
+    case vector_load:
+      return 
vector_tune_param->rvv_stmt_costs_table->vec_align_load->cost (
+          x, mode);
+
+    case vector_store:
+      return vector_tune_param->rvv_stmt_costs_table->vec_store->cost 
(x, mode);
+
+    case vec_to_scalar:
+      return vector_tune_param->rvv_stmt_costs_table->vec_to_scalar->cost (
+          x, mode);
+
+    case scalar_to_vec:
+      return vector_tune_param->rvv_stmt_costs_table->scalar_to_vec->cost (
+          x, mode);
+
+    case unaligned_load:
+    case vector_gather_load:
+      return 
vector_tune_param->rvv_stmt_costs_table->vec_unalign_load->cost (
+          x, mode);
+
+    case unaligned_store:
+    case vector_scatter_store:
+      return 
vector_tune_param->rvv_stmt_costs_table->vec_unalign_store->cost (
+          x, mode);
+
+    case cond_branch_taken:
+      return 
vector_tune_param->rvv_stmt_costs_table->cond_taken_branch->cost (
+          x, mode);
+
+    case cond_branch_not_taken:
+      return vector_tune_param->rvv_stmt_costs_table->cond_not_taken_branch
+          ->cost (x, mode);
+
+    case vec_perm:
+      return vector_tune_param->rvv_stmt_costs_table->vec_permute->cost (x,
+ mode);
+
+    case vec_promote_demote:
+      return fp ? vector_tune_param->rvv_stmt_costs_table->vec_fp->cost (x,
+ mode)
+                : 
vector_tune_param->rvv_stmt_costs_table->vec_int->cost (x,
+ mode);
+
+    case vec_construct:
+      elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
+      return elements / 2 + 1;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
  /* Return true if a shift-amount matches the trailing cleared bits on
     a bitmask.  */

@@ -6901,6 +7183,13 @@ riscv_shamt_matches_mask_p (int shamt, 
HOST_WIDE_INT mask)
    return shamt == ctz_hwi (mask);
  }

+/* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
+vector_costs *
+riscv_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
+{
+  return new riscv_vector_costs (vinfo, costing_for_scalar);
+}
+
  /* Initialize the GCC target structure.  */
  #undef TARGET_ASM_ALIGNED_HI_OP
  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7143,6 +7432,30 @@ riscv_shamt_matches_mask_p (int shamt, 
HOST_WIDE_INT mask)
  #undef TARGET_VERIFY_TYPE_CONTEXT
  #define TARGET_VERIFY_TYPE_CONTEXT riscv_verify_type_context

+#undef TARGET_ESTIMATED_POLY_VALUE
+#define TARGET_ESTIMATED_POLY_VALUE riscv_estimated_poly_value
+
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 
riscv_builtin_vectorization_cost
+
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
+
+#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
+#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES 
riscv_autovectorize_vector_modes
+
+#undef TARGET_VECTORIZE_GET_MASK_MODE
+#define TARGET_VECTORIZE_GET_MASK_MODE riscv_get_mask_mode
+
+#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
+#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE 
riscv_empty_mask_is_expensive
+
+#undef TARGET_VECTORIZE_LOOP_LEN_OVERRIDE_MASK
+#define TARGET_VECTORIZE_LOOP_LEN_OVERRIDE_MASK 
riscv_loop_len_override_mask
+
+#undef TARGET_VECTORIZE_CREATE_COSTS
+#define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs
+
  #undef TARGET_VECTOR_ALIGNMENT
  #define TARGET_VECTOR_ALIGNMENT riscv_vector_alignment
  

Patch

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index ebc1ed7d7e4..6b8d92af986 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -246,7 +246,7 @@  static const riscv_cpu_info riscv_cpu_tables[] =

  static const char *riscv_tunes[] =
  {
-#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO) \
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO, 
VECTOR_TUNE_INFO)    \
      TUNE_NAME,
  #include "../../../config/riscv/riscv-cores.def"
      NULL
diff --git a/gcc/config/riscv/riscv-cores.def 
b/gcc/config/riscv/riscv-cores.def
index 2a834cae21d..4feb0366222 100644
--- a/gcc/config/riscv/riscv-cores.def
+++ b/gcc/config/riscv/riscv-cores.def
@@ -30,15 +30,15 @@ 
     identifier, reference to riscv.cc.  */

  #ifndef RISCV_TUNE
-#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO)
+#define RISCV_TUNE(TUNE_NAME, PIPELINE_MODEL, TUNE_INFO, VECTOR_TUNE_INFO)
  #endif

-RISCV_TUNE("rocket", generic, rocket_tune_info)
-RISCV_TUNE("sifive-3-series", generic, rocket_tune_info)
-RISCV_TUNE("sifive-5-series", generic, rocket_tune_info)
-RISCV_TUNE("sifive-7-series", sifive_7, sifive_7_tune_info)
-RISCV_TUNE("thead-c906", generic, thead_c906_tune_info)
-RISCV_TUNE("size", generic, optimize_size_tune_info)
+RISCV_TUNE("rocket", generic, rocket_tune_info, generic_rvv_tune_info)
+RISCV_TUNE("sifive-3-series", generic, rocket_tune_info, 
generic_rvv_tune_info)
+RISCV_TUNE("sifive-5-series", generic, rocket_tune_info, 
generic_rvv_tune_info)
+RISCV_TUNE("sifive-7-series", sifive_7, sifive_7_tune_info, 
generic_rvv_tune_info)
+RISCV_TUNE("thead-c906", generic, thead_c906_tune_info, 
generic_rvv_tune_info)
+RISCV_TUNE("size", generic, optimize_size_tune_info, generic_rvv_tune_info)

  #undef RISCV_TUNE

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index f11b7949a49..16b38ba4d76 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -60,6 +60,16 @@  along with GCC; see the file COPYING3.  If not see
  #include "opts.h"
  #include "tm-constrs.h"
  #include "rtl-iter.h"
+#include "gimple.h"
+#include "cfghooks.h"
+#include "cfgloop.h"
+#include "cfgrtl.h"
+#include "sel-sched.h"
+#include "fold-const.h"
+#include "gimple-iterator.h"
+#include "gimple-expr.h"
+#include "tree-vectorizer.h"
+#include "riscv-vector-cost.h"

  /* This file should be included last.  */
  #include "target-def.h"
@@ -238,6 +248,12 @@  struct riscv_tune_param
    bool slow_unaligned_access;
  };

+/* Cost tables for vector insns and vector statements.  */
+struct riscv_vector_tune_param {
+    const vector_insn_cost_table* rvv_insn_costs_table;
+    const vector_stmt_cost_table* rvv_stmt_costs_table;
+};
+
  /* Information about one micro-arch we know about.  */
  struct riscv_tune_info {
    /* This micro-arch canonical name.  */
@@ -248,6 +264,9 @@  struct riscv_tune_info {

    /* Tuning parameters for this micro-arch.  */
    const struct riscv_tune_param *tune_param;
+
+  /* Tuning vector parameters for this micro-arch.  */
+  const struct riscv_vector_tune_param *vector_tune_param;
  };

  /* Global variables for machine-dependent things.  */
@@ -266,6 +285,9 @@  static int epilogue_cfa_sp_offset;
  /* Which tuning parameters to use.  */
  static const struct riscv_tune_param *tune_param;

+/* Which vector tuning parameters to use.  */
+static const struct riscv_vector_tune_param *vector_tune_param;
+
  /* Which automaton to use for tuning.  */
  enum riscv_microarchitecture_type riscv_microarchitecture;

@@ -275,6 +297,9 @@  poly_uint16 riscv_vector_chunks;
  /* The number of bytes in a vector chunk.  */
  unsigned riscv_bytes_per_vector_chunk;

+/* Preferred LMUL (vectorization factor) for the auto-vectorizer.  */
+unsigned riscv_vectorization_factor;
+
  /* Index R is the smallest register class that contains register R.  */
  const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
    GR_REGS,    GR_REGS,    GR_REGS,    GR_REGS,
@@ -367,6 +392,47 @@  static const struct riscv_tune_param 
optimize_size_tune_info = {
    false,                    /* slow_unaligned_access */
  };

+static const vector_insn_scale_table generic_rvv_insn_scale_table = {
+    4, /*load*/
+    1, /*store*/
+    1, /*alu*/
+    1, /*mult*/
+    1, /*movi*/
+    1, /*dup*/
+    1, /*extract*/
+    1, /*if_then_else*/
+};
+
+static const vector_stmt_scale_table generic_rvv_stmt_scale_table = {
+    1, /* scalar_int_stmt_cost  */
+    1, /* scalar_fp_stmt_cost  */
+    1, /* scalar_load_cost  */
+    1, /* scalar_store_cost  */
+    1, /* vec_int_stmt_cost  */
+    1, /* vec_fp_stmt_cost  */
+    1, /* vec_permute_cost  */
+    1, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* vec_align_load_cost  */
+    1, /* vec_unalign_load_cost  */
+    1, /* vec_unalign_store_cost  */
+    1, /* vec_store_cost  */
+    1, /* cond_taken_branch_cost  */
+    1 /* cond_not_taken_branch_cost  */
+};
+
+static const vector_insn_cost_table* generic_rvv_insn_cost_table =
+            new vector_insn_cost_table(&generic_rvv_insn_scale_table);
+
+static const vector_stmt_cost_table* generic_rvv_stmt_cost_table =
+            new vector_stmt_cost_table(&generic_rvv_stmt_scale_table);
+
+/* Costs to use when optimizing for riscv vector.  */
+static const struct riscv_vector_tune_param generic_rvv_tune_info = {
+  generic_rvv_insn_cost_table,
+  generic_rvv_stmt_cost_table
+};
+
  static tree riscv_handle_fndecl_attribute (tree *, tree, tree, int, 
bool *);