diff mbox series

RISC-V: Add RVV builtin vectorization cost model

Message ID	20231214032343.124505-1-juzhe.zhong@rivai.ai
State	Unresolved
Headers	Received-SPF: pass (google.com: domain of gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org designates 8.43.85.97 as permitted sender) client-ip=8.43.85.97; DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org EFD6E385C019 From: Juzhe-Zhong <juzhe.zhong@rivai.ai> To: gcc-patches@gcc.gnu.org Cc: kito.cheng@gmail.com, kito.cheng@sifive.com, jeffreyalaw@gmail.com, rdapp.gcc@gmail.com, Juzhe-Zhong <juzhe.zhong@rivai.ai> Subject: [PATCH] RISC-V: Add RVV builtin vectorization cost model Date: Thu, 14 Dec 2023 11:23:43 +0800 Message-Id: <20231214032343.124505-1-juzhe.zhong@rivai.ai> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Feedback-ID: bizesmtp:rivai.ai:qybglogicsvrgz:qybglogicsvrgz7a-one-0 Precedence: list Errors-To: gcc-patches-bounces+ouuuleilei=gmail.com@gcc.gnu.org X-getmail-retrieved-from-mailbox: INBOX
Series	RISC-V: Add RVV builtin vectorization cost model \| RISC-V: Add RVV builtin vectorization cost model

Checks

Context	Check	Description
snail/gcc-patch-check	warning	Git am fail log

Commit Message

juzhe.zhong@rivai.ai Dec. 14, 2023, 3:23 a.m. UTC

  This patch fixes PR111153:

        ble     a1,zero,.L8
        addiw   a5,a1,-1
        li      a4,4
        addi    sp,sp,-16
        mv      a2,a0
        sext.w  a3,a1
        bleu    a5,a4,.L9
        srliw   a4,a3,2
        slli    a4,a4,4
        mv      a5,a0
        add     a4,a4,a0
        vsetivli        zero,4,e32,m1,ta,ma
        vmv.v.i v1,0
        vse32.v v1,0(sp)
.L4:
        vle32.v v1,0(a5) ---> This loop always processes 4 elements, which is fine for VLEN = 128 bits but wastes a huge amount of computation units when VLEN > 128 bits
        vle32.v v2,0(sp)
        addi    a5,a5,16
        vadd.vv v1,v2,v1
        vse32.v v1,0(sp)
        bne     a4,a5,.L4
        ld      a5,0(sp)
        lw      a4,0(sp)
        andi    a1,a1,-4
        srai    a5,a5,32
        addw    a5,a4,a5
        lw      a4,8(sp)
        addw    a5,a5,a4
        ld      a4,8(sp)
        srai    a4,a4,32
        addw    a0,a5,a4
        beq     a3,a1,.L15
.L3:
        subw    a3,a3,a1
        slli    a5,a1,32
        slli    a3,a3,32
        srli    a3,a3,32
        srli    a5,a5,30
        add     a2,a2,a5
        vsetvli a5,a3,e8,mf4,tu,mu
        vsetvli a4,zero,e32,m1,ta,ma
        sub     a1,a3,a5
        vmv.v.i v1,0
        vsetvli zero,a3,e32,m1,tu,ma
        vle32.v v2,0(a2)
        vmv.v.v v1,v2
        bne     a3,a5,.L21
.L7:
        vsetvli a4,zero,e32,m1,ta,ma
        vmv.s.x v2,zero
        vredsum.vs      v1,v1,v2
        vmv.x.s a5,v1
        addw    a0,a0,a5
.L15:
        addi    sp,sp,16
        jr      ra
.L21:
        slli    a5,a5,2
        add     a2,a2,a5
        vsetvli zero,a1,e32,m1,tu,ma
        vle32.v v2,0(a2)
        vadd.vv v1,v1,v2
        j       .L7
.L8:
        li      a0,0
        ret
.L9:
        li      a1,0
        li      a0,0
        j       .L3

The root cause is that the RVV builtin vectorization cost model was missing.

After this patch:

	ble	a1,zero,.L4
	vsetvli	a5,zero,e32,m1,ta,ma
	vmv.v.i	v1,0
.L3:
	vsetvli	a5,a1,e32,m1,tu,ma
	vle32.v	v2,0(a0)
	slli	a4,a5,2
	sub	a1,a1,a5
	add	a0,a0,a4
	vadd.vv	v1,v2,v1
	bne	a1,zero,.L3
	li	a5,0
	vsetivli	zero,1,e32,m1,ta,ma
	vmv.s.x	v2,a5
	vsetvli	a5,zero,e32,m1,ta,ma
	vredsum.vs	v1,v1,v2
	vmv.x.s	a0,v1
	ret
.L4:
	li	a0,0
	ret

	PR target/111153

gcc/ChangeLog:

	* config/riscv/riscv-protos.h (struct common_vector_cost): New struct.
	(struct scalable_vector_cost): Ditto.
	(struct cpu_vector_cost): Ditto.
	* config/riscv/riscv-vector-costs.cc (costs::add_stmt_cost): Add RVV builtin vectorization cost.
	* config/riscv/riscv.cc (struct riscv_tune_param): Ditto.
	(get_common_costs): New function.
	(riscv_builtin_vectorization_cost): Ditto.
	(TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New target hook.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/pr111153.c: New test.

---
 gcc/config/riscv/riscv-protos.h               |  76 ++++++++++
 gcc/config/riscv/riscv-vector-costs.cc        |   5 +-
 gcc/config/riscv/riscv.cc                     | 143 ++++++++++++++++++
 .../vect/costmodel/riscv/rvv/pr111153.c       |  18 +++
 4 files changed, 239 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c

Comments

Kito Cheng Dec. 14, 2023, 6:44 a.m. UTC | #1

LGTM

Juzhe-Zhong <juzhe.zhong@rivai.ai> 於 2023年12月14日 週四 11:24 寫道：

> This patch fixes PR11153:
>
>         ble     a1,zero,.L8
>         addiw   a5,a1,-1
>         li      a4,4
>         addi    sp,sp,-16
>         mv      a2,a0
>         sext.w  a3,a1
>         bleu    a5,a4,.L9
>         srliw   a4,a3,2
>         slli    a4,a4,4
>         mv      a5,a0
>         add     a4,a4,a0
>         vsetivli        zero,4,e32,m1,ta,ma
>         vmv.v.i v1,0
>         vse32.v v1,0(sp)
> .L4:
>         vle32.v v1,0(a5) ---> This loop always processes 4 elements which
> is ok for VLEN = 128bits, but waste a huge amount of computation units when
> VLEN > 128bits
>         vle32.v v2,0(sp)
>         addi    a5,a5,16
>         vadd.vv v1,v2,v1
>         vse32.v v1,0(sp)
>         bne     a4,a5,.L4
>         ld      a5,0(sp)
>         lw      a4,0(sp)
>         andi    a1,a1,-4
>         srai    a5,a5,32
>         addw    a5,a4,a5
>         lw      a4,8(sp)
>         addw    a5,a5,a4
>         ld      a4,8(sp)
>         srai    a4,a4,32
>         addw    a0,a5,a4
>         beq     a3,a1,.L15
> .L3:
>         subw    a3,a3,a1
>         slli    a5,a1,32
>         slli    a3,a3,32
>         srli    a3,a3,32
>         srli    a5,a5,30
>         add     a2,a2,a5
>         vsetvli a5,a3,e8,mf4,tu,mu
>         vsetvli a4,zero,e32,m1,ta,ma
>         sub     a1,a3,a5
>         vmv.v.i v1,0
>         vsetvli zero,a3,e32,m1,tu,ma
>         vle32.v v2,0(a2)
>         vmv.v.v v1,v2
>         bne     a3,a5,.L21
> .L7:
>         vsetvli a4,zero,e32,m1,ta,ma
>         vmv.s.x v2,zero
>         vredsum.vs      v1,v1,v2
>         vmv.x.s a5,v1
>         addw    a0,a0,a5
> .L15:
>         addi    sp,sp,16
>         jr      ra
> .L21:
>         slli    a5,a5,2
>         add     a2,a2,a5
>         vsetvli zero,a1,e32,m1,tu,ma
>         vle32.v v2,0(a2)
>         vadd.vv v1,v1,v2
>         j       .L7
> .L8:
>         li      a0,0
>         ret
> .L9:
>         li      a1,0
>         li      a0,0
>         j       .L3
>
> The rootcause of this is we missed RVV builtin vectorization cost model.
>
> After this patch:
>
>         ble     a1,zero,.L4
>         vsetvli a5,zero,e32,m1,ta,ma
>         vmv.v.i v1,0
> .L3:
>         vsetvli a5,a1,e32,m1,tu,ma
>         vle32.v v2,0(a0)
>         slli    a4,a5,2
>         sub     a1,a1,a5
>         add     a0,a0,a4
>         vadd.vv v1,v2,v1
>         bne     a1,zero,.L3
>         li      a5,0
>         vsetivli        zero,1,e32,m1,ta,ma
>         vmv.s.x v2,a5
>         vsetvli a5,zero,e32,m1,ta,ma
>         vredsum.vs      v1,v1,v2
>         vmv.x.s a0,v1
>         ret
> .L4:
>         li      a0,0
>         ret
>
>         PR target/111153
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv-protos.h (struct common_vector_cost): New
> struct.
>         (struct scalable_vector_cost): Ditto.
>         (struct cpu_vector_cost): Ditto.
>         * config/riscv/riscv-vector-costs.cc (costs::add_stmt_cost): Add
> RVV builtin vectorization cost
>         * config/riscv/riscv.cc (struct riscv_tune_param): Ditto.
>         (get_common_costs): New function.
>         (riscv_builtin_vectorization_cost): Ditto.
>         (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New targethook.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/costmodel/riscv/rvv/pr111153.c: New test.
>
> ---
>  gcc/config/riscv/riscv-protos.h               |  76 ++++++++++
>  gcc/config/riscv/riscv-vector-costs.cc        |   5 +-
>  gcc/config/riscv/riscv.cc                     | 143 ++++++++++++++++++
>  .../vect/costmodel/riscv/rvv/pr111153.c       |  18 +++
>  4 files changed, 239 insertions(+), 3 deletions(-)
>  create mode 100644
> gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h
> b/gcc/config/riscv/riscv-protos.h
> index 85ab1db2088..7de0b031001 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -200,6 +200,82 @@ struct riscv_cpu_info {
>
>  extern const riscv_cpu_info *riscv_find_cpu (const char *);
>
> +/* Common vector costs in any kind of vectorization (e.g VLA and VLS).  */
> +struct common_vector_cost
> +{
> +  /* Cost of any integer vector operation, excluding the ones handled
> +     specially below.  */
> +  const int int_stmt_cost;
> +
> +  /* Cost of any fp vector operation, excluding the ones handled
> +     specially below.  */
> +  const int fp_stmt_cost;
> +
> +  /* Gather/scatter vectorization cost.  */
> +  const int gather_load_cost;
> +  const int scatter_store_cost;
> +
> +  /* Cost of a vector-to-scalar operation.  */
> +  const int vec_to_scalar_cost;
> +
> +  /* Cost of a scalar-to-vector operation.  */
> +  const int scalar_to_vec_cost;
> +
> +  /* Cost of a permute operation.  */
> +  const int permute_cost;
> +
> +  /* Cost of an aligned vector load.  */
> +  const int align_load_cost;
> +
> +  /* Cost of an aligned vector store.  */
> +  const int align_store_cost;
> +
> +  /* Cost of an unaligned vector load.  */
> +  const int unalign_load_cost;
> +
> +  /* Cost of an unaligned vector store.  */
> +  const int unalign_store_cost;
> +};
> +
> +/* scalable vectorization (VLA) specific cost.  */
> +struct scalable_vector_cost : common_vector_cost
> +{
> +  CONSTEXPR scalable_vector_cost (const common_vector_cost &base)
> +    : common_vector_cost (base)
> +  {}
> +
> +  /* TODO: We will need more other kinds of vector cost for VLA.
> +     E.g. fold_left reduction cost, lanes load/store cost, ..., etc.  */
> +};
> +
> +/* Cost for vector insn classes.  */
> +struct cpu_vector_cost
> +{
> +  /* Cost of any integer scalar operation, excluding load and store.  */
> +  const int scalar_int_stmt_cost;
> +
> +  /* Cost of any fp scalar operation, excluding load and store.  */
> +  const int scalar_fp_stmt_cost;
> +
> +  /* Cost of a scalar load.  */
> +  const int scalar_load_cost;
> +
> +  /* Cost of a scalar store.  */
> +  const int scalar_store_cost;
> +
> +  /* Cost of a taken branch.  */
> +  const int cond_taken_branch_cost;
> +
> +  /* Cost of a not-taken branch.  */
> +  const int cond_not_taken_branch_cost;
> +
> +  /* Cost of an VLS modes operations.  */
> +  const common_vector_cost *vls;
> +
> +  /* Cost of an VLA modes operations.  */
> +  const scalable_vector_cost *vla;
> +};
> +
>  /* Routines implemented in riscv-selftests.cc.  */
>  #if CHECKING_P
>  namespace selftest {
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc
> b/gcc/config/riscv/riscv-vector-costs.cc
> index 7888cef58fe..e7bc9ed5233 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -750,9 +750,8 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt
> kind,
>                       stmt_vec_info stmt_info, slp_tree, tree vectype,
>                       int misalign, vect_cost_model_location where)
>  {
> -  /* TODO: Use default STMT cost model.
> -          We will support more accurate STMT cost model later.  */
> -  int stmt_cost = default_builtin_vectorization_cost (kind, vectype,
> misalign);
> +  int stmt_cost
> +    = targetm.vectorize.builtin_vectorization_cost (kind, vectype,
> misalign);
>
>    /* Do one-time initialization based on the vinfo.  */
>    loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 69a8a503f30..2dc44244309 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -281,6 +281,7 @@ struct riscv_tune_param
>    bool slow_unaligned_access;
>    bool use_divmod_expansion;
>    unsigned int fusible_ops;
> +  const struct cpu_vector_cost *vec_costs;
>  };
>
>
> @@ -348,6 +349,50 @@ const enum reg_class
> riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
>    VD_REGS,     VD_REGS,        VD_REGS,        VD_REGS,
>  };
>
> +/* Generic costs for VLS vector operations.   */
> +static const common_vector_cost generic_vls_vector_cost = {
> +  1, /* int_stmt_cost  */
> +  1, /* fp_stmt_cost  */
> +  1, /* gather_load_cost  */
> +  1, /* scatter_store_cost  */
> +  1, /* vec_to_scalar_cost  */
> +  1, /* scalar_to_vec_cost  */
> +  1, /* permute_cost  */
> +  3, /* align_load_cost  */
> +  3, /* align_store_cost  */
> +  3, /* unalign_load_cost  */
> +  3, /* unalign_store_cost  */
> +};
> +
> +/* Generic costs for VLA vector operations.  */
> +static const scalable_vector_cost generic_vla_vector_cost = {
> +  {
> +    1, /* int_stmt_cost  */
> +    1, /* fp_stmt_cost  */
> +    1, /* gather_load_cost  */
> +    1, /* scatter_store_cost  */
> +    1, /* vec_to_scalar_cost  */
> +    1, /* scalar_to_vec_cost  */
> +    1, /* permute_cost  */
> +    3, /* align_load_cost  */
> +    3, /* align_store_cost  */
> +    3, /* unalign_load_cost  */
> +    3, /* unalign_store_cost  */
> +  },
> +};
> +
> +/* Generic costs for vector insn classes.  */
> +static const struct cpu_vector_cost generic_vector_cost = {
> +  1,                       /* scalar_int_stmt_cost  */
> +  1,                       /* scalar_fp_stmt_cost  */
> +  1,                       /* scalar_load_cost  */
> +  1,                       /* scalar_store_cost  */
> +  3,                       /* cond_taken_branch_cost  */
> +  1,                       /* cond_not_taken_branch_cost  */
> +  &generic_vls_vector_cost, /* vls  */
> +  &generic_vla_vector_cost, /* vla */
> +};
> +
>  /* Costs to use when optimizing for rocket.  */
>  static const struct riscv_tune_param rocket_tune_info = {
>    {COSTS_N_INSNS (4), COSTS_N_INSNS (5)},      /* fp_add */
> @@ -362,6 +407,7 @@ static const struct riscv_tune_param rocket_tune_info
> = {
>    true,                                                /*
> slow_unaligned_access */
>    false,                                       /* use_divmod_expansion */
>    RISCV_FUSE_NOTHING,                           /* fusible_ops */
> +  NULL,                                                /* vector cost */
>  };
>
>  /* Costs to use when optimizing for Sifive 7 Series.  */
> @@ -378,6 +424,7 @@ static const struct riscv_tune_param
> sifive_7_tune_info = {
>    true,                                                /*
> slow_unaligned_access */
>    false,                                       /* use_divmod_expansion */
>    RISCV_FUSE_NOTHING,                           /* fusible_ops */
> +  NULL,                                                /* vector cost */
>  };
>
>  /* Costs to use when optimizing for T-HEAD c906.  */
> @@ -394,6 +441,7 @@ static const struct riscv_tune_param
> thead_c906_tune_info = {
>    false,            /* slow_unaligned_access */
>    false,       /* use_divmod_expansion */
>    RISCV_FUSE_NOTHING,                           /* fusible_ops */
> +  NULL,                                                /* vector cost */
>  };
>
>  /* Costs to use when optimizing for a generic ooo profile.  */
> @@ -410,6 +458,7 @@ static const struct riscv_tune_param
> generic_ooo_tune_info = {
>    false,                                       /* slow_unaligned_access */
>    false,                                       /* use_divmod_expansion */
>    RISCV_FUSE_NOTHING,                           /* fusible_ops */
> +  &generic_vector_cost,                                /* vector cost */
>  };
>
>  /* Costs to use when optimizing for size.  */
> @@ -426,6 +475,7 @@ static const struct riscv_tune_param
> optimize_size_tune_info = {
>    false,                                       /* slow_unaligned_access */
>    false,                                       /* use_divmod_expansion */
>    RISCV_FUSE_NOTHING,                           /* fusible_ops */
> +  NULL,                                                /* vector cost */
>  };
>
>  static bool riscv_avoid_shrink_wrapping_separate ();
> @@ -10192,6 +10242,95 @@ riscv_frame_pointer_required (void)
>    return riscv_save_frame_pointer && !crtl->is_leaf;
>  }
>
> +/* Return the appropriate common costs for vectors of type VECTYPE.  */
> +static const common_vector_cost *
> +get_common_costs (tree vectype)
> +{
> +  const cpu_vector_cost *costs = tune_param->vec_costs;
> +  gcc_assert (costs);
> +
> +  if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype)))
> +    return costs->vls;
> +  return costs->vla;
> +}
> +
> +/* Implement targetm.vectorize.builtin_vectorization_cost.  */
> +
> +static int
> +riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> +                                 tree vectype, int misalign
> ATTRIBUTE_UNUSED)
> +{
> +  unsigned elements;
> +  const cpu_vector_cost *costs = tune_param->vec_costs;
> +  bool fp = false;
> +
> +  if (vectype != NULL)
> +    fp = FLOAT_TYPE_P (vectype);
> +
> +  if (costs != NULL)
> +    {
> +      const common_vector_cost *common_costs = get_common_costs (vectype);
> +      gcc_assert (common_costs != NULL);
> +      switch (type_of_cost)
> +       {
> +       case scalar_stmt:
> +         return fp ? costs->scalar_fp_stmt_cost :
> costs->scalar_int_stmt_cost;
> +
> +       case scalar_load:
> +         return costs->scalar_load_cost;
> +
> +       case scalar_store:
> +         return costs->scalar_store_cost;
> +
> +       case vector_stmt:
> +         return fp ? common_costs->fp_stmt_cost :
> common_costs->int_stmt_cost;
> +
> +       case vector_load:
> +         return common_costs->align_load_cost;
> +
> +       case vector_store:
> +         return common_costs->align_store_cost;
> +
> +       case vec_to_scalar:
> +         return common_costs->vec_to_scalar_cost;
> +
> +       case scalar_to_vec:
> +         return common_costs->scalar_to_vec_cost;
> +
> +       case unaligned_load:
> +         return common_costs->unalign_load_cost;
> +       case vector_gather_load:
> +         return common_costs->gather_load_cost;
> +
> +       case unaligned_store:
> +         return common_costs->unalign_store_cost;
> +       case vector_scatter_store:
> +         return common_costs->scatter_store_cost;
> +
> +       case cond_branch_taken:
> +         return costs->cond_taken_branch_cost;
> +
> +       case cond_branch_not_taken:
> +         return costs->cond_not_taken_branch_cost;
> +
> +       case vec_perm:
> +         return common_costs->permute_cost;
> +
> +       case vec_promote_demote:
> +         return fp ? common_costs->fp_stmt_cost :
> common_costs->int_stmt_cost;
> +
> +       case vec_construct:
> +         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
> +         return elements / 2 + 1;
> +
> +       default:
> +         gcc_unreachable ();
> +       }
> +    }
> +
> +  return default_builtin_vectorization_cost (type_of_cost, vectype,
> misalign);
> +}
> +
>  /* Implement targetm.vectorize.create_costs.  */
>
>  static vector_costs *
> @@ -10582,6 +10721,10 @@ extract_base_offset_in_addr (rtx mem, rtx *base,
> rtx *offset)
>  #undef TARGET_FRAME_POINTER_REQUIRED
>  #define TARGET_FRAME_POINTER_REQUIRED riscv_frame_pointer_required
>
> +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
> +#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
> +  riscv_builtin_vectorization_cost
> +
>  #undef TARGET_VECTORIZE_CREATE_COSTS
>  #define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs
>
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
> b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
> new file mode 100644
> index 00000000000..06e08ec5f2e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize
> -mtune=generic-ooo" } */
> +
> +#define DEF_REDUC_PLUS(TYPE)
>      \
> +  TYPE __attribute__ ((noinline, noclone))
>      \
> +  reduc_plus_##TYPE (TYPE *__restrict a, int n)
>       \
> +  {
>       \
> +    TYPE r = 0;
>       \
> +    for (int i = 0; i < n; ++i)
>       \
> +      r += a[i];
>      \
> +    return r;
>       \
> +  }
> +
> +#define TEST_PLUS(T) T (int)
> +
> +TEST_PLUS (DEF_REDUC_PLUS)
> +
> +/* { dg-final { scan-assembler-not {vsetivli\s+zero,\s*4} } } */
> --
> 2.36.3
>
>

Li, Pan2 Dec. 14, 2023, 6:53 a.m. UTC | #2

Committed, thanks Kito.

Pan

From: Kito Cheng <kito.cheng@gmail.com>
Sent: Thursday, December 14, 2023 2:45 PM
To: Juzhe-Zhong <juzhe.zhong@rivai.ai>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Kito Cheng <kito.cheng@sifive.com>; Jeff Law <jeffreyalaw@gmail.com>; Robin Dapp <rdapp.gcc@gmail.com>
Subject: Re: [PATCH] RISC-V: Add RVV builtin vectorization cost model

LGTM

Juzhe-Zhong <juzhe.zhong@rivai.ai<mailto:juzhe.zhong@rivai.ai>> 於 2023年12月14日 週四 11:24 寫道：
This patch fixes PR11153:

        ble     a1,zero,.L8
        addiw   a5,a1,-1
        li      a4,4
        addi    sp,sp,-16
        mv      a2,a0
        sext.w  a3,a1
        bleu    a5,a4,.L9
        srliw   a4,a3,2
        slli    a4,a4,4
        mv      a5,a0
        add     a4,a4,a0
        vsetivli        zero,4,e32,m1,ta,ma
        vmv.v.i v1,0
        vse32.v v1,0(sp)
.L4:
        vle32.v v1,0(a5) ---> This loop always processes 4 elements which is ok for VLEN = 128bits, but waste a huge amount of computation units when VLEN > 128bits
        vle32.v v2,0(sp)
        addi    a5,a5,16
        vadd.vv v1,v2,v1
        vse32.v v1,0(sp)
        bne     a4,a5,.L4
        ld      a5,0(sp)
        lw      a4,0(sp)
        andi    a1,a1,-4
        srai    a5,a5,32
        addw    a5,a4,a5
        lw      a4,8(sp)
        addw    a5,a5,a4
        ld      a4,8(sp)
        srai    a4,a4,32
        addw    a0,a5,a4
        beq     a3,a1,.L15
.L3:
        subw    a3,a3,a1
        slli    a5,a1,32
        slli    a3,a3,32
        srli    a3,a3,32
        srli    a5,a5,30
        add     a2,a2,a5
        vsetvli a5,a3,e8,mf4,tu,mu
        vsetvli a4,zero,e32,m1,ta,ma
        sub     a1,a3,a5
        vmv.v.i v1,0
        vsetvli zero,a3,e32,m1,tu,ma
        vle32.v v2,0(a2)
        vmv.v.v v1,v2
        bne     a3,a5,.L21
.L7:
        vsetvli a4,zero,e32,m1,ta,ma
        vmv.s.x v2,zero
        vredsum.vs      v1,v1,v2
        vmv.x.s a5,v1
        addw    a0,a0,a5
.L15:
        addi    sp,sp,16
        jr      ra
.L21:
        slli    a5,a5,2
        add     a2,a2,a5
        vsetvli zero,a1,e32,m1,tu,ma
        vle32.v v2,0(a2)
        vadd.vv v1,v1,v2
        j       .L7
.L8:
        li      a0,0
        ret
.L9:
        li      a1,0
        li      a0,0
        j       .L3

The rootcause of this is we missed RVV builtin vectorization cost model.

After this patch:

        ble     a1,zero,.L4
        vsetvli a5,zero,e32,m1,ta,ma
        vmv.v.i v1,0
.L3:
        vsetvli a5,a1,e32,m1,tu,ma
        vle32.v v2,0(a0)
        slli    a4,a5,2
        sub     a1,a1,a5
        add     a0,a0,a4
        vadd.vv v1,v2,v1
        bne     a1,zero,.L3
        li      a5,0
        vsetivli        zero,1,e32,m1,ta,ma
        vmv.s.x v2,a5
        vsetvli a5,zero,e32,m1,ta,ma
        vredsum.vs      v1,v1,v2
        vmv.x.s a0,v1
        ret
.L4:
        li      a0,0
        ret

        PR target/111153

gcc/ChangeLog:

        * config/riscv/riscv-protos.h (struct common_vector_cost): New struct.
        (struct scalable_vector_cost): Ditto.
        (struct cpu_vector_cost): Ditto.
        * config/riscv/riscv-vector-costs.cc (costs::add_stmt_cost): Add RVV builtin vectorization cost
        * config/riscv/riscv.cc (struct riscv_tune_param): Ditto.
        (get_common_costs): New function.
        (riscv_builtin_vectorization_cost): Ditto.
        (TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New targethook.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/costmodel/riscv/rvv/pr111153.c: New test.

---
 gcc/config/riscv/riscv-protos.h               |  76 ++++++++++
 gcc/config/riscv/riscv-vector-costs.cc        |   5 +-
 gcc/config/riscv/riscv.cc                     | 143 ++++++++++++++++++
 .../vect/costmodel/riscv/rvv/pr111153.c       |  18 +++
 4 files changed, 239 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 85ab1db2088..7de0b031001 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -200,6 +200,82 @@ struct riscv_cpu_info {

 extern const riscv_cpu_info *riscv_find_cpu (const char *);

+/* Common vector costs in any kind of vectorization (e.g VLA and VLS).  */
+struct common_vector_cost
+{
+  /* Cost of any integer vector operation, excluding the ones handled
+     specially below.  */
+  const int int_stmt_cost;
+
+  /* Cost of any fp vector operation, excluding the ones handled
+     specially below.  */
+  const int fp_stmt_cost;
+
+  /* Gather/scatter vectorization cost.  */
+  const int gather_load_cost;
+  const int scatter_store_cost;
+
+  /* Cost of a vector-to-scalar operation.  */
+  const int vec_to_scalar_cost;
+
+  /* Cost of a scalar-to-vector operation.  */
+  const int scalar_to_vec_cost;
+
+  /* Cost of a permute operation.  */
+  const int permute_cost;
+
+  /* Cost of an aligned vector load.  */
+  const int align_load_cost;
+
+  /* Cost of an aligned vector store.  */
+  const int align_store_cost;
+
+  /* Cost of an unaligned vector load.  */
+  const int unalign_load_cost;
+
+  /* Cost of an unaligned vector store.  */
+  const int unalign_store_cost;
+};
+
+/* scalable vectorization (VLA) specific cost.  */
+struct scalable_vector_cost : common_vector_cost
+{
+  CONSTEXPR scalable_vector_cost (const common_vector_cost &base)
+    : common_vector_cost (base)
+  {}
+
+  /* TODO: We will need more other kinds of vector cost for VLA.
+     E.g. fold_left reduction cost, lanes load/store cost, ..., etc.  */
+};
+
+/* Cost for vector insn classes.  */
+struct cpu_vector_cost
+{
+  /* Cost of any integer scalar operation, excluding load and store.  */
+  const int scalar_int_stmt_cost;
+
+  /* Cost of any fp scalar operation, excluding load and store.  */
+  const int scalar_fp_stmt_cost;
+
+  /* Cost of a scalar load.  */
+  const int scalar_load_cost;
+
+  /* Cost of a scalar store.  */
+  const int scalar_store_cost;
+
+  /* Cost of a taken branch.  */
+  const int cond_taken_branch_cost;
+
+  /* Cost of a not-taken branch.  */
+  const int cond_not_taken_branch_cost;
+
+  /* Cost of an VLS modes operations.  */
+  const common_vector_cost *vls;
+
+  /* Cost of an VLA modes operations.  */
+  const scalable_vector_cost *vla;
+};
+
 /* Routines implemented in riscv-selftests.cc.  */
 #if CHECKING_P
 namespace selftest {
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 7888cef58fe..e7bc9ed5233 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -750,9 +750,8 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                      stmt_vec_info stmt_info, slp_tree, tree vectype,
                      int misalign, vect_cost_model_location where)
 {
-  /* TODO: Use default STMT cost model.
-          We will support more accurate STMT cost model later.  */
-  int stmt_cost = default_builtin_vectorization_cost (kind, vectype, misalign);
+  int stmt_cost
+    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);

   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 69a8a503f30..2dc44244309 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -281,6 +281,7 @@ struct riscv_tune_param
   bool slow_unaligned_access;
   bool use_divmod_expansion;
   unsigned int fusible_ops;
+  const struct cpu_vector_cost *vec_costs;
 };


@@ -348,6 +349,50 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
   VD_REGS,     VD_REGS,        VD_REGS,        VD_REGS,
 };

+/* Generic costs for VLS vector operations.   */
+static const common_vector_cost generic_vls_vector_cost = {
+  1, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  1, /* gather_load_cost  */
+  1, /* scatter_store_cost  */
+  1, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* permute_cost  */
+  3, /* align_load_cost  */
+  3, /* align_store_cost  */
+  3, /* unalign_load_cost  */
+  3, /* unalign_store_cost  */
+};
+
+/* Generic costs for VLA vector operations.  */
+static const scalable_vector_cost generic_vla_vector_cost = {
+  {
+    1, /* int_stmt_cost  */
+    1, /* fp_stmt_cost  */
+    1, /* gather_load_cost  */
+    1, /* scatter_store_cost  */
+    1, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* permute_cost  */
+    3, /* align_load_cost  */
+    3, /* align_store_cost  */
+    3, /* unalign_load_cost  */
+    3, /* unalign_store_cost  */
+  },
+};
+
+/* Generic costs for vector insn classes.  */
+static const struct cpu_vector_cost generic_vector_cost = {
+  1,                       /* scalar_int_stmt_cost  */
+  1,                       /* scalar_fp_stmt_cost  */
+  1,                       /* scalar_load_cost  */
+  1,                       /* scalar_store_cost  */
+  3,                       /* cond_taken_branch_cost  */
+  1,                       /* cond_not_taken_branch_cost  */
+  &generic_vls_vector_cost, /* vls  */
+  &generic_vla_vector_cost, /* vla */
+};
+
 /* Costs to use when optimizing for rocket.  */
 static const struct riscv_tune_param rocket_tune_info = {
   {COSTS_N_INSNS (4), COSTS_N_INSNS (5)},      /* fp_add */
@@ -362,6 +407,7 @@ static const struct riscv_tune_param rocket_tune_info = {
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,                                                /* vector cost */
 };

 /* Costs to use when optimizing for Sifive 7 Series.  */
@@ -378,6 +424,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
   true,                                                /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,                                                /* vector cost */
 };

 /* Costs to use when optimizing for T-HEAD c906.  */
@@ -394,6 +441,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
   false,            /* slow_unaligned_access */
   false,       /* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,                                                /* vector cost */
 };

 /* Costs to use when optimizing for a generic ooo profile.  */
@@ -410,6 +458,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
   false,                                       /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  &generic_vector_cost,                                /* vector cost */
 };

 /* Costs to use when optimizing for size.  */
@@ -426,6 +475,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
   false,                                       /* slow_unaligned_access */
   false,                                       /* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,                                                /* vector cost */
 };

 static bool riscv_avoid_shrink_wrapping_separate ();
@@ -10192,6 +10242,95 @@ riscv_frame_pointer_required (void)
   return riscv_save_frame_pointer && !crtl->is_leaf;
 }

+/* Return the appropriate common costs for vectors of type VECTYPE.  */
+static const common_vector_cost *
+get_common_costs (tree vectype)
+{
+  const cpu_vector_cost *costs = tune_param->vec_costs;
+  gcc_assert (costs);
+
+  if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype)))
+    return costs->vls;
+  return costs->vla;
+}
+
+/* Implement targetm.vectorize.builtin_vectorization_cost.  */
+
+static int
+riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+                                 tree vectype, int misalign ATTRIBUTE_UNUSED)
+{
+  unsigned elements;
+  const cpu_vector_cost *costs = tune_param->vec_costs;
+  bool fp = false;
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+
+  if (costs != NULL)
+    {
+      const common_vector_cost *common_costs = get_common_costs (vectype);
+      gcc_assert (common_costs != NULL);
+      switch (type_of_cost)
+       {
+       case scalar_stmt:
+         return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
+
+       case scalar_load:
+         return costs->scalar_load_cost;
+
+       case scalar_store:
+         return costs->scalar_store_cost;
+
+       case vector_stmt:
+         return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
+
+       case vector_load:
+         return common_costs->align_load_cost;
+
+       case vector_store:
+         return common_costs->align_store_cost;
+
+       case vec_to_scalar:
+         return common_costs->vec_to_scalar_cost;
+
+       case scalar_to_vec:
+         return common_costs->scalar_to_vec_cost;
+
+       case unaligned_load:
+         return common_costs->unalign_load_cost;
+       case vector_gather_load:
+         return common_costs->gather_load_cost;
+
+       case unaligned_store:
+         return common_costs->unalign_store_cost;
+       case vector_scatter_store:
+         return common_costs->scatter_store_cost;
+
+       case cond_branch_taken:
+         return costs->cond_taken_branch_cost;
+
+       case cond_branch_not_taken:
+         return costs->cond_not_taken_branch_cost;
+
+       case vec_perm:
+         return common_costs->permute_cost;
+
+       case vec_promote_demote:
+         return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
+
+       case vec_construct:
+         elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
+         return elements / 2 + 1;
+
+       default:
+         gcc_unreachable ();
+       }
+    }
+
+  return default_builtin_vectorization_cost (type_of_cost, vectype, misalign);
+}
+
 /* Implement targetm.vectorize.create_costs.  */

 static vector_costs *
@@ -10582,6 +10721,10 @@ extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED riscv_frame_pointer_required

+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+  riscv_builtin_vectorization_cost
+
 #undef TARGET_VECTORIZE_CREATE_COSTS
 #define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
new file mode 100644
index 00000000000..06e08ec5f2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -mtune=generic-ooo" } */
+
+#define DEF_REDUC_PLUS(TYPE)                                                   \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  reduc_plus_##TYPE (TYPE *__restrict a, int n)                                \
+  {                                                                            \
+    TYPE r = 0;                                                                \
+    for (int i = 0; i < n; ++i)                                                \
+      r += a[i];                                                               \
+    return r;                                                                  \
+  }
+
+#define TEST_PLUS(T) T (int)
+
+TEST_PLUS (DEF_REDUC_PLUS)
+
+/* { dg-final { scan-assembler-not {vsetivli\s+zero,\s*4} } } */
--
2.36.3

diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 85ab1db2088..7de0b031001 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -200,6 +200,82 @@  struct riscv_cpu_info {
 
 extern const riscv_cpu_info *riscv_find_cpu (const char *);
 
+/* Common vector costs in any kind of vectorization (e.g. VLA and VLS).  */
+struct common_vector_cost
+{
+  /* Cost of any integer vector operation, excluding the ones handled
+     specially below.  */
+  const int int_stmt_cost;
+
+  /* Cost of any fp vector operation, excluding the ones handled
+     specially below.  */
+  const int fp_stmt_cost;
+
+  /* Gather/scatter vectorization cost.  */
+  const int gather_load_cost;
+  const int scatter_store_cost;
+
+  /* Cost of a vector-to-scalar operation.  */
+  const int vec_to_scalar_cost;
+
+  /* Cost of a scalar-to-vector operation.  */
+  const int scalar_to_vec_cost;
+
+  /* Cost of a permute operation.  */
+  const int permute_cost;
+
+  /* Cost of an aligned vector load.  */
+  const int align_load_cost;
+
+  /* Cost of an aligned vector store.  */
+  const int align_store_cost;
+
+  /* Cost of an unaligned vector load.  */
+  const int unalign_load_cost;
+
+  /* Cost of an unaligned vector store.  */
+  const int unalign_store_cost;
+};
+
+/* Scalable vectorization (VLA) specific cost.  */
+struct scalable_vector_cost : common_vector_cost
+{
+  CONSTEXPR scalable_vector_cost (const common_vector_cost &base)
+    : common_vector_cost (base)
+  {}
+
+  /* TODO: Add further VLA-specific vector costs here, e.g. fold_left
+     reduction cost and lanes load/store cost.  */
+};
+
+/* Cost for vector insn classes.  */
+struct cpu_vector_cost
+{
+  /* Cost of any integer scalar operation, excluding load and store.  */
+  const int scalar_int_stmt_cost;
+
+  /* Cost of any fp scalar operation, excluding load and store.  */
+  const int scalar_fp_stmt_cost;
+
+  /* Cost of a scalar load.  */
+  const int scalar_load_cost;
+
+  /* Cost of a scalar store.  */
+  const int scalar_store_cost;
+
+  /* Cost of a taken branch.  */
+  const int cond_taken_branch_cost;
+
+  /* Cost of a not-taken branch.  */
+  const int cond_not_taken_branch_cost;
+
+  /* Costs of VLS mode operations.  */
+  const common_vector_cost *vls;
+
+  /* Costs of VLA mode operations.  */
+  const scalable_vector_cost *vla;
+};
+
 /* Routines implemented in riscv-selftests.cc.  */
 #if CHECKING_P
 namespace selftest {
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 7888cef58fe..e7bc9ed5233 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -750,9 +750,8 @@  costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 		      stmt_vec_info stmt_info, slp_tree, tree vectype,
 		      int misalign, vect_cost_model_location where)
 {
-  /* TODO: Use default STMT cost model.
-	   We will support more accurate STMT cost model later.  */
-  int stmt_cost = default_builtin_vectorization_cost (kind, vectype, misalign);
+  int stmt_cost
+    = targetm.vectorize.builtin_vectorization_cost (kind, vectype, misalign);
 
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 69a8a503f30..2dc44244309 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -281,6 +281,7 @@  struct riscv_tune_param
   bool slow_unaligned_access;
   bool use_divmod_expansion;
   unsigned int fusible_ops;
+  const struct cpu_vector_cost *vec_costs;
 };
 
 
@@ -348,6 +349,50 @@  const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = {
   VD_REGS,	VD_REGS,	VD_REGS,	VD_REGS,
 };
 
+/* Generic costs for VLS vector operations.  */
+static const common_vector_cost generic_vls_vector_cost = {
+  1, /* int_stmt_cost  */
+  1, /* fp_stmt_cost  */
+  1, /* gather_load_cost  */
+  1, /* scatter_store_cost  */
+  1, /* vec_to_scalar_cost  */
+  1, /* scalar_to_vec_cost  */
+  1, /* permute_cost  */
+  3, /* align_load_cost  */
+  3, /* align_store_cost  */
+  3, /* unalign_load_cost  */
+  3, /* unalign_store_cost  */
+};
+
+/* Generic costs for VLA vector operations.  */
+static const scalable_vector_cost generic_vla_vector_cost = {
+  {
+    1, /* int_stmt_cost  */
+    1, /* fp_stmt_cost  */
+    1, /* gather_load_cost  */
+    1, /* scatter_store_cost  */
+    1, /* vec_to_scalar_cost  */
+    1, /* scalar_to_vec_cost  */
+    1, /* permute_cost  */
+    3, /* align_load_cost  */
+    3, /* align_store_cost  */
+    3, /* unalign_load_cost  */
+    3, /* unalign_store_cost  */
+  },
+};
+
+/* Generic costs for vector insn classes.  */
+static const struct cpu_vector_cost generic_vector_cost = {
+  1,			    /* scalar_int_stmt_cost  */
+  1,			    /* scalar_fp_stmt_cost  */
+  1,			    /* scalar_load_cost  */
+  1,			    /* scalar_store_cost  */
+  3,			    /* cond_taken_branch_cost  */
+  1,			    /* cond_not_taken_branch_cost  */
+  &generic_vls_vector_cost, /* vls  */
+  &generic_vla_vector_cost, /* vla */
+};
+
 /* Costs to use when optimizing for rocket.  */
 static const struct riscv_tune_param rocket_tune_info = {
   {COSTS_N_INSNS (4), COSTS_N_INSNS (5)},	/* fp_add */
@@ -362,6 +407,7 @@  static const struct riscv_tune_param rocket_tune_info = {
   true,						/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,						/* vector cost */
 };
 
 /* Costs to use when optimizing for Sifive 7 Series.  */
@@ -378,6 +424,7 @@  static const struct riscv_tune_param sifive_7_tune_info = {
   true,						/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,						/* vector cost */
 };
 
 /* Costs to use when optimizing for T-HEAD c906.  */
@@ -394,6 +441,7 @@  static const struct riscv_tune_param thead_c906_tune_info = {
   false,            /* slow_unaligned_access */
   false,	/* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,						/* vector cost */
 };
 
 /* Costs to use when optimizing for a generic ooo profile.  */
@@ -410,6 +458,7 @@  static const struct riscv_tune_param generic_ooo_tune_info = {
   false,					/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  &generic_vector_cost,				/* vector cost */
 };
 
 /* Costs to use when optimizing for size.  */
@@ -426,6 +475,7 @@  static const struct riscv_tune_param optimize_size_tune_info = {
   false,					/* slow_unaligned_access */
   false,					/* use_divmod_expansion */
   RISCV_FUSE_NOTHING,                           /* fusible_ops */
+  NULL,						/* vector cost */
 };
 
 static bool riscv_avoid_shrink_wrapping_separate ();
@@ -10192,6 +10242,95 @@  riscv_frame_pointer_required (void)
   return riscv_save_frame_pointer && !crtl->is_leaf;
 }
 
+/* Return the appropriate common costs for vectors of type VECTYPE.  */
+static const common_vector_cost *
+get_common_costs (tree vectype)
+{
+  const cpu_vector_cost *costs = tune_param->vec_costs;
+  gcc_assert (costs);
+
+  if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype)))
+    return costs->vls;
+  return costs->vla;
+}
+
+/* Implement targetm.vectorize.builtin_vectorization_cost.  */
+
+static int
+riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
+				  tree vectype, int misalign ATTRIBUTE_UNUSED)
+{
+  unsigned elements;
+  const cpu_vector_cost *costs = tune_param->vec_costs;
+  bool fp = false;
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+
+  if (costs != NULL)
+    {
+      const common_vector_cost *common_costs = get_common_costs (vectype);
+      gcc_assert (common_costs != NULL);
+      switch (type_of_cost)
+	{
+	case scalar_stmt:
+	  return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
+
+	case scalar_load:
+	  return costs->scalar_load_cost;
+
+	case scalar_store:
+	  return costs->scalar_store_cost;
+
+	case vector_stmt:
+	  return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
+
+	case vector_load:
+	  return common_costs->align_load_cost;
+
+	case vector_store:
+	  return common_costs->align_store_cost;
+
+	case vec_to_scalar:
+	  return common_costs->vec_to_scalar_cost;
+
+	case scalar_to_vec:
+	  return common_costs->scalar_to_vec_cost;
+
+	case unaligned_load:
+	  return common_costs->unalign_load_cost;
+	case vector_gather_load:
+	  return common_costs->gather_load_cost;
+
+	case unaligned_store:
+	  return common_costs->unalign_store_cost;
+	case vector_scatter_store:
+	  return common_costs->scatter_store_cost;
+
+	case cond_branch_taken:
+	  return costs->cond_taken_branch_cost;
+
+	case cond_branch_not_taken:
+	  return costs->cond_not_taken_branch_cost;
+
+	case vec_perm:
+	  return common_costs->permute_cost;
+
+	case vec_promote_demote:
+	  return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
+
+	case vec_construct:
+	  elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
+	  return elements / 2 + 1;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  return default_builtin_vectorization_cost (type_of_cost, vectype, misalign);
+}
+
 /* Implement targetm.vectorize.create_costs.  */
 
 static vector_costs *
@@ -10582,6 +10721,10 @@  extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED riscv_frame_pointer_required
 
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+  riscv_builtin_vectorization_cost
+
 #undef TARGET_VECTORIZE_CREATE_COSTS
 #define TARGET_VECTORIZE_CREATE_COSTS riscv_vectorize_create_costs
 
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
new file mode 100644
index 00000000000..06e08ec5f2e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr111153.c
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -mtune=generic-ooo" } */
+
+#define DEF_REDUC_PLUS(TYPE)                                                   \
+  TYPE __attribute__ ((noinline, noclone))                                     \
+  reduc_plus_##TYPE (TYPE *__restrict a, int n)                                \
+  {                                                                            \
+    TYPE r = 0;                                                                \
+    for (int i = 0; i < n; ++i)                                                \
+      r += a[i];                                                               \
+    return r;                                                                  \
+  }
+
+#define TEST_PLUS(T) T (int)
+
+TEST_PLUS (DEF_REDUC_PLUS)
+
+/* { dg-final { scan-assembler-not {vsetivli\s+zero,\s*4} } } */