[v7] RISC-V: Using merge approach to optimize repeating sequence in vec_init
Checks
Commit Message
From: Pan Li <pan2.li@intel.com>
This patch would like to optimize the VLS vector initialization like
repeating sequence. From the vslide1down to the vmerge with a simple
cost model, aka every instruction only has 1 cost.
Given code with -march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax
typedef int64_t vnx32di __attribute__ ((vector_size (256)));
__attribute__ ((noipa)) void
f_vnx32di (int64_t a, int64_t b, int64_t *out)
{
vnx32di v = {
a, b, a, b, a, b, a, b,
a, b, a, b, a, b, a, b,
a, b, a, b, a, b, a, b,
a, b, a, b, a, b, a, b,
};
*(vnx32di *) out = v;
}
Before this patch:
vslide1down.vx (x31 times)
After this patch:
li a5,-1431654400
addi a5,a5,-1365
li a3,-1431654400
addi a3,a3,-1366
slli a5,a5,32
add a5,a5,a3
vsetvli a4,zero,e64,m8,ta,ma
vmv.v.x v8,a0
vmv.s.x v0,a5
vmerge.vxm v8,v8,a1,v0
vs8r.v v8,0(a2)
Since we don't have SEW = 128 in vec_duplicate, we can't combine a and b into
a SEW = 128 element and then broadcast this big element.
Signed-off-by: Pan Li <pan2.li@intel.com>
Co-authored-by: Juzhe-Zhong <juzhe.zhong@rivai.ai>
gcc/ChangeLog:
* config/riscv/riscv-protos.h (enum insn_type): New type.
* config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro.
(rvv_builder::can_duplicate_repeating_sequence_p): Align the referenced
class member.
(rvv_builder::get_merged_repeating_sequence): Ditto.
(rvv_builder::repeating_sequence_use_merge_profitable_p): New function
to evaluate the optimization cost.
(rvv_builder::get_merge_scalar_mask): New function to get the merge
mask.
(emit_scalar_move_insn): New function to emit vmv.s.x.
(emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x.
(emit_nonvlmax_integer_move_insn): New function to emit nonvlmax
vmv.v.x.
(get_repeating_sequence_dup_machine_mode): New function to get the dup
machine mode.
(expand_vector_init_merge_repeating_sequence): New function to perform
the optimization.
(expand_vec_init): Add this vector init optimization.
* config/riscv/riscv.h (BITS_PER_WORD): New macro.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test.
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test.
Signed-off-by: Pan Li <pan2.li@intel.com>
---
gcc/config/riscv/riscv-protos.h | 1 +
gcc/config/riscv/riscv-v.cc | 225 +++++++++++++++++-
gcc/config/riscv/riscv.h | 1 +
.../vls-vlmax/init-repeat-sequence-1.c | 21 ++
.../vls-vlmax/init-repeat-sequence-2.c | 24 ++
.../vls-vlmax/init-repeat-sequence-3.c | 25 ++
.../vls-vlmax/init-repeat-sequence-4.c | 15 ++
.../vls-vlmax/init-repeat-sequence-5.c | 17 ++
.../vls-vlmax/init-repeat-sequence-run-1.c | 47 ++++
.../vls-vlmax/init-repeat-sequence-run-2.c | 46 ++++
.../vls-vlmax/init-repeat-sequence-run-3.c | 41 ++++
11 files changed, 457 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c
Comments
LGTM, thanks
On Mon, May 29, 2023 at 4:54 PM Pan Li via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch would like to optimize the VLS vector initialization like
> repeating sequence. From the vslide1down to the vmerge with a simple
> cost model, aka every instruction only has 1 cost.
>
> Given code with -march=rv64gcv_zvl256b --param riscv-autovec-preference=fixed-vlmax
> typedef int64_t vnx32di __attribute__ ((vector_size (256)));
>
> __attribute__ ((noipa)) void
> f_vnx32di (int64_t a, int64_t b, int64_t *out)
> {
> vnx32di v = {
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> };
> *(vnx32di *) out = v;
> }
>
> Before this patch:
> vslide1down.vx (x31 times)
>
> After this patch:
> li a5,-1431654400
> addi a5,a5,-1365
> li a3,-1431654400
> addi a3,a3,-1366
> slli a5,a5,32
> add a5,a5,a3
> vsetvli a4,zero,e64,m8,ta,ma
> vmv.v.x v8,a0
> vmv.s.x v0,a5
> vmerge.vxm v8,v8,a1,v0
> vs8r.v v8,0(a2)
>
> Since we don't have SEW = 128 in vec_duplicate, we can't combine a and b into
> a SEW = 128 element and then broadcast this big element.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> Co-Authored by: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-protos.h (enum insn_type): New type.
> * config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro.
> (rvv_builder::can_duplicate_repeating_sequence_p): Align the referenced
> class member.
> (rvv_builder::get_merged_repeating_sequence): Ditto.
> (rvv_builder::repeating_sequence_use_merge_profitable_p): New function
> to evaluate the optimization cost.
> (rvv_builder::get_merge_scalar_mask): New function to get the merge
> mask.
> (emit_scalar_move_insn): New function to emit vmv.s.x.
> (emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x.
> (emit_nonvlmax_integer_move_insn): New function to emit nonvlmax
> vmv.v.x.
> (get_repeating_sequence_dup_machine_mode): New function to get the dup
> machine mode.
> (expand_vector_init_merge_repeating_sequence): New function to perform
> the optimization.
> (expand_vec_init): Add this vector init optimization.
> * config/riscv/riscv.h (BITS_PER_WORD): New macro.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
> gcc/config/riscv/riscv-protos.h | 1 +
> gcc/config/riscv/riscv-v.cc | 225 +++++++++++++++++-
> gcc/config/riscv/riscv.h | 1 +
> .../vls-vlmax/init-repeat-sequence-1.c | 21 ++
> .../vls-vlmax/init-repeat-sequence-2.c | 24 ++
> .../vls-vlmax/init-repeat-sequence-3.c | 25 ++
> .../vls-vlmax/init-repeat-sequence-4.c | 15 ++
> .../vls-vlmax/init-repeat-sequence-5.c | 17 ++
> .../vls-vlmax/init-repeat-sequence-run-1.c | 47 ++++
> .../vls-vlmax/init-repeat-sequence-run-2.c | 46 ++++
> .../vls-vlmax/init-repeat-sequence-run-3.c | 41 ++++
> 11 files changed, 457 insertions(+), 6 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 0462f96c8d5..277845673d4 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -142,6 +142,7 @@ enum insn_type
> RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */
> RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */
> RVV_TERNOP = 5,
> + RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. */
> };
> enum vlmul_type
> {
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index a5715bb466c..8c920532549 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -21,6 +21,10 @@
>
> #define IN_TARGET_CODE 1
>
> +/* We have a maximum of 11 operands for RVV instruction patterns according to
> + the vector.md. */
> +#define RVV_INSN_OPERANDS_MAX 11
> +
> #include "config.h"
> #include "system.h"
> #include "coretypes.h"
> @@ -1286,19 +1290,32 @@ public:
> : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
> {
> m_inner_mode = GET_MODE_INNER (mode);
> - m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
> + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
> + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
> +
> + gcc_assert (
> + int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
> }
>
> bool can_duplicate_repeating_sequence_p ();
> rtx get_merged_repeating_sequence ();
>
> + bool repeating_sequence_use_merge_profitable_p ();
> + rtx get_merge_scalar_mask (unsigned int) const;
> +
> machine_mode new_mode () const { return m_new_mode; }
> + scalar_mode inner_mode () const { return m_inner_mode; }
> + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
> + unsigned int inner_bits_size () const { return m_inner_bits_size; }
> + unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
>
> private:
> - machine_mode m_inner_mode;
> + scalar_mode m_inner_mode;
> + scalar_int_mode m_inner_int_mode;
> machine_mode m_new_mode;
> scalar_int_mode m_new_inner_mode;
> - unsigned int m_inner_size;
> + unsigned int m_inner_bits_size;
> + unsigned int m_inner_bytes_size;
> };
>
> /* Return true if the vector duplicated by a super element which is the fusion
> @@ -1309,7 +1326,7 @@ bool
> rvv_builder::can_duplicate_repeating_sequence_p ()
> {
> poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
> - unsigned int new_inner_size = m_inner_size * npatterns ();
> + unsigned int new_inner_size = m_inner_bits_size * npatterns ();
> if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
> || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
> || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
> @@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p ()
> return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
> }
>
> +/* Return true if it is a repeating sequence that using
> + merge approach has better codegen than using default
> + approach (slide1down).
> +
> + Sequence A:
> + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> +
> + nelts = 16
> + npatterns = 2
> +
> + for merging a we need mask 101010....
> + for merging b we need mask 010101....
> +
> + Foreach element in the npattern, we need to build a mask in scalar register.
> + Mostely we need 3 instructions (aka COST = 3), which is consist of 2 scalar
> + instruction and 1 scalar move to v0 register. Finally we need vector merge
> + to merge them.
> +
> + lui a5, #imm
> + add a5, #imm
> + vmov.s.x v0, a5
> + vmerge.vxm v9, v9, a1, v0
> +
> + So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
> + If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
> + So return true in this case as it is profitable.
> +
> + Sequence B:
> + {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
> +
> + nelts = 16
> + npatterns = 8
> +
> + COST of merge approach = (3 + 1) * npatterns = 24
> + COST of slide1down approach = nelts = 16
> + Return false in this case as it is NOT profitable in merge approach.
> +*/
> +bool
> +rvv_builder::repeating_sequence_use_merge_profitable_p ()
> +{
> + if (inner_bytes_size () > UNITS_PER_WORD)
> + return false;
> +
> + unsigned int nelts = full_nelts ().to_constant ();
> +
> + if (!repeating_sequence_p (0, nelts, npatterns ()))
> + return false;
> +
> + unsigned int merge_cost = 1;
> + unsigned int build_merge_mask_cost = 3;
> + unsigned int slide1down_cost = nelts;
> +
> + return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
> +}
> +
> /* Merge the repeating sequence into a single element and return the RTX. */
> rtx
> rvv_builder::get_merged_repeating_sequence ()
> @@ -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence ()
> scalar_int_mode mode = Pmode;
> rtx target = gen_reg_rtx (mode);
> emit_move_insn (target, const0_rtx);
> - rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode);
> + rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
> /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
> for (unsigned int i = 0; i < npatterns (); i++)
> {
> - unsigned int loc = m_inner_size * i;
> + unsigned int loc = m_inner_bits_size * i;
> rtx shift = gen_int_mode (loc, mode);
> rtx ele = gen_lowpart (mode, elt (i));
> rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
> @@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence ()
> return target;
> }
>
> +/* Get the mask for merge approach.
> +
> + Consider such following case:
> + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> + To merge "a", the mask should be 1010....
> + To merge "b", the mask should be 0101....
> +*/
> +rtx
> +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
> +{
> + unsigned HOST_WIDE_INT mask = 0;
> + unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
> +
> + gcc_assert (BITS_PER_WORD % npatterns () == 0);
> +
> + int limit = BITS_PER_WORD / npatterns ();
> +
> + for (int i = 0; i < limit; i++)
> + mask |= base_mask << (i * npatterns ());
> +
> + return gen_int_mode (mask, inner_int_mode ());
> +}
> +
> /* Subroutine of riscv_vector_expand_vector_init.
> Works as follows:
> (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
> @@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
> }
> }
>
> +/* Emit vmv.s.x instruction. */
> +
> +static void
> +emit_scalar_move_insn (unsigned icode, rtx *ops)
> +{
> + machine_mode data_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (data_mode).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ false,
> + /* USE_REAL_MERGE_P */ true,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ false,
> + data_mode, mask_mode);
> + e.set_policy (TAIL_ANY);
> + e.set_policy (MASK_ANY);
> + e.set_vl (CONST1_RTX (Pmode));
> + e.emit_insn ((enum insn_code) icode, ops);
> +}
> +
> +/* Emit vmv.v.x instruction with vlmax. */
> +
> +static void
> +emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl)
> +{
> + emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl);
> +}
> +
> +/* Emit vmv.v.x instruction with nonvlmax. */
> +
> +static void
> +emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl)
> +{
> + emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl);
> +}
> +
> +/* Emit merge instruction. */
> +
> +static machine_mode
> +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
> +{
> + poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
> +
> + if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
> + {
> + dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
> + builder.inner_bytes_size ());
> + }
> +
> + return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
> +}
> +
> +/* Use merge approach to initialize the vector with repeating sequence.
> + v = {a, b, a, b, a, b, a, b}.
> +
> + v = broadcast (a).
> + mask = 0b01010101....
> + v = merge (v, b, mask)
> +*/
> +static void
> +expand_vector_init_merge_repeating_sequence (rtx target,
> + const rvv_builder &builder)
> +{
> + machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder);
> + machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
> + machine_mode mask_mode = get_mask_mode (builder.mode ()).require ();
> + uint64_t full_nelts = builder.full_nelts ().to_constant ();
> +
> + /* Step 1: Broadcast the first pattern. */
> + rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
> + emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
> + ops, NULL_RTX);
> +
> + /* Step 2: Merge the rest iteration of pattern. */
> + for (unsigned int i = 1; i < builder.npatterns (); i++)
> + {
> + /* Step 2-1: Generate mask register v0 for each merge. */
> + rtx merge_mask = builder.get_merge_scalar_mask (i);
> + rtx mask = gen_reg_rtx (mask_mode);
> + rtx dup = gen_reg_rtx (dup_mode);
> +
> + if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */
> + {
> + rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
> + RVV_VUNDEF (dup_mode), merge_mask};
> + emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
> + ops);
> + }
> + else /* vmv.v.x. */
> + {
> + rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
> + rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode);
> + emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
> + ops, vl);
> + }
> +
> + emit_move_insn (mask, gen_lowpart (mask_mode, dup));
> +
> + /* Step 2-2: Merge pattern according to the mask. */
> + rtx ops[] = {target, target, builder.elt (i), mask};
> + emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
> + riscv_vector::RVV_MERGE_OP, ops);
> + }
> +}
> +
> /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
>
> void
> @@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals)
> emit_move_insn (target, gen_lowpart (mode, dup));
> return;
> }
> +
> + /* Case 2: Optimize repeating sequence cases that Case 1 can
> + not handle and it is profitable. For example:
> + ELEMENT BITSIZE = 64.
> + v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
> + We can't find a vector mode for "ab" which will be combined into
> + 128-bit element to duplicate. */
> + if (v.repeating_sequence_use_merge_profitable_p ())
> + {
> + expand_vector_init_merge_repeating_sequence (target, v);
> + return;
> + }
> +
> /* TODO: We will support more Initialization of vector in the future. */
> }
>
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h
> index 807b0bccc18..4541255a8ae 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -150,6 +150,7 @@ ASM_MISA_SPEC
>
> /* Width of a word, in units (bytes). */
> #define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4)
> +#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD)
> #ifndef IN_LIBGCC2
> #define MIN_UNITS_PER_WORD 4
> #endif
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c
> new file mode 100644
> index 00000000000..59ad49cf795
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx16di __attribute__ ((vector_size (1024)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t *out)
> +{
> + vnx16di v = {
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + };
> + *(vnx16di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c
> new file mode 100644
> index 00000000000..fe3741e3be7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef double vnx8df __attribute__ ((vector_size (64)));
> +typedef double vnx16df __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8df (double a, double b, double *out)
> +{
> + vnx8df v = {a, b, a, b, a, b, a, b};
> + *(vnx8df *) out = v;
> +}
> +
> +__attribute__ ((noipa)) void
> +f_vnx16df (double a, double b, double *out)
> +{
> + vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
> + *(vnx16df *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c
> new file mode 100644
> index 00000000000..74776def963
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx8di __attribute__ ((vector_size (64)));
> +typedef int64_t vnx16di __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8di (int64_t a, int64_t b, int64_t *out)
> +{
> + vnx8di v = {a, b, a, b, a, b, a, b};
> + *(vnx8di *) out = v;
> +}
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t *out)
> +{
> + vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
> + *(vnx16di *) out = v;
> +}
> +
> +
> +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c
> new file mode 100644
> index 00000000000..2f61465e84f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx8di __attribute__ ((vector_size (64)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
> +{
> + vnx8di v = {a, b, c, d, a, b, c, d};
> + *(vnx8di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c
> new file mode 100644
> index 00000000000..7f4e6783f8e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx16di __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
> +{
> + vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,};
> + *(vnx16di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times {vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */
> +/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c
> new file mode 100644
> index 00000000000..1931d3f5fa0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
> +
> +#include "init-repeat-sequence-2.c"
> +
> +int
> +main ()
> +{
> + double a = -1789089.23423;
> + double b = -8916156.45644;
> +
> + double v_vnx8df[sizeof (vnx8df) / sizeof (double)];
> + f_vnx8df (a, b, v_vnx8df);
> +
> + return 0;
> + for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx8df[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx8df[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + double v_vnx16df[sizeof (vnx16df) / sizeof (double)];
> + f_vnx16df (a, b, v_vnx16df);
> + for (int i = 0; i < sizeof (vnx16df) / sizeof (double); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx16df[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16df[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c
> new file mode 100644
> index 00000000000..5564dd4a05a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
> +
> +#include "init-repeat-sequence-3.c"
> +
> +int
> +main ()
> +{
> + int64_t a = -178908923423;
> + int64_t b = -891615645644;
> +
> + int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)];
> + f_vnx8di (a, b, v_vnx8di);
> + for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx8di[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx8di[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
> + f_vnx16di (a, b, v_vnx16di);
> +
> + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx16di[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16di[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c
> new file mode 100644
> index 00000000000..fec5adc56de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
> +
> +#include "init-repeat-sequence-5.c"
> +
> +int
> +main ()
> +{
> + int64_t a = -178908923423;
> + int64_t b = -891615645644;
> + int64_t c = 78908923423;
> + int64_t d = 81615645644;
> +
> + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
> + f_vnx16di (a, b, c, d, v_vnx16di);
> + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
> + {
> + if (i % 4 == 0)
> + {
> + if (v_vnx16di[i] != a)
> + __builtin_abort ();
> + }
> + else if (i % 4 == 1)
> + {
> + if (v_vnx16di[i] != b)
> + __builtin_abort ();
> + }
> + else if (i % 4 == 2)
> + {
> + if (v_vnx16di[i] != c)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16di[i] != d)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> --
> 2.34.1
>
Committed, thanks Kito.
Pan
-----Original Message-----
From: Kito Cheng <kito.cheng@gmail.com>
Sent: Monday, May 29, 2023 5:33 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@sifive.com; Wang, Yanzhang <yanzhang.wang@intel.com>
Subject: Re: [PATCH v7] RISC-V: Using merge approach to optimize repeating sequence in vec_init
LGTM, thanks
On Mon, May 29, 2023 at 4:54 PM Pan Li via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch would like to optimize the VLS vector initialization like
> repeating sequence. From the vslide1down to the vmerge with a simple
> cost model, aka every instruction only has 1 cost.
>
> Given code with -march=rv64gcv_zvl256b --param
> riscv-autovec-preference=fixed-vlmax
> typedef int64_t vnx32di __attribute__ ((vector_size (256)));
>
> __attribute__ ((noipa)) void
> f_vnx32di (int64_t a, int64_t b, int64_t *out) {
> vnx32di v = {
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> a, b, a, b, a, b, a, b,
> };
> *(vnx32di *) out = v;
> }
>
> Before this patch:
> vslide1down.vx (x31 times)
>
> After this patch:
> li a5,-1431654400
> addi a5,a5,-1365
> li a3,-1431654400
> addi a3,a3,-1366
> slli a5,a5,32
> add a5,a5,a3
> vsetvli a4,zero,e64,m8,ta,ma
> vmv.v.x v8,a0
> vmv.s.x v0,a5
> vmerge.vxm v8,v8,a1,v0
> vs8r.v v8,0(a2)
>
> Since we don't have SEW = 128 in vec_duplicate, we can't combine a and b
> into a SEW = 128 element and then broadcast this big element.
>
> Signed-off-by: Pan Li <pan2.li@intel.com> Co-Authored by: Juzhe-Zhong
> <juzhe.zhong@rivai.ai>
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-protos.h (enum insn_type): New type.
> * config/riscv/riscv-v.cc (RVV_INSN_OPERANDS_MAX): New macro.
> (rvv_builder::can_duplicate_repeating_sequence_p): Align the referenced
> class member.
> (rvv_builder::get_merged_repeating_sequence): Ditto.
> (rvv_builder::repeating_sequence_use_merge_profitable_p): New function
> to evaluate the optimization cost.
> (rvv_builder::get_merge_scalar_mask): New function to get the merge
> mask.
> (emit_scalar_move_insn): New function to emit vmv.s.x.
> (emit_vlmax_integer_move_insn): New function to emit vlmax vmv.v.x.
> (emit_nonvlmax_integer_move_insn): New function to emit nonvlmax
> vmv.v.x.
> (get_repeating_sequence_dup_machine_mode): New function to get the dup
> machine mode.
> (expand_vector_init_merge_repeating_sequence): New function to perform
> the optimization.
> (expand_vec_init): Add this vector init optimization.
> * config/riscv/riscv.h (BITS_PER_WORD): New macro.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-1.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-2.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-3.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-4.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-5.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-1.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-2.c: New test.
> * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-run-3.c: New test.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
> gcc/config/riscv/riscv-protos.h | 1 +
> gcc/config/riscv/riscv-v.cc | 225 +++++++++++++++++-
> gcc/config/riscv/riscv.h | 1 +
> .../vls-vlmax/init-repeat-sequence-1.c | 21 ++
> .../vls-vlmax/init-repeat-sequence-2.c | 24 ++
> .../vls-vlmax/init-repeat-sequence-3.c | 25 ++
> .../vls-vlmax/init-repeat-sequence-4.c | 15 ++
> .../vls-vlmax/init-repeat-sequence-5.c | 17 ++
> .../vls-vlmax/init-repeat-sequence-run-1.c | 47 ++++
> .../vls-vlmax/init-repeat-sequence-run-2.c | 46 ++++
> .../vls-vlmax/init-repeat-sequence-run-3.c | 41 ++++
> 11 files changed, 457 insertions(+), 6 deletions(-) create mode
> 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-1.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-2.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-3.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-4.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-5.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-run-1.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-run-2.c create mode 100644
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seque
> nce-run-3.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h
> b/gcc/config/riscv/riscv-protos.h index 0462f96c8d5..277845673d4
> 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -142,6 +142,7 @@ enum insn_type
> RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */
> RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */
> RVV_TERNOP = 5,
> + RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. */
> };
> enum vlmul_type
> {
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index a5715bb466c..8c920532549 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -21,6 +21,10 @@
>
> #define IN_TARGET_CODE 1
>
> +/* We have a maximum of 11 operands for RVV instruction patterns according to
> + the vector.md. */
> +#define RVV_INSN_OPERANDS_MAX 11
> +
> #include "config.h"
> #include "system.h"
> #include "coretypes.h"
> @@ -1286,19 +1290,32 @@ public:
> : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
> {
> m_inner_mode = GET_MODE_INNER (mode);
> - m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
> + m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
> + m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
> +
> + gcc_assert (
> + int_mode_for_size (inner_bits_size (), 0).exists
> + (&m_inner_int_mode));
> }
>
> bool can_duplicate_repeating_sequence_p ();
> rtx get_merged_repeating_sequence ();
>
> + bool repeating_sequence_use_merge_profitable_p (); rtx
> + get_merge_scalar_mask (unsigned int) const;
> +
> machine_mode new_mode () const { return m_new_mode; }
> + scalar_mode inner_mode () const { return m_inner_mode; }
> + scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
> + unsigned int inner_bits_size () const { return m_inner_bits_size; }
> + unsigned int inner_bytes_size () const { return m_inner_bytes_size;
> + }
>
> private:
> - machine_mode m_inner_mode;
> + scalar_mode m_inner_mode;
> + scalar_int_mode m_inner_int_mode;
> machine_mode m_new_mode;
> scalar_int_mode m_new_inner_mode;
> - unsigned int m_inner_size;
> + unsigned int m_inner_bits_size;
> + unsigned int m_inner_bytes_size;
> };
>
> /* Return true if the vector duplicated by a super element which is
> the fusion @@ -1309,7 +1326,7 @@ bool
> rvv_builder::can_duplicate_repeating_sequence_p () {
> poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
> - unsigned int new_inner_size = m_inner_size * npatterns ();
> + unsigned int new_inner_size = m_inner_bits_size * npatterns ();
> if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
> || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
> || !get_vector_mode (m_new_inner_mode, new_size).exists
> (&m_new_mode)) @@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p ()
> return repeating_sequence_p (0, full_nelts ().to_constant (),
> npatterns ()); }
>
> +/* Return true if it is a repeating sequence that using
> + merge approach has better codegen than using default
> + approach (slide1down).
> +
> + Sequence A:
> + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> +
> + nelts = 16
> + npatterns = 2
> +
> + for merging a we need mask 101010....
> + for merging b we need mask 010101....
> +
> + For each element in the npattern, we need to build a mask in scalar register.
> + Mostly we need 3 instructions (aka COST = 3), which consists of 2 scalar
> + instructions and 1 scalar move to v0 register. Finally we need vector merge
> + to merge them.
> +
> + lui a5, #imm
> + add a5, #imm
> + vmv.s.x v0, a5
> + vmerge.vxm v9, v9, a1, v0
> +
> + So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
> + If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
> + So return true in this case as it is profitable.
> +
> + Sequence B:
> + {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
> +
> + nelts = 16
> + npatterns = 8
> +
> + COST of merge approach = (3 + 1) * npatterns = 32
> + COST of slide1down approach = nelts = 16
> + Return false in this case as it is NOT profitable in merge approach.
> +*/
> +bool
> +rvv_builder::repeating_sequence_use_merge_profitable_p () {
> + if (inner_bytes_size () > UNITS_PER_WORD)
> + return false;
> +
> + unsigned int nelts = full_nelts ().to_constant ();
> +
> + if (!repeating_sequence_p (0, nelts, npatterns ()))
> + return false;
> +
> + unsigned int merge_cost = 1;
> + unsigned int build_merge_mask_cost = 3; unsigned int
> + slide1down_cost = nelts;
> +
> + return (build_merge_mask_cost + merge_cost) * npatterns () <
> +slide1down_cost; }
> +
> /* Merge the repeating sequence into a single element and return the
> RTX. */ rtx rvv_builder::get_merged_repeating_sequence () @@
> -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence ()
> scalar_int_mode mode = Pmode;
> rtx target = gen_reg_rtx (mode);
> emit_move_insn (target, const0_rtx);
> - rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode);
> + rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
> /* { a, b, a, b }: Generate duplicate element = b << bits | a. */
> for (unsigned int i = 0; i < npatterns (); i++)
> {
> - unsigned int loc = m_inner_size * i;
> + unsigned int loc = m_inner_bits_size * i;
> rtx shift = gen_int_mode (loc, mode);
> rtx ele = gen_lowpart (mode, elt (i));
> rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX,
> false, @@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence ()
> return target;
> }
>
> +/* Get the mask for merge approach.
> +
> + Consider such following case:
> + {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
> + To merge "a", the mask should be 1010....
> + To merge "b", the mask should be 0101....
> +*/
> +rtx
> +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern)
> +const {
> + unsigned HOST_WIDE_INT mask = 0;
> + unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
> +
> + gcc_assert (BITS_PER_WORD % npatterns () == 0);
> +
> + int limit = BITS_PER_WORD / npatterns ();
> +
> + for (int i = 0; i < limit; i++)
> + mask |= base_mask << (i * npatterns ());
> +
> + return gen_int_mode (mask, inner_int_mode ()); }
> +
> /* Subroutine of riscv_vector_expand_vector_init.
> Works as follows:
> (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
> @@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
> }
> }
>
> +/* Emit vmv.s.x instruction. */
> +
> +static void
> +emit_scalar_move_insn (unsigned icode, rtx *ops) {
> + machine_mode data_mode = GET_MODE (ops[0]);
> + machine_mode mask_mode = get_mask_mode (data_mode).require ();
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
> + /* HAS_DEST_P */ true,
> + /* FULLY_UNMASKED_P */ false,
> + /* USE_REAL_MERGE_P */ true,
> + /* HAS_AVL_P */ true,
> + /* VLMAX_P */ false,
> + data_mode, mask_mode);
> + e.set_policy (TAIL_ANY);
> + e.set_policy (MASK_ANY);
> + e.set_vl (CONST1_RTX (Pmode));
> + e.emit_insn ((enum insn_code) icode, ops); }
> +
> +/* Emit vmv.v.x instruction with vlmax. */
> +
> +static void
> +emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl) {
> + emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl); }
> +
> +/* Emit vmv.v.x instruction with nonvlmax. */
> +
> +static void
> +emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl) {
> + emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl); }
> +
> +/* Emit merge instruction. */
> +
> +static machine_mode
> +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
> +{
> + poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
> +
> + if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
> + {
> + dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
> + builder.inner_bytes_size ());
> + }
> +
> + return get_vector_mode (builder.inner_int_mode (),
> +dup_nunits).require (); }
> +
> +/* Use merge approach to initialize the vector with repeating sequence.
> + v = {a, b, a, b, a, b, a, b}.
> +
> + v = broadcast (a).
> + mask = 0b01010101....
> + v = merge (v, b, mask)
> +*/
> +static void
> +expand_vector_init_merge_repeating_sequence (rtx target,
> + const rvv_builder
> +&builder) {
> + machine_mode dup_mode = get_repeating_sequence_dup_machine_mode
> +(builder);
> + machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
> + machine_mode mask_mode = get_mask_mode (builder.mode ()).require
> +();
> + uint64_t full_nelts = builder.full_nelts ().to_constant ();
> +
> + /* Step 1: Broadcast the first pattern. */ rtx ops[] = {target,
> + force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
> + emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
> + ops, NULL_RTX);
> +
> + /* Step 2: Merge the rest iteration of pattern. */ for (unsigned
> + int i = 1; i < builder.npatterns (); i++)
> + {
> + /* Step 2-1: Generate mask register v0 for each merge. */
> + rtx merge_mask = builder.get_merge_scalar_mask (i);
> + rtx mask = gen_reg_rtx (mask_mode);
> + rtx dup = gen_reg_rtx (dup_mode);
> +
> + if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */
> + {
> + rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
> + RVV_VUNDEF (dup_mode), merge_mask};
> + emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
> + ops);
> + }
> + else /* vmv.v.x. */
> + {
> + rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
> + rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode);
> + emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
> + ops, vl);
> + }
> +
> + emit_move_insn (mask, gen_lowpart (mask_mode, dup));
> +
> + /* Step 2-2: Merge pattern according to the mask. */
> + rtx ops[] = {target, target, builder.elt (i), mask};
> + emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
> + riscv_vector::RVV_MERGE_OP, ops);
> + }
> +}
> +
> /* Initialize register TARGET from the elements in PARALLEL rtx VALS.
> */
>
> void
> @@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals)
> emit_move_insn (target, gen_lowpart (mode, dup));
> return;
> }
> +
> + /* Case 2: Optimize repeating sequence cases that Case 1 can
> + not handle and it is profitable. For example:
> + ELEMENT BITSIZE = 64.
> + v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
> + We can't find a vector mode for "ab" which will be combined into
> + 128-bit element to duplicate. */
> + if (v.repeating_sequence_use_merge_profitable_p ())
> + {
> + expand_vector_init_merge_repeating_sequence (target, v);
> + return;
> + }
> +
> /* TODO: We will support more Initialization of vector in the future. */
> }
>
> diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index
> 807b0bccc18..4541255a8ae 100644
> --- a/gcc/config/riscv/riscv.h
> +++ b/gcc/config/riscv/riscv.h
> @@ -150,6 +150,7 @@ ASM_MISA_SPEC
>
> /* Width of a word, in units (bytes). */ #define UNITS_PER_WORD
> (TARGET_64BIT ? 8 : 4)
> +#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD)
> #ifndef IN_LIBGCC2
> #define MIN_UNITS_PER_WORD 4
> #endif
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-1.c
> new file mode 100644
> index 00000000000..59ad49cf795
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-1.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx16di __attribute__ ((vector_size (1024)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t *out) {
> + vnx16di v = {
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
> +a, b, a, b, a, b, a, b, a, b,
> + };
> + *(vnx16di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times
> +{vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times
> +{vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-2.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-2.c
> new file mode 100644
> index 00000000000..fe3741e3be7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef double vnx8df __attribute__ ((vector_size (64))); typedef
> +double vnx16df __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8df (double a, double b, double *out) {
> + vnx8df v = {a, b, a, b, a, b, a, b};
> + *(vnx8df *) out = v;
> +}
> +
> +__attribute__ ((noipa)) void
> +f_vnx16df (double a, double b, double *out) {
> + vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
> + *(vnx16df *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times
> +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times
> +{vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-3.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-3.c
> new file mode 100644
> index 00000000000..74776def963
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-3.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx8di __attribute__ ((vector_size (64))); typedef
> +int64_t vnx16di __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8di (int64_t a, int64_t b, int64_t *out) {
> + vnx8di v = {a, b, a, b, a, b, a, b};
> + *(vnx8di *) out = v;
> +}
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t *out) {
> + vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b};
> + *(vnx16di *) out = v;
> +}
> +
> +
> +/* { dg-final { scan-assembler-times
> +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times
> +{vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-4.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-4.c
> new file mode 100644
> index 00000000000..2f61465e84f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-4.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx8di __attribute__ ((vector_size (64)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out) {
> + vnx8di v = {a, b, c, d, a, b, c, d};
> + *(vnx8di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times
> +{vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-5.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-5.c
> new file mode 100644
> index 00000000000..7f4e6783f8e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
> +
> +#include <stdint-gcc.h>
> +
> +typedef int64_t vnx16di __attribute__ ((vector_size (128)));
> +
> +__attribute__ ((noipa)) void
> +f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
> +{
> + vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,};
> + *(vnx16di *) out = v;
> +}
> +
> +/* { dg-final { scan-assembler-times
> +{vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
> +/* { dg-final { scan-assembler-times
> +{vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */
> +/* { dg-final { scan-assembler-times
> +{vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-1.c
> new file mode 100644
> index 00000000000..1931d3f5fa0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-run-1.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" }
> +*/
> +
> +#include "init-repeat-sequence-2.c"
> +
> +int
> +main ()
> +{
> + double a = -1789089.23423;
> + double b = -8916156.45644;
> +
> + double v_vnx8df[sizeof (vnx8df) / sizeof (double)]; f_vnx8df (a,
> + b, v_vnx8df);
> +
> +
> + for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx8df[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx8df[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + double v_vnx16df[sizeof (vnx16df) / sizeof (double)]; f_vnx16df
> + (a, b, v_vnx16df); for (int i = 0; i < sizeof (vnx16df) / sizeof
> + (double); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx16df[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16df[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-2.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-2.c
> new file mode 100644
> index 00000000000..5564dd4a05a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-run-2.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" }
> +*/
> +
> +#include "init-repeat-sequence-3.c"
> +
> +int
> +main ()
> +{
> + int64_t a = -178908923423;
> + int64_t b = -891615645644;
> +
> + int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)]; f_vnx8di (a,
> + b, v_vnx8di); for (int i = 0; i < sizeof (vnx8di) / sizeof
> + (int64_t); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx8di[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx8di[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; f_vnx16di
> + (a, b, v_vnx16di);
> +
> + for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
> + {
> + if (i % 2 == 0)
> + {
> + if (v_vnx16di[i] != a)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16di[i] != b)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-3.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-seq
> uence-run-3.c
> new file mode 100644
> index 00000000000..fec5adc56de
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat
> +++ -sequence-run-3.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" }
> +*/
> +
> +#include "init-repeat-sequence-5.c"
> +
> +int
> +main ()
> +{
> + int64_t a = -178908923423;
> + int64_t b = -891615645644;
> + int64_t c = 78908923423;
> + int64_t d = 81615645644;
> +
> + int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)]; f_vnx16di
> + (a, b, c, d, v_vnx16di); for (int i = 0; i < sizeof (vnx16di) /
> + sizeof (int64_t); i++)
> + {
> + if (i % 4 == 0)
> + {
> + if (v_vnx16di[i] != a)
> + __builtin_abort ();
> + }
> + else if (i % 4 == 1)
> + {
> + if (v_vnx16di[i] != b)
> + __builtin_abort ();
> + }
> + else if (i % 4 == 2)
> + {
> + if (v_vnx16di[i] != c)
> + __builtin_abort ();
> + }
> + else
> + {
> + if (v_vnx16di[i] != d)
> + __builtin_abort ();
> + }
> + }
> +
> + return 0;
> +}
> --
> 2.34.1
>
@@ -142,6 +142,7 @@ enum insn_type
RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */
RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */
RVV_TERNOP = 5,
+ RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. */
};
enum vlmul_type
{
@@ -21,6 +21,10 @@
#define IN_TARGET_CODE 1
+/* We have a maximum of 11 operands for RVV instruction patterns according to
+ the vector.md. */
+#define RVV_INSN_OPERANDS_MAX 11
+
#include "config.h"
#include "system.h"
#include "coretypes.h"
@@ -1286,19 +1290,32 @@ public:
: rtx_vector_builder (mode, npatterns, nelts_per_pattern)
{
m_inner_mode = GET_MODE_INNER (mode);
- m_inner_size = GET_MODE_BITSIZE (m_inner_mode).to_constant ();
+ m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
+ m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
+
+ gcc_assert (
+ int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
}
bool can_duplicate_repeating_sequence_p ();
rtx get_merged_repeating_sequence ();
+ bool repeating_sequence_use_merge_profitable_p ();
+ rtx get_merge_scalar_mask (unsigned int) const;
+
machine_mode new_mode () const { return m_new_mode; }
+ scalar_mode inner_mode () const { return m_inner_mode; }
+ scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
+ unsigned int inner_bits_size () const { return m_inner_bits_size; }
+ unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
private:
- machine_mode m_inner_mode;
+ scalar_mode m_inner_mode;
+ scalar_int_mode m_inner_int_mode;
machine_mode m_new_mode;
scalar_int_mode m_new_inner_mode;
- unsigned int m_inner_size;
+ unsigned int m_inner_bits_size;
+ unsigned int m_inner_bytes_size;
};
/* Return true if the vector duplicated by a super element which is the fusion
@@ -1309,7 +1326,7 @@ bool
rvv_builder::can_duplicate_repeating_sequence_p ()
{
poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
- unsigned int new_inner_size = m_inner_size * npatterns ();
+ unsigned int new_inner_size = m_inner_bits_size * npatterns ();
if (!int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
|| GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
|| !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
@@ -1317,6 +1334,61 @@ rvv_builder::can_duplicate_repeating_sequence_p ()
return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
}
+/* Return true if it is a repeating sequence that using
+ merge approach has better codegen than using default
+ approach (slide1down).
+
+ Sequence A:
+ {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
+
+ nelts = 16
+ npatterns = 2
+
+ for merging a we need mask 101010....
+ for merging b we need mask 010101....
+
+ For each element in the npattern, we need to build a mask in scalar register.
+ Mostly we need 3 instructions (aka COST = 3), which consists of 2 scalar
+ instructions and 1 scalar move to v0 register. Finally we need vector merge
+ to merge them.
+
+ lui a5, #imm
+ add a5, #imm
+ vmv.s.x v0, a5
+ vmerge.vxm v9, v9, a1, v0
+
+ So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
+ If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
+ So return true in this case as it is profitable.
+
+ Sequence B:
+ {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
+
+ nelts = 16
+ npatterns = 8
+
+ COST of merge approach = (3 + 1) * npatterns = 32
+ COST of slide1down approach = nelts = 16
+ Return false in this case as it is NOT profitable in merge approach.
+*/
+bool
+rvv_builder::repeating_sequence_use_merge_profitable_p ()
+{
+ if (inner_bytes_size () > UNITS_PER_WORD)
+ return false;
+
+ unsigned int nelts = full_nelts ().to_constant ();
+
+ if (!repeating_sequence_p (0, nelts, npatterns ()))
+ return false;
+
+ unsigned int merge_cost = 1;
+ unsigned int build_merge_mask_cost = 3;
+ unsigned int slide1down_cost = nelts;
+
+ return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
+}
+
/* Merge the repeating sequence into a single element and return the RTX. */
rtx
rvv_builder::get_merged_repeating_sequence ()
@@ -1324,11 +1396,11 @@ rvv_builder::get_merged_repeating_sequence ()
scalar_int_mode mode = Pmode;
rtx target = gen_reg_rtx (mode);
emit_move_insn (target, const0_rtx);
- rtx imm = gen_int_mode ((1ULL << m_inner_size) - 1, mode);
+ rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
/* { a, b, a, b }: Generate duplicate element = b << bits | a. */
for (unsigned int i = 0; i < npatterns (); i++)
{
- unsigned int loc = m_inner_size * i;
+ unsigned int loc = m_inner_bits_size * i;
rtx shift = gen_int_mode (loc, mode);
rtx ele = gen_lowpart (mode, elt (i));
rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
@@ -1344,6 +1416,29 @@ rvv_builder::get_merged_repeating_sequence ()
return target;
}
+/* Get the mask for merge approach.
+
+ Consider such following case:
+ {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
+ To merge "a", the mask should be 1010....
+ To merge "b", the mask should be 0101....
+*/
+rtx
+rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const
+{
+ unsigned HOST_WIDE_INT mask = 0;
+ unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
+
+ gcc_assert (BITS_PER_WORD % npatterns () == 0);
+
+ int limit = BITS_PER_WORD / npatterns ();
+
+ for (int i = 0; i < limit; i++)
+ mask |= base_mask << (i * npatterns ());
+
+ return gen_int_mode (mask, inner_int_mode ());
+}
+
/* Subroutine of riscv_vector_expand_vector_init.
Works as follows:
(a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
@@ -1371,6 +1466,111 @@ expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
}
}
+/* Emit vmv.s.x instruction. */
+
+static void
+emit_scalar_move_insn (unsigned icode, rtx *ops)
+{
+ machine_mode data_mode = GET_MODE (ops[0]);
+ machine_mode mask_mode = get_mask_mode (data_mode).require ();
+ insn_expander<RVV_INSN_OPERANDS_MAX> e (riscv_vector::RVV_SCALAR_MOV_OP,
+ /* HAS_DEST_P */ true,
+ /* FULLY_UNMASKED_P */ false,
+ /* USE_REAL_MERGE_P */ true,
+ /* HAS_AVL_P */ true,
+ /* VLMAX_P */ false,
+ data_mode, mask_mode);
+ e.set_policy (TAIL_ANY);
+ e.set_policy (MASK_ANY);
+ e.set_vl (CONST1_RTX (Pmode));
+ e.emit_insn ((enum insn_code) icode, ops);
+}
+
+/* Emit vmv.v.x instruction with vlmax. */
+
+static void
+emit_vlmax_integer_move_insn (unsigned icode, rtx *ops, rtx vl)
+{
+ emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ops, vl);
+}
+
+/* Emit vmv.v.x instruction with nonvlmax. */
+
+static void
+emit_nonvlmax_integer_move_insn (unsigned icode, rtx *ops, rtx avl)
+{
+ emit_nonvlmax_insn (icode, riscv_vector::RVV_UNOP, ops, avl);
+}
+
+/* Emit merge instruction. */
+
+static machine_mode
+get_repeating_sequence_dup_machine_mode (const rvv_builder &builder)
+{
+ poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ());
+
+ if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR))
+ {
+ dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR,
+ builder.inner_bytes_size ());
+ }
+
+ return get_vector_mode (builder.inner_int_mode (), dup_nunits).require ();
+}
+
+/* Use merge approach to initialize the vector with repeating sequence.
+ v = {a, b, a, b, a, b, a, b}.
+
+ v = broadcast (a).
+ mask = 0b01010101....
+ v = merge (v, b, mask)
+*/
+static void
+expand_vector_init_merge_repeating_sequence (rtx target,
+ const rvv_builder &builder)
+{
+ machine_mode dup_mode = get_repeating_sequence_dup_machine_mode (builder);
+ machine_mode dup_mask_mode = get_mask_mode (dup_mode).require ();
+ machine_mode mask_mode = get_mask_mode (builder.mode ()).require ();
+ uint64_t full_nelts = builder.full_nelts ().to_constant ();
+
+ /* Step 1: Broadcast the first pattern. */
+ rtx ops[] = {target, force_reg (GET_MODE_INNER (dup_mode), builder.elt (0))};
+ emit_vlmax_integer_move_insn (code_for_pred_broadcast (builder.mode ()),
+ ops, NULL_RTX);
+
+ /* Step 2: Merge the rest iteration of pattern. */
+ for (unsigned int i = 1; i < builder.npatterns (); i++)
+ {
+ /* Step 2-1: Generate mask register v0 for each merge. */
+ rtx merge_mask = builder.get_merge_scalar_mask (i);
+ rtx mask = gen_reg_rtx (mask_mode);
+ rtx dup = gen_reg_rtx (dup_mode);
+
+ if (full_nelts <= BITS_PER_WORD) /* vmv.s.x. */
+ {
+ rtx ops[] = {dup, gen_scalar_move_mask (dup_mask_mode),
+ RVV_VUNDEF (dup_mode), merge_mask};
+ emit_scalar_move_insn (code_for_pred_broadcast (GET_MODE (dup)),
+ ops);
+ }
+ else /* vmv.v.x. */
+ {
+ rtx ops[] = {dup, force_reg (GET_MODE_INNER (dup_mode), merge_mask)};
+ rtx vl = gen_int_mode (CEIL (full_nelts, BITS_PER_WORD), Pmode);
+ emit_nonvlmax_integer_move_insn (code_for_pred_broadcast (dup_mode),
+ ops, vl);
+ }
+
+ emit_move_insn (mask, gen_lowpart (mask_mode, dup));
+
+ /* Step 2-2: Merge pattern according to the mask. */
+ rtx ops[] = {target, target, builder.elt (i), mask};
+ emit_vlmax_merge_insn (code_for_pred_merge_scalar (GET_MODE (target)),
+ riscv_vector::RVV_MERGE_OP, ops);
+ }
+}
+
/* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
void
@@ -1394,6 +1594,19 @@ expand_vec_init (rtx target, rtx vals)
emit_move_insn (target, gen_lowpart (mode, dup));
return;
}
+
+ /* Case 2: Optimize repeating sequence cases that Case 1 can
+ not handle and it is profitable. For example:
+ ELEMENT BITSIZE = 64.
+ v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
+ We can't find a vector mode for "ab" which will be combined into
+ 128-bit element to duplicate. */
+ if (v.repeating_sequence_use_merge_profitable_p ())
+ {
+ expand_vector_init_merge_repeating_sequence (target, v);
+ return;
+ }
+
/* TODO: We will support more Initialization of vector in the future. */
}
@@ -150,6 +150,7 @@ ASM_MISA_SPEC
/* Width of a word, in units (bytes). */
#define UNITS_PER_WORD (TARGET_64BIT ? 8 : 4)
+#define BITS_PER_WORD (BITS_PER_UNIT * UNITS_PER_WORD)
#ifndef IN_LIBGCC2
#define MIN_UNITS_PER_WORD 4
#endif
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx128di __attribute__ ((vector_size (1024))); /* 1024 bytes = 128 x int64_t */
+
+__attribute__ ((noipa)) void
+f_vnx128di (int64_t a, int64_t b, int64_t *out)
+{
+ vnx128di v = {
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b,
+ };
+ *(vnx128di *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vmv\.v\.x\s+v[0-9]+,\s*[a-x0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef double vnx8df __attribute__ ((vector_size (64)));
+typedef double vnx16df __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx8df (double a, double b, double *out)
+{
+ vnx8df v = {a, b, a, b, a, b, a, b}; /* 2-pattern FP repeating sequence -> expect vfmerge.vfm */
+ *(vnx8df *) out = v;
+}
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx16df (double a, double b, double *out)
+{
+ vnx16df v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; /* same {a, b} pattern, 16 lanes */
+ *(vnx16df *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vfmerge\.vfm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx8di (int64_t a, int64_t b, int64_t *out)
+{
+ vnx8di v = {a, b, a, b, a, b, a, b}; /* 2-pattern integer repeating sequence -> expect vmerge.vxm */
+ *(vnx8di *) out = v;
+}
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx16di (int64_t a, int64_t b, int64_t *out)
+{
+ vnx16di v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}; /* same {a, b} pattern, 16 lanes */
+ *(vnx16di *) out = v;
+}
+
+
+/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 1 } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx8di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
+{
+ vnx8di v = {a, b, c, d, a, b, c, d}; /* 4 patterns x 2 reps: per dg-final below, merge is not used here */
+ *(vnx8di *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vslide1down\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 7 } } */
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include <stdint-gcc.h>
+
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void /* noipa: keep the function out-of-line so scan-assembler sees its codegen */
+f_vnx16di (int64_t a, int64_t b, int64_t c, int64_t d, int64_t *out)
+{
+ vnx16di v = {a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d,}; /* 4 patterns x 4 reps; dg-final expects no vmerge */
+ *(vnx16di *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vmv\.v\.x\tv[0-9]+,\s*[a-x0-9]+} 1 } } */
+/* { dg-final { scan-assembler-times {vmv\.s\.x\tv[0-9]+,\s*[a-x0-9]+} 0 } } */
+/* { dg-final { scan-assembler-times {vmerge\.vxm\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+,\s*v0} 0 } } */
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "init-repeat-sequence-2.c"
+
+int
+main ()
+{
+ double a = -1789089.23423;
+ double b = -8916156.45644;
+
+ double v_vnx8df[sizeof (vnx8df) / sizeof (double)];
+ f_vnx8df (a, b, v_vnx8df);
+
+ /* Check even lanes == a, odd lanes == b.  (A stray "return 0;" here made every check below dead code.)  */
+ for (int i = 0; i < sizeof (vnx8df) / sizeof (double); i++)
+ {
+ if (i % 2 == 0)
+ {
+ if (v_vnx8df[i] != a)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (v_vnx8df[i] != b)
+ __builtin_abort ();
+ }
+ }
+
+ double v_vnx16df[sizeof (vnx16df) / sizeof (double)];
+ f_vnx16df (a, b, v_vnx16df);
+ for (int i = 0; i < sizeof (vnx16df) / sizeof (double); i++)
+ {
+ if (i % 2 == 0)
+ {
+ if (v_vnx16df[i] != a)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (v_vnx16df[i] != b)
+ __builtin_abort ();
+ }
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "init-repeat-sequence-3.c"
+
+int
+main ()
+{
+ int64_t a = -178908923423;
+ int64_t b = -891615645644;
+
+ int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)];
+ f_vnx8di (a, b, v_vnx8di); /* lanes must come back as {a, b, a, b, ...} */
+ for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++)
+ {
+ if (i % 2 == 0)
+ {
+ if (v_vnx8di[i] != a) /* even lane must be a */
+ __builtin_abort ();
+ }
+ else
+ {
+ if (v_vnx8di[i] != b) /* odd lane must be b */
+ __builtin_abort ();
+ }
+ }
+
+ int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
+ f_vnx16di (a, b, v_vnx16di); /* same check on the 16-element variant */
+
+ for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
+ {
+ if (i % 2 == 0)
+ {
+ if (v_vnx16di[i] != a)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (v_vnx16di[i] != b)
+ __builtin_abort ();
+ }
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "init-repeat-sequence-5.c"
+
+int
+main ()
+{
+ int64_t a = -178908923423;
+ int64_t b = -891615645644;
+ int64_t c = 78908923423;
+ int64_t d = 81615645644;
+
+ int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
+ f_vnx16di (a, b, c, d, v_vnx16di); /* lanes must come back as {a, b, c, d} repeated */
+ for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
+ {
+ if (i % 4 == 0)
+ {
+ if (v_vnx16di[i] != a)
+ __builtin_abort ();
+ }
+ else if (i % 4 == 1)
+ {
+ if (v_vnx16di[i] != b)
+ __builtin_abort ();
+ }
+ else if (i % 4 == 2)
+ {
+ if (v_vnx16di[i] != c)
+ __builtin_abort ();
+ }
+ else
+ {
+ if (v_vnx16di[i] != d)
+ __builtin_abort ();
+ }
+ }
+
+ return 0;
+}