@@ -272,6 +272,42 @@
DONE;
})
+;; =========================================================================
+;; == Strided Load/Store
+;; =========================================================================
+
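+;; Strided loads are emitted as vlse<EEW>.v.  Operand 0 is the destination
+;; vector, operand 1 the scalar base address, operand 2 the scalar stride,
+;; operand 3 the mask, operand 4 the length, operand 5 must be const 0.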
+(define_expand "mask_len_strided_load_<mode>"
+ [(match_operand:V 0 "register_operand")
+ (match_operand 1 "pmode_reg_or_0_operand")
+ (match_operand 2 "pmode_reg_or_0_operand")
+ (match_operand:<VM> 3 "vector_mask_operand")
+ (match_operand 4 "autovec_length_operand")
+ (match_operand 5 "const_0_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_strided_load_store (<MODE>mode, operands, true);
+ DONE;
+})
+
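+;; Strided stores are emitted as vsse<EEW>.v.  Operand 0 is the scalar base
+;; address, operand 1 the scalar stride, operand 2 the source vector,
+;; operand 3 the mask, operand 4 the length, operand 5 must be const 0.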
+(define_expand "mask_len_strided_store_<mode>"
+ [(match_operand 0 "pmode_reg_or_0_operand")
+ (match_operand 1 "pmode_reg_or_0_operand")
+ (match_operand:V 2 "register_operand")
+ (match_operand:<VM> 3 "vector_mask_operand")
+ (match_operand 4 "autovec_length_operand")
+ (match_operand 5 "const_0_operand")]
+ "TARGET_VECTOR"
+{
+ riscv_vector::expand_strided_load_store (<MODE>mode, operands, false);
+ DONE;
+})
+
;; =========================================================================
;; == Array Load/Store
;; =========================================================================
@@ -546,6 +546,7 @@ void expand_vec_perm (rtx, rtx, rtx, rtx);
void expand_select_vl (rtx *);
void expand_load_store (rtx *, bool);
void expand_gather_scatter (rtx *, bool);
+void expand_strided_load_store (machine_mode, rtx *, bool);
void expand_cond_len_ternop (unsigned, rtx *);
void prepare_ternary_operands (rtx *);
void expand_lanes_load_store (rtx *, bool);
@@ -3608,6 +3608,66 @@ expand_gather_scatter (rtx *ops, bool is_load)
}
}
+/* Expand MASK_LEN_STRIDED_{LOAD,STORE}. */
+void
+expand_strided_load_store (machine_mode mode, rtx *ops, bool is_load)
+{
+ rtx ptr, stride, vec_reg;
+ rtx mask = ops[3];
+ rtx len = ops[4];
+ poly_int64 value;
+ if (is_load)
+ {
+ vec_reg = ops[0];
+ ptr = ops[1];
+ stride = ops[2];
+ }
+ else
+ {
+ vec_reg = ops[2];
+ ptr = ops[0];
+ stride = ops[1];
+ }
+
+ if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)))
+ {
+      /* If the length operand equals VF, this is a VLMAX load/store.  */
+ if (is_load)
+ {
+ rtx m_ops[] = {vec_reg, mask, gen_rtx_MEM (mode, ptr), stride};
+ emit_vlmax_insn (code_for_pred_strided_load (mode), BINARY_OP_TAMA,
+ m_ops);
+ }
+ else
+ {
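+          /* Compute VLMAX into a register and emit the predicated
+             strided store directly.  */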
+ len = gen_reg_rtx (Pmode);
+ emit_vlmax_vsetvl (mode, len);
+ emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, ptr),
+ mask, stride, vec_reg, len,
+ get_avl_type_rtx (VLMAX)));
+ }
+ }
+ else
+ {
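+      /* Force the length into a register unless it is a small constant
+         that can be used directly as an immediate AVL.  */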
+ if (!satisfies_constraint_K (len))
+ len = force_reg (Pmode, len);
+ if (is_load)
+ {
+ rtx m_ops[] = {vec_reg, mask, gen_rtx_MEM (mode, ptr), stride};
+ emit_nonvlmax_insn (code_for_pred_strided_load (mode), BINARY_OP_TAMA,
+ m_ops, len);
+ }
+ else
+ emit_insn (gen_pred_strided_store (mode, gen_rtx_MEM (mode, ptr), mask,
+ stride, vec_reg, len,
+ get_avl_type_rtx (NONVLMAX)));
+ }
+}
+
/* Expand COND_LEN_*. */
void
expand_cond_len_ternop (unsigned icode, rtx *ops)
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */
+
+#include <stdint-gcc.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS stride, DATA_TYPE *restrict cond, \
+ INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ if (cond[i * stride]) \
+ dest[i] += src[i * stride]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int8_t) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, int16_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 132 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */
new file mode 100644
@@ -0,0 +1,97 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-mcmodel=medany" } */
+
+#include "mask_strided_load-1.c"
+#include <assert.h>
+
+int
+main (void)
+{
+  /* FIXME: The purpose of this assembly is to ensure that the vtype register
+     is initialized before instructions such as vmv1r.v are executed.
+     Otherwise you will get illegal instruction errors when running with
+     spike+pk.  This is an interim solution to reduce unnecessary failures;
+     a unified solution will come later.  */
+ asm volatile("vsetivli x0, 0, e8, m1, ta, ma");
+#define RUN_LOOP(DATA_TYPE, BITS) \
+ DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE cond_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \
+ INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \
+ for (INDEX##BITS i = 0; \
+ i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ dest_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ dest2_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ src_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \
+ cond_##DATA_TYPE##_##BITS[i] = (DATA_TYPE) (i & 1); \
+ } \
+ f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \
+ stride_##DATA_TYPE##_##BITS, \
+ cond_##DATA_TYPE##_##BITS, n_##DATA_TYPE##_##BITS); \
+ for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ if (cond_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]) \
+ assert ( \
+ dest_##DATA_TYPE##_##BITS[i] \
+ == (dest2_##DATA_TYPE##_##BITS[i] \
+ + src_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS])); \
+ else \
+ assert (dest_##DATA_TYPE##_##BITS[i] \
+ == dest2_##DATA_TYPE##_##BITS[i]); \
+ }
+
+ RUN_LOOP (int8_t, 8)
+ RUN_LOOP (uint8_t, 8)
+ RUN_LOOP (int16_t, 8)
+ RUN_LOOP (uint16_t, 8)
+ RUN_LOOP (_Float16, 8)
+ RUN_LOOP (int32_t, 8)
+ RUN_LOOP (uint32_t, 8)
+ RUN_LOOP (float, 8)
+ RUN_LOOP (int64_t, 8)
+ RUN_LOOP (uint64_t, 8)
+ RUN_LOOP (double, 8)
+
+ RUN_LOOP (int8_t, 16)
+ RUN_LOOP (uint8_t, 16)
+ RUN_LOOP (int16_t, 16)
+ RUN_LOOP (uint16_t, 16)
+ RUN_LOOP (_Float16, 16)
+ RUN_LOOP (int32_t, 16)
+ RUN_LOOP (uint32_t, 16)
+ RUN_LOOP (float, 16)
+ RUN_LOOP (int64_t, 16)
+ RUN_LOOP (uint64_t, 16)
+ RUN_LOOP (double, 16)
+
+ RUN_LOOP (int8_t, 32)
+ RUN_LOOP (uint8_t, 32)
+ RUN_LOOP (int16_t, 32)
+ RUN_LOOP (uint16_t, 32)
+ RUN_LOOP (_Float16, 32)
+ RUN_LOOP (int32_t, 32)
+ RUN_LOOP (uint32_t, 32)
+ RUN_LOOP (float, 32)
+ RUN_LOOP (int64_t, 32)
+ RUN_LOOP (uint64_t, 32)
+ RUN_LOOP (double, 32)
+
+ RUN_LOOP (int8_t, 64)
+ RUN_LOOP (uint8_t, 64)
+ RUN_LOOP (int16_t, 64)
+ RUN_LOOP (uint16_t, 64)
+ RUN_LOOP (_Float16, 64)
+ RUN_LOOP (int32_t, 64)
+ RUN_LOOP (uint32_t, 64)
+ RUN_LOOP (float, 64)
+ RUN_LOOP (int64_t, 64)
+ RUN_LOOP (uint64_t, 64)
+ RUN_LOOP (double, 64)
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */
+
+#include <stdint-gcc.h>
+
+#ifndef INDEX8
+#define INDEX8 int8_t
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#define INDEX64 int64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS stride, DATA_TYPE *restrict cond, \
+ INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ if (cond[i * stride]) \
+ dest[i * stride] = src[i] + BITS; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int8_t) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, int16_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 66 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 66 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */
new file mode 100644
@@ -0,0 +1,89 @@
+/* { dg-do run { target { riscv_v } } } */
+
+#include "mask_strided_store-1.c"
+#include <assert.h>
+
+int
+main (void)
+{
+#define RUN_LOOP(DATA_TYPE, BITS) \
+ DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE cond_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \
+ INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \
+ for (INDEX##BITS i = 0; \
+ i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ dest_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ dest2_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ src_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \
+ cond_##DATA_TYPE##_##BITS[i] = (DATA_TYPE) (i & 1); \
+ } \
+ f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \
+ stride_##DATA_TYPE##_##BITS, \
+ cond_##DATA_TYPE##_##BITS, n_##DATA_TYPE##_##BITS); \
+ for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ if (cond_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]) \
+ assert (dest_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS] \
+ == (src_##DATA_TYPE##_##BITS[i] + BITS)); \
+ else \
+ assert ( \
+ dest_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS] \
+ == dest2_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS]); \
+ }
+
+ RUN_LOOP (int8_t, 8)
+ RUN_LOOP (uint8_t, 8)
+ RUN_LOOP (int16_t, 8)
+ RUN_LOOP (uint16_t, 8)
+ RUN_LOOP (_Float16, 8)
+ RUN_LOOP (int32_t, 8)
+ RUN_LOOP (uint32_t, 8)
+ RUN_LOOP (float, 8)
+ RUN_LOOP (int64_t, 8)
+ RUN_LOOP (uint64_t, 8)
+ RUN_LOOP (double, 8)
+
+ RUN_LOOP (int8_t, 16)
+ RUN_LOOP (uint8_t, 16)
+ RUN_LOOP (int16_t, 16)
+ RUN_LOOP (uint16_t, 16)
+ RUN_LOOP (_Float16, 16)
+ RUN_LOOP (int32_t, 16)
+ RUN_LOOP (uint32_t, 16)
+ RUN_LOOP (float, 16)
+ RUN_LOOP (int64_t, 16)
+ RUN_LOOP (uint64_t, 16)
+ RUN_LOOP (double, 16)
+
+ RUN_LOOP (int8_t, 32)
+ RUN_LOOP (uint8_t, 32)
+ RUN_LOOP (int16_t, 32)
+ RUN_LOOP (uint16_t, 32)
+ RUN_LOOP (_Float16, 32)
+ RUN_LOOP (int32_t, 32)
+ RUN_LOOP (uint32_t, 32)
+ RUN_LOOP (float, 32)
+ RUN_LOOP (int64_t, 32)
+ RUN_LOOP (uint64_t, 32)
+ RUN_LOOP (double, 32)
+
+ RUN_LOOP (int8_t, 64)
+ RUN_LOOP (uint8_t, 64)
+ RUN_LOOP (int16_t, 64)
+ RUN_LOOP (uint16_t, 64)
+ RUN_LOOP (_Float16, 64)
+ RUN_LOOP (int32_t, 64)
+ RUN_LOOP (uint32_t, 64)
+ RUN_LOOP (float, 64)
+ RUN_LOOP (int64_t, 64)
+ RUN_LOOP (uint64_t, 64)
+ RUN_LOOP (double, 64)
+ return 0;
+}
@@ -40,6 +40,6 @@
TEST_ALL (TEST_LOOP)
-/* { dg-final { scan-tree-dump-times " \.MASK_LEN_GATHER_LOAD" 66 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 66 "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */
@@ -40,6 +40,6 @@
TEST_ALL (TEST_LOOP)
-/* { dg-final { scan-tree-dump-times " \.MASK_LEN_GATHER_LOAD" 33 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 33 "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */
new file mode 100644
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */
+
+#include <stdint-gcc.h>
+
+#ifndef INDEX8
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS stride, INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ dest[i] += src[i * stride]; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int8_t) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, int16_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_LOAD" 55 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "optimized" } } */
new file mode 100644
@@ -0,0 +1,84 @@
+/* { dg-do run { target { riscv_v } } } */
+
+#include "strided_load-3.c"
+#include <assert.h>
+
+int
+main (void)
+{
+#define RUN_LOOP(DATA_TYPE, BITS) \
+ DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \
+ INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \
+ for (INDEX##BITS i = 0; \
+ i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ dest_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ dest2_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ src_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \
+ } \
+ f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \
+ stride_##DATA_TYPE##_##BITS, \
+ n_##DATA_TYPE##_##BITS); \
+ for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ assert ( \
+ dest_##DATA_TYPE##_##BITS[i] \
+ == (dest2_##DATA_TYPE##_##BITS[i] \
+ + src_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS])); \
+ }
+
+ RUN_LOOP (int8_t, 8)
+ RUN_LOOP (uint8_t, 8)
+ RUN_LOOP (int16_t, 8)
+ RUN_LOOP (uint16_t, 8)
+ RUN_LOOP (_Float16, 8)
+ RUN_LOOP (int32_t, 8)
+ RUN_LOOP (uint32_t, 8)
+ RUN_LOOP (float, 8)
+ RUN_LOOP (int64_t, 8)
+ RUN_LOOP (uint64_t, 8)
+ RUN_LOOP (double, 8)
+
+ RUN_LOOP (int8_t, 16)
+ RUN_LOOP (uint8_t, 16)
+ RUN_LOOP (int16_t, 16)
+ RUN_LOOP (uint16_t, 16)
+ RUN_LOOP (_Float16, 16)
+ RUN_LOOP (int32_t, 16)
+ RUN_LOOP (uint32_t, 16)
+ RUN_LOOP (float, 16)
+ RUN_LOOP (int64_t, 16)
+ RUN_LOOP (uint64_t, 16)
+ RUN_LOOP (double, 16)
+
+ RUN_LOOP (int8_t, 32)
+ RUN_LOOP (uint8_t, 32)
+ RUN_LOOP (int16_t, 32)
+ RUN_LOOP (uint16_t, 32)
+ RUN_LOOP (_Float16, 32)
+ RUN_LOOP (int32_t, 32)
+ RUN_LOOP (uint32_t, 32)
+ RUN_LOOP (float, 32)
+ RUN_LOOP (int64_t, 32)
+ RUN_LOOP (uint64_t, 32)
+ RUN_LOOP (double, 32)
+
+ RUN_LOOP (int8_t, 64)
+ RUN_LOOP (uint8_t, 64)
+ RUN_LOOP (int16_t, 64)
+ RUN_LOOP (uint16_t, 64)
+ RUN_LOOP (_Float16, 64)
+ RUN_LOOP (int32_t, 64)
+ RUN_LOOP (uint32_t, 64)
+ RUN_LOOP (float, 64)
+ RUN_LOOP (int64_t, 64)
+ RUN_LOOP (uint64_t, 64)
+ RUN_LOOP (double, 64)
+ return 0;
+}
@@ -40,6 +40,6 @@
TEST_ALL (TEST_LOOP)
-/* { dg-final { scan-tree-dump-times " \.MASK_LEN_SCATTER_STORE" 66 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 66 "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */
@@ -40,6 +40,6 @@
TEST_ALL (TEST_LOOP)
-/* { dg-final { scan-tree-dump-times " \.MASK_LEN_SCATTER_STORE" 44 "optimized" } } */
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 44 "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */
/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */
new file mode 100644
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh -mabi=lp64d -O3 --param riscv-autovec-preference=scalable -fno-vect-cost-model -ffast-math -fdump-tree-optimized-details" } */
+
+#include <stdint-gcc.h>
+
+#ifndef INDEX8
+#define INDEX8 uint8_t
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+#define INDEX64 uint64_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE##_##BITS (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS stride, INDEX##BITS n) \
+ { \
+ for (INDEX##BITS i = 0; i < n; ++i) \
+ dest[i * stride] = src[i] + BITS; \
+ }
+
+#define TEST_TYPE(T, DATA_TYPE) \
+ T (DATA_TYPE, 8) \
+ T (DATA_TYPE, 16) \
+ T (DATA_TYPE, 32) \
+ T (DATA_TYPE, 64)
+
+#define TEST_ALL(T) \
+ TEST_TYPE (T, int8_t) \
+ TEST_TYPE (T, uint8_t) \
+ TEST_TYPE (T, int16_t) \
+ TEST_TYPE (T, uint16_t) \
+ TEST_TYPE (T, _Float16) \
+ TEST_TYPE (T, int32_t) \
+ TEST_TYPE (T, uint32_t) \
+ TEST_TYPE (T, float) \
+ TEST_TYPE (T, int64_t) \
+ TEST_TYPE (T, uint64_t) \
+ TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-tree-dump-times " \.MASK_LEN_STRIDED_STORE" 55 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.SCATTER_STORE" "optimized" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_SCATTER_STORE" "optimized" } } */
new file mode 100644
@@ -0,0 +1,82 @@
+/* { dg-do run { target { riscv_v } } } */
+
+#include "strided_store-3.c"
+#include <assert.h>
+
+int
+main (void)
+{
+#define RUN_LOOP(DATA_TYPE, BITS) \
+ DATA_TYPE dest_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE dest2_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ DATA_TYPE src_##DATA_TYPE##_##BITS[(BITS - 3) * (BITS + 13)]; \
+ INDEX##BITS stride_##DATA_TYPE##_##BITS = (BITS - 3); \
+ INDEX##BITS n_##DATA_TYPE##_##BITS = (BITS + 13); \
+ for (INDEX##BITS i = 0; \
+ i < stride_##DATA_TYPE##_##BITS * n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ dest_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ dest2_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 81 + 735) & (BITS - 1)); \
+ src_##DATA_TYPE##_##BITS[i] \
+ = (DATA_TYPE) ((i * 13 + 9107) & (BITS - 1)); \
+ } \
+ f_##DATA_TYPE##_##BITS (dest_##DATA_TYPE##_##BITS, src_##DATA_TYPE##_##BITS, \
+ stride_##DATA_TYPE##_##BITS, \
+ n_##DATA_TYPE##_##BITS); \
+ for (int i = 0; i < n_##DATA_TYPE##_##BITS; i++) \
+ { \
+ assert (dest_##DATA_TYPE##_##BITS[i * stride_##DATA_TYPE##_##BITS] \
+ == (src_##DATA_TYPE##_##BITS[i] + BITS)); \
+ }
+
+ RUN_LOOP (int8_t, 8)
+ RUN_LOOP (uint8_t, 8)
+ RUN_LOOP (int16_t, 8)
+ RUN_LOOP (uint16_t, 8)
+ RUN_LOOP (_Float16, 8)
+ RUN_LOOP (int32_t, 8)
+ RUN_LOOP (uint32_t, 8)
+ RUN_LOOP (float, 8)
+ RUN_LOOP (int64_t, 8)
+ RUN_LOOP (uint64_t, 8)
+ RUN_LOOP (double, 8)
+
+ RUN_LOOP (int8_t, 16)
+ RUN_LOOP (uint8_t, 16)
+ RUN_LOOP (int16_t, 16)
+ RUN_LOOP (uint16_t, 16)
+ RUN_LOOP (_Float16, 16)
+ RUN_LOOP (int32_t, 16)
+ RUN_LOOP (uint32_t, 16)
+ RUN_LOOP (float, 16)
+ RUN_LOOP (int64_t, 16)
+ RUN_LOOP (uint64_t, 16)
+ RUN_LOOP (double, 16)
+
+ RUN_LOOP (int8_t, 32)
+ RUN_LOOP (uint8_t, 32)
+ RUN_LOOP (int16_t, 32)
+ RUN_LOOP (uint16_t, 32)
+ RUN_LOOP (_Float16, 32)
+ RUN_LOOP (int32_t, 32)
+ RUN_LOOP (uint32_t, 32)
+ RUN_LOOP (float, 32)
+ RUN_LOOP (int64_t, 32)
+ RUN_LOOP (uint64_t, 32)
+ RUN_LOOP (double, 32)
+
+ RUN_LOOP (int8_t, 64)
+ RUN_LOOP (uint8_t, 64)
+ RUN_LOOP (int16_t, 64)
+ RUN_LOOP (uint16_t, 64)
+ RUN_LOOP (_Float16, 64)
+ RUN_LOOP (int32_t, 64)
+ RUN_LOOP (uint32_t, 64)
+ RUN_LOOP (float, 64)
+ RUN_LOOP (int64_t, 64)
+ RUN_LOOP (uint64_t, 64)
+ RUN_LOOP (double, 64)
+ return 0;
+}