@@ -606,3 +606,11 @@ (define_predicate "subreg_register_operand"
(and (match_code "reg")
(match_test "reload_completed || reload_in_progress")
(match_test "register_operand (op, GET_MODE (op))"))))
+
+; Bias value for LEN_LOAD and LEN_STORE. The bias will be added to the
+; length (in bytes for s390) to be loaded. vll/vstl expect the lowest byte
+; to load while LEN_LOAD/LEN_STORE use the actual length in bytes. This implies
+; that we cannot load a length of 0.
+(define_predicate "vll_bias_operand"
+ (and (match_code "const_int")
+ (match_test "op == CONSTM1_RTX (QImode)")))
@@ -15782,6 +15782,14 @@ s390_option_override_internal (struct gcc_options *opts,
/* Use the alternative scheduling-pressure algorithm by default. */
SET_OPTION_IF_UNSET (opts, opts_set, param_sched_pressure_algorithm, 2);
+
+ /* Allow simple vector masking using vll/vstl for epilogues. */
+ if (TARGET_Z13)
+ SET_OPTION_IF_UNSET (opts, opts_set, param_vect_partial_vector_usage, 1);
+ else
+ SET_OPTION_IF_UNSET (opts, opts_set, param_vect_partial_vector_usage, 0);
+
+ /* Do not vectorize loops with a low trip count for now. */
SET_OPTION_IF_UNSET (opts, opts_set, param_min_vect_loop_bound, 2);
/* Set the default alignment. */
@@ -2947,6 +2947,45 @@ (define_insn_and_split "*bswap<mode>"
""
[(set_attr "op_type" "*,VRX,VRX")])
+;
+; Implement len_load/len_store optabs with vll/vstl.
+(define_expand "len_load_v16qi"
+ [(match_operand:V16QI 0 "register_operand")
+ (match_operand:V16QI 1 "memory_operand")
+ (match_operand:QI 2 "register_operand")
+ (match_operand:QI 3 "vll_bias_operand")
+ ]
+ "TARGET_VX && TARGET_64BIT"
+{
+ rtx src1 = XEXP (operands[1], 0);
+ rtx src = gen_reg_rtx (Pmode);
+ emit_move_insn (src, src1);
+ rtx mem = gen_rtx_MEM (BLKmode, src);
+
+ rtx len = gen_lowpart (SImode, operands[2]);
+ emit_insn (gen_vllv16qi (operands[0], len, mem));
+ DONE;
+})
+
+(define_expand "len_store_v16qi"
+ [(match_operand:V16QI 0 "memory_operand")
+ (match_operand:V16QI 1 "register_operand")
+ (match_operand:QI 2 "register_operand")
+ (match_operand:QI 3 "vll_bias_operand")
+ ]
+ "TARGET_VX && TARGET_64BIT"
+{
+ rtx dst1 = XEXP (operands[0], 0);
+ rtx dst = gen_reg_rtx (Pmode);
+ emit_move_insn (dst, dst1);
+ rtx mem = gen_rtx_MEM (BLKmode, dst);
+
+ rtx len = gen_lowpart (SImode, operands[2]);
+ emit_insn (gen_vstlv16qi (operands[1], len, mem));
+ DONE;
+});;
+
+
; reduc_smin
; reduc_smax
; reduc_umin
@@ -236,6 +236,9 @@ dg-runtest [lsort [prune [glob -nocomplain $srcdir/$subdir/*.{c,S}] \
dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*vector*/*.{c,S}]] \
"" $DEFAULT_CFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*vector*/partial/*.{c,S}]] \
+ "" $DEFAULT_CFLAGS
+
gfortran-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*vector*/*.F90]] \
"" $DEFAULT_FFLAGS
new file mode 100644
@@ -0,0 +1,18 @@
+#include "s390-vec-length.h"
+
+/* Test the case loop iteration is known. */
+
+#define N 127
+
+#define test(TYPE) \
+ extern TYPE a_##TYPE[N]; \
+ extern TYPE b_##TYPE[N]; \
+ extern TYPE c_##TYPE[N]; \
+ void __attribute__ ((noinline, noclone)) test##TYPE () \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < N; i++) \
+ c_##TYPE[i] = a_##TYPE[i] + b_##TYPE[i]; \
+ }
+
+TEST_ALL (test)
new file mode 100644
@@ -0,0 +1,18 @@
+#include "s390-vec-length.h"
+
+/* Test the case loop iteration is unknown. */
+
+#define N 255
+
+#define test(TYPE) \
+ extern TYPE a_##TYPE[N]; \
+ extern TYPE b_##TYPE[N]; \
+ extern TYPE c_##TYPE[N]; \
+ void __attribute__ ((noinline, noclone)) test##TYPE (unsigned int n) \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < n; i++) \
+ c_##TYPE[i] = a_##TYPE[i] + b_##TYPE[i]; \
+ }
+
+TEST_ALL (test)
new file mode 100644
@@ -0,0 +1,31 @@
+#include "s390-vec-length.h"
+
+/* Test the case loop iteration less than VF. */
+
+/* For char. */
+#define N_uint8_t 15
+#define N_int8_t 15
+/* For short. */
+#define N_uint16_t 6
+#define N_int16_t 6
+/* For int/float. */
+#define N_uint32_t 3
+#define N_int32_t 3
+#define N_float 3
+/* For long/double. */
+#define N_uint64_t 1
+#define N_int64_t 1
+#define N_double 1
+
+#define test(TYPE) \
+ extern TYPE a_##TYPE[N_##TYPE]; \
+ extern TYPE b_##TYPE[N_##TYPE]; \
+ extern TYPE c_##TYPE[N_##TYPE]; \
+ void __attribute__ ((noinline, noclone)) test##TYPE () \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < N_##TYPE; i++) \
+ c_##TYPE[i] = a_##TYPE[i] + b_##TYPE[i]; \
+ }
+
+TEST_ALL (test)
new file mode 100644
@@ -0,0 +1,17 @@
+#include "s390-vec-length.h"
+
+#define N 64
+#define START 1
+#define END 59
+
+#define test(TYPE) \
+ TYPE x_##TYPE[N] __attribute__((aligned(16))); \
+ void __attribute__((noinline, noclone)) test_npeel_##TYPE() { \
+ TYPE v = 0; \
+ for (unsigned int i = START; i < END; i++) { \
+ x_##TYPE[i] = v; \
+ v += 1; \
+ } \
+ }
+
+TEST_ALL (test)
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1 --param=min-vect-loop-bound=0" } */
+
+/* Test that we only vectorize the epilogue with vector load/store with length,
+ the main body still uses normal vector load/store. */
+
+#include "s390-vec-length-1.h"
+
+/* { dg-final { scan-assembler-times {\mvll\M} 14 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 7 } } */
+
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1 --param=min-vect-loop-bound=0" } */
+
+/* Test that we only vectorize the epilogue with vector load/store with length,
+ the main body still uses normal vector load/store. */
+
+#include "s390-vec-length-2.h"
+
+/* { dg-final { scan-assembler-times {\mvll\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 10 } } */
+
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1 --param=min-vect-loop-bound=0" } */
+
+/* Test that we only vectorize the epilogue with vector load/store with length,
+ the main body still uses normal vector load/store. */
+
+#include "s390-vec-length-3.h"
+
+/* { dg-final { scan-assembler-not {\mvl\M} } } */
+/* { dg-final { scan-assembler-not {\mvst\M} } } */
+/* 64bit types get completely unrolled, so only check the others. */
+/* { dg-final { scan-assembler-times {\mvll\M} 14 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 7 } } */
+
new file mode 100644
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops -ffast-math" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1 --param=min-vect-loop-bound=0" } */
+
+/* Test that we only vectorize the epilogue with vector load/store with length,
+ the main body still uses normal vector load/store. */
+
+#include "s390-vec-length-7.h"
+
+/* { dg-final { scan-assembler-times {\mvstl\M} 4 } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1" } */
+
+#include "s390-vec-length-run-1.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1" } */
+
+#include "s390-vec-length-run-2.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1" } */
+
+#include "s390-vec-length-run-3.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=1" } */
+
+#include "s390-vec-length-run-7.h"
+
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+/* Test that the loop body uses vector load/store with length,
+ there should not be any epilogues. */
+
+#include "s390-vec-length-1.h"
+
+/* { dg-final { scan-assembler-times {\mvll\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 10 } } */
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+/* Test that the loop body uses vector load/store with length,
+ there should not be any epilogues. */
+
+#include "s390-vec-length-2.h"
+
+/* { dg-final { scan-assembler-times {\mvll\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 10 } } */
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2 --param=min-vect-loop-bound=0" } */
+
+/* Test that the loop body uses vector load/store with length,
+ there should not be any epilogues. */
+
+#include "s390-vec-length-3.h"
+
+/* 64bit types get completely unrolled, so only check the others. */
+/* { dg-final { scan-assembler-times {\mvll\M} 14 } } */
+/* { dg-final { scan-assembler-times {\mvstl\M} 7 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+/* Test that the loop body uses vector load/store with length,
+ there should not be any epilogues. */
+
+#include "s390-vec-length-7.h"
+
+/* Each type should have one vstl but we do not currently vectorize the
+ float and double variants and the [u]int64_t ones which do not require
+ partial vectors. */
+/* { dg-final { scan-assembler-times {\mvstl\M} 6 } } */
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+#include "s390-vec-length-run-1.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+#include "s390-vec-length-run-2.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+#include "s390-vec-length-run-3.h"
+
new file mode 100644
@@ -0,0 +1,7 @@
+/* { dg-do run { target { lp64 && s390_vx } } } */
+/* { dg-options "-march=native -O2 -ftree-vectorize -fno-vect-cost-model -ffast-math" } */
+
+/* { dg-additional-options "--param=vect-partial-vector-usage=2" } */
+
+#include "s390-vec-length-run-7.h"
+
new file mode 100644
@@ -0,0 +1,34 @@
+#include "s390-vec-length-1.h"
+
+#define decl(TYPE) \
+ TYPE a_##TYPE[N]; \
+ TYPE b_##TYPE[N]; \
+ TYPE c_##TYPE[N];
+
+#define run(TYPE) \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < N; i++) \
+ { \
+ a_##TYPE[i] = i * 2 + 1; \
+ b_##TYPE[i] = i % 2 - 2; \
+ } \
+ test##TYPE (); \
+ for (i = 0; i < N; i++) \
+ { \
+ TYPE a1 = i * 2 + 1; \
+ TYPE b1 = i % 2 - 2; \
+ TYPE exp_c = a1 + b1; \
+ if (c_##TYPE[i] != exp_c) \
+ __builtin_abort (); \
+ } \
+ }
+
+TEST_ALL (decl)
+
+int
+main (void)
+{
+ TEST_ALL (run)
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,36 @@
+#include "s390-vec-length-2.h"
+
+#define decl(TYPE) \
+ TYPE a_##TYPE[N]; \
+ TYPE b_##TYPE[N]; \
+ TYPE c_##TYPE[N];
+
+#define N1 195
+
+#define run(TYPE) \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < N; i++) \
+ { \
+ a_##TYPE[i] = i * 2 + 1; \
+ b_##TYPE[i] = i % 2 - 2; \
+ } \
+ test##TYPE (N1); \
+ for (i = 0; i < N1; i++) \
+ { \
+ TYPE a1 = i * 2 + 1; \
+ TYPE b1 = i % 2 - 2; \
+ TYPE exp_c = a1 + b1; \
+ if (c_##TYPE[i] != exp_c) \
+ __builtin_abort (); \
+ } \
+ }
+
+TEST_ALL (decl)
+
+int
+main (void)
+{
+ TEST_ALL (run)
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,34 @@
+#include "s390-vec-length-3.h"
+
+#define decl(TYPE) \
+ TYPE a_##TYPE[N_##TYPE]; \
+ TYPE b_##TYPE[N_##TYPE]; \
+ TYPE c_##TYPE[N_##TYPE];
+
+#define run(TYPE) \
+ { \
+ unsigned int i = 0; \
+ for (i = 0; i < N_##TYPE; i++) \
+ { \
+ a_##TYPE[i] = i * 2 + 1; \
+ b_##TYPE[i] = i % 2 - 2; \
+ } \
+ test##TYPE (); \
+ for (i = 0; i < N_##TYPE; i++) \
+ { \
+ TYPE a1 = i * 2 + 1; \
+ TYPE b1 = i % 2 - 2; \
+ TYPE exp_c = a1 + b1; \
+ if (c_##TYPE[i] != exp_c) \
+ __builtin_abort (); \
+ } \
+ }
+
+TEST_ALL (decl)
+
+int
+main (void)
+{
+ TEST_ALL (run)
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,16 @@
+#include "s390-vec-length-7.h"
+
+#define run(TYPE) \
+ { \
+ unsigned int i = 0; \
+ test_npeel_##TYPE(); \
+ for (int i = 0; i < N; ++i) { \
+ if (x_##TYPE[i] != (i < START || i >= END ? 0 : (i - START))) \
+ __builtin_abort(); \
+ } \
+ }
+
+int main() {
+ TEST_ALL(run)
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { lp64 && s390_vx } } } */
+/* { dg-options "-mzarch -march=native -O2 -ftree-vectorize -fno-vect-cost-model -fno-unroll-loops" } */
+
+/* { dg-additional-options "--param=min-vect-loop-bound=0 --param=vect-partial-vector-usage=2" } */
+
+#define SZ 333
+
+void foo (char *restrict a, char *restrict b, char *restrict c, int n)
+{
+#pragma gcc unroll 0
+ for (int i = 0; i < 17; i++)
+ c[i] = a[i] + b[i];
+/* { dg-final { scan-assembler-times "lhi\t%r\[0-9\]*,0\n" 1 } } */
+}
+
new file mode 100644
@@ -0,0 +1,14 @@
+#include <stdint.h>
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
+ T (int64_t) \
+ T (uint64_t) \
+ T (float) \
+ T (double)
+
@@ -9,4 +9,4 @@ void foo(int *restrict a, int *restrict b, unsigned int n)
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "Vectorizing an unaligned access" 2 "vect" } } */
+/* { dg-final { scan-tree-dump "Vectorizing an unaligned access" "vect" } } */
@@ -7740,7 +7740,8 @@ proc check_effective_target_vect_fully_masked { } {
# @code{len_store} optabs.
proc check_effective_target_vect_len_load_store { } {
- return [check_effective_target_has_arch_pwr9]
+ return [expr { [check_effective_target_has_arch_pwr9]
+ || [check_effective_target_s390_vx] }]
}
# Return the value of parameter vect-partial-vector-usage specified for