@@ -83,6 +83,24 @@
}
)
+;; -------------------------------------------------------------------------
+;; ---- [INT,FP] Permutation
+;; -------------------------------------------------------------------------
+;; This pattern permutes a vector according to a selector vector of element indices.
+;; -------------------------------------------------------------------------
+
+(define_expand "vec_perm<mode>"
+ [(match_operand:V 0 "register_operand")
+ (match_operand:V 1 "register_operand")
+ (match_operand:V 2 "register_operand")
+ (match_operand:<VINDEX> 3 "vector_perm_operand")]
+ "TARGET_VECTOR && GET_MODE_NUNITS (<MODE>mode).is_constant ()"
+ {
+ riscv_vector::expand_vec_perm (operands);
+ DONE;
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Initialize from individual elements
;; -------------------------------------------------------------------------
@@ -330,6 +330,10 @@
(and (match_code "const_vector")
(match_test "riscv_vector::const_vec_all_same_in_range_p (op, 0, 31)"))))
+(define_predicate "vector_perm_operand"
+ (ior (match_operand 0 "register_operand")
+ (match_code "const_vector")))
+
(define_predicate "ltge_operator"
(match_code "lt,ltu,ge,geu"))
@@ -137,6 +137,7 @@ enum insn_type
RVV_MISC_OP = 1,
RVV_UNOP = 2,
RVV_BINOP = 3,
+ RVV_BINOP_MU = RVV_BINOP + 2, /* +2 means mask and maskoff operand. */
RVV_MERGE_OP = 4,
RVV_CMP_OP = 4,
RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */
@@ -240,6 +241,7 @@ opt_machine_mode get_mask_mode (machine_mode);
void expand_vec_series (rtx, rtx, rtx);
void expand_vec_init (rtx, rtx);
void expand_vcond (rtx *);
+void expand_vec_perm (rtx *);
/* Rounding mode bitfield for fixed point VXRM. */
enum vxrm_field_enum
{
@@ -259,6 +259,47 @@ const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
&& IN_RANGE (INTVAL (elt), minval, maxval));
}
+/* Return true if VEC is a constant in which every element is in the range
+ [MINVAL, MAXVAL]. The elements do not need to have the same value.
+
+ This function also exists in aarch64; we may unify it in the middle end in
+ the future. */
+
+static bool
+const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minval, HOST_WIDE_INT maxval)
+{
+ if (!CONST_VECTOR_P (vec)
+ || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
+ return false;
+
+ int nunits;
+ if (!CONST_VECTOR_STEPPED_P (vec))
+ nunits = const_vector_encoded_nelts (vec);
+ else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
+ return false;
+
+ for (int i = 0; i < nunits; i++)
+ {
+ rtx vec_elem = CONST_VECTOR_ELT (vec, i);
+ if (!CONST_INT_P (vec_elem)
+ || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
+ return false;
+ }
+ return true;
+}
+
+/* Return a const_int vector of VAL.
+
+ This function also exists in aarch64; we may unify it in the middle end in
+ the future. */
+
+static rtx
+gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
+{
+ rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
+ return gen_const_vec_duplicate (mode, c);
+}
+
/* Emit a vlmax vsetvl instruction. This should only be used when
optimization is disabled or after vsetvl insertion pass. */
void
@@ -1927,4 +1968,116 @@ expand_vcond (rtx *ops)
gen_vcond_mask (data_mode, data_mode, ops[0], ops[1], ops[2], mask));
}
+/* Emit a VLMAX vrgather instruction. Use vrgather.vx/vi when SEL is a
+ constant duplicate vector; otherwise, use vrgather.vv. */
+static void
+emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
+{
+ rtx elt;
+ insn_code icode;
+ machine_mode data_mode = GET_MODE (target);
+ if (const_vec_duplicate_p (sel, &elt))
+ {
+ icode = code_for_pred_gather_scalar (data_mode);
+ sel = elt;
+ }
+ else
+ icode = code_for_pred_gather (data_mode);
+ rtx ops[] = {target, op, sel};
+ emit_vlmax_insn (icode, RVV_BINOP, ops);
+}
+
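+/* Like emit_vlmax_gather_insn, but emit a masked gather: only the elements of
+ TARGET selected by MASK are written (mask-undisturbed policy); the other
+ elements of TARGET are left unchanged. */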
+static void
+emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
+{
+ rtx elt;
+ insn_code icode;
+ machine_mode data_mode = GET_MODE (target);
+ if (const_vec_duplicate_p (sel, &elt))
+ {
+ icode = code_for_pred_gather_scalar (data_mode);
+ sel = elt;
+ }
+ else
+ icode = code_for_pred_gather (data_mode);
+ rtx ops[] = {target, mask, target, op, sel};
+ emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
+}
+
+/* Implement vec_perm<mode>. */
+
+void
+expand_vec_perm (rtx *operands)
+{
+ rtx target = operands[0];
+ rtx op0 = operands[1];
+ rtx op1 = operands[2];
+ rtx sel = operands[3];
+ machine_mode data_mode = GET_MODE (target);
+ machine_mode sel_mode = GET_MODE (sel);
+
+ /* Enforced by the pattern condition. */
+ int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
+
+ /* Check whether SEL only references the first value vector, i.e. every
+ select index is in the range [0, nunits - 1]. If so, a single vrgather
+ instruction is enough. */
+ if (const_vec_all_in_range_p (sel, 0, nunits - 1))
+ {
+ emit_vlmax_gather_insn (target, op0, sel);
+ return;
+ }
+
+ /* Check whether the two value vectors are the same, or SEL is a constant
+ duplicate vector (every lane selects the same element). */
+ if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel))
+ {
+ /* Note: vec_perm indices are supposed to wrap when they go beyond the
+ size of the two value vectors, i.e. the upper bits of the indices
+ are effectively ignored. RVV vrgather instead produces 0 for any
+ out-of-range indices, so we mask all the vec_perm indices with
+ nunits - 1 to keep them in the range [0, nunits - 1]. For example,
+ with nunits == 4 an index of 6 becomes 2. */
+ rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1);
+ rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
+ OPTAB_DIRECT);
+ emit_vlmax_gather_insn (target, op1, sel_mod);
+ return;
+ }
+
+ /* Note: vec_perm indices are supposed to wrap when they go beyond the
+ size of the two value vectors, i.e. the upper bits of the indices
+ are effectively ignored. RVV vrgather instead produces 0 for any
+ out-of-range indices, so we mask all the vec_perm indices with
+ 2 * nunits - 1 to keep them in the range [0, 2 * nunits - 1]. */
+ rtx max_sel = gen_const_vector_dup (sel_mode, 2 * nunits - 1);
+ rtx sel_mod
+ = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0, OPTAB_DIRECT);
+
+ /* The following sequence handles the general case of
+ __builtin_shufflevector (vec1, vec2, index...), where each index can be
+ any value in the range [0, 2 * nunits - 1]. */
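+ /* For example, with nunits == 4, op0 = {a0, a1, a2, a3},
+ op1 = {b0, b1, b2, b3} and sel = {0, 5, 2, 7}: step 1 builds
+ mask = {0, 1, 0, 1}; step 2 gathers op0 into target, giving
+ {a0, x, a2, x} where the masked lanes are don't-care; step 3 computes
+ sel - 4, giving {., 1, ., 3} in the masked lanes; step 4 gathers op1
+ with those indices into the masked lanes, giving {a0, b1, a2, b3}. */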
+ machine_mode mask_mode;
+ mask_mode = get_mask_mode (data_mode).require ();
+ rtx mask = gen_reg_rtx (mask_mode);
+ max_sel = gen_const_vector_dup (sel_mode, nunits);
+
+ /* Step 1: generate a mask that selects every element whose index is
+ >= nunits. */
+ expand_vec_cmp (mask, GEU, sel_mod, max_sel);
+
+ /* Step 2: gather the op0 elements indexed by sel into target; we don't
+ need to care about the result of any element whose index is >= nunits. */
+ emit_vlmax_gather_insn (target, op0, sel_mod);
+
+ /* Step 3: shift the indices that reference op1 from [nunits, 2 * nunits - 1]
+ down to [0, nunits - 1] by subtracting nunits. */
+ rtx tmp = gen_reg_rtx (sel_mode);
+ rtx ops[] = {tmp, sel_mod, max_sel};
+ emit_vlmax_insn (code_for_pred (MINUS, sel_mode), RVV_BINOP, ops);
+
+ /* Step 4: gather the op1 elements indexed by the shifted indices into the
+ elements of target selected by the mask. */
+ emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
+}
+
} // namespace riscv_vector
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 1, 1
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
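+
+/* Every index is 1, so each shuffle selects element 1 of VALUES1 and is
+ expected to expand to a single vrgather.vi with immediate 1. */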
+
+#define PERMUTE(TYPE, NUNITS) \
+ __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
+ TYPE *out) \
+ { \
+ TYPE v \
+ = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+ *(TYPE *) out = v; \
+ }
+
+#define TEST_ALL(T) \
+ T (vnx2qi, 2) \
+ T (vnx4qi, 4) \
+ T (vnx8qi, 8) \
+ T (vnx16qi, 16) \
+ T (vnx32qi, 32) \
+ T (vnx64qi, 64) \
+ T (vnx128qi, 128) \
+ T (vnx2hi, 2) \
+ T (vnx4hi, 4) \
+ T (vnx8hi, 8) \
+ T (vnx16hi, 16) \
+ T (vnx32hi, 32) \
+ T (vnx64hi, 64) \
+ T (vnx2si, 2) \
+ T (vnx4si, 4) \
+ T (vnx8si, 8) \
+ T (vnx16si, 16) \
+ T (vnx32si, 32) \
+ T (vnx2di, 2) \
+ T (vnx4di, 4) \
+ T (vnx8di, 8) \
+ T (vnx16di, 16) \
+ T (vnx2sf, 2) \
+ T (vnx4sf, 4) \
+ T (vnx8sf, 8) \
+ T (vnx16sf, 16) \
+ T (vnx32sf, 32) \
+ T (vnx2df, 2) \
+ T (vnx4df, 4) \
+ T (vnx8df, 8) \
+ T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*1} 31 } } */
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 31, 31
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
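+
+/* Every index is 31, which is in range for these element counts and still
+ fits the 5-bit vrgather.vi immediate, so a single vrgather.vi is expected. */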
+
+#define PERMUTE(TYPE, NUNITS) \
+ void permute_##TYPE (TYPE values1, TYPE values2, TYPE *out) \
+ { \
+ TYPE v \
+ = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+ *(TYPE *) out = v; \
+ }
+
+#define TEST_ALL(T) \
+ T (vnx32qi, 32) \
+ T (vnx64qi, 64) \
+ T (vnx128qi, 128) \
+ T (vnx32hi, 32) \
+ T (vnx64hi, 64) \
+ T (vnx32si, 32) \
+ T (vnx32sf, 32)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*31} 7 } } */
new file mode 100644
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 55, 55
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
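+
+/* Every index is 55, which is in range for these element counts but too large
+ for the 5-bit vrgather.vi immediate, so vrgather.vx with the index in a
+ scalar register is expected. */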
+
+#define PERMUTE(TYPE, NUNITS) \
+ void permute_##TYPE (TYPE values1, TYPE values2, TYPE *out) \
+ { \
+ TYPE v \
+ = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+ *(TYPE *) out = v; \
+ }
+
+#define TEST_ALL(T) \
+ T (vnx64qi, 64) \
+ T (vnx128qi, 128) \
+ T (vnx64hi, 64)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 3 } } */
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) (Y) - 1 - (X), (Y) - 2 - (X)
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
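+
+/* The mask reverses VALUES1 (indices NUNITS - 1 down to 0), so a full index
+ vector is needed and each function is expected to use vrgather.vv. */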
+
+#define PERMUTE(TYPE, NUNITS) \
+ __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
+ TYPE *out) \
+ { \
+ TYPE v \
+ = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+ *(TYPE *) out = v; \
+ }
+
+#define TEST_ALL(T) \
+ T (vnx2qi, 2) \
+ T (vnx4qi, 4) \
+ T (vnx8qi, 8) \
+ T (vnx16qi, 16) \
+ T (vnx32qi, 32) \
+ T (vnx64qi, 64) \
+ T (vnx128qi, 128) \
+ T (vnx2hi, 2) \
+ T (vnx4hi, 4) \
+ T (vnx8hi, 8) \
+ T (vnx16hi, 16) \
+ T (vnx32hi, 32) \
+ T (vnx64hi, 64) \
+ T (vnx2si, 2) \
+ T (vnx4si, 4) \
+ T (vnx8si, 8) \
+ T (vnx16si, 16) \
+ T (vnx32si, 32) \
+ T (vnx2di, 2) \
+ T (vnx4di, 4) \
+ T (vnx8di, 8) \
+ T (vnx16di, 16) \
+ T (vnx2sf, 2) \
+ T (vnx4sf, 4) \
+ T (vnx8sf, 8) \
+ T (vnx16sf, 16) \
+ T (vnx32sf, 32) \
+ T (vnx2df, 2) \
+ T (vnx4df, 4) \
+ T (vnx8df, 8) \
+ T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
new file mode 100644
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define PERMUTE(TYPE, TYPE2, NUNITS) \
+ __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
+ TYPE2 mask, TYPE *out) \
+ { \
+ TYPE v = __builtin_shuffle (values1, values1, mask); \
+ *(TYPE *) out = v; \
+ }
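+
+/* The selector is a run-time vector and both value operands are VALUES1, so a
+ single unmasked vrgather.vv per function is expected. */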
+
+#define TEST_ALL(T) \
+ T (vnx2qi, vnx2qi, 2) \
+ T (vnx4qi, vnx4qi, 4) \
+ T (vnx8qi, vnx8qi, 8) \
+ T (vnx16qi, vnx16qi, 16) \
+ T (vnx32qi, vnx32qi, 32) \
+ T (vnx64qi, vnx64qi, 64) \
+ T (vnx128qi, vnx128qi, 128) \
+ T (vnx2hi, vnx2hi, 2) \
+ T (vnx4hi, vnx4hi, 4) \
+ T (vnx8hi, vnx8hi, 8) \
+ T (vnx16hi, vnx16hi, 16) \
+ T (vnx32hi, vnx32hi, 32) \
+ T (vnx64hi, vnx64hi, 64) \
+ T (vnx2si, vnx2si, 2) \
+ T (vnx4si, vnx4si, 4) \
+ T (vnx8si, vnx8si, 8) \
+ T (vnx16si, vnx16si, 16) \
+ T (vnx32si, vnx32si, 32) \
+ T (vnx2di, vnx2di, 2) \
+ T (vnx4di, vnx4di, 4) \
+ T (vnx8di, vnx8di, 8) \
+ T (vnx16di, vnx16di, 16) \
+ T (vnx2sf, vnx2si, 2) \
+ T (vnx4sf, vnx4si, 4) \
+ T (vnx8sf, vnx8si, 8) \
+ T (vnx16sf, vnx16si, 16) \
+ T (vnx32sf, vnx32si, 32) \
+ T (vnx2df, vnx2di, 2) \
+ T (vnx4df, vnx4di, 4) \
+ T (vnx8df, vnx8di, 8) \
+ T (vnx16df, vnx16di, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) Y + 1, Y + 1
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
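+
+/* Every index is NUNITS + 1, i.e. element 1 of VALUES2; after the constant
+ duplicate is reduced modulo NUNITS, a single vrgather.vi with immediate 1
+ reading the second operand is expected. */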
+
+#define PERMUTE(TYPE, NUNITS) \
+ __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
+ TYPE *out) \
+ { \
+ TYPE v \
+ = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+ *(TYPE *) out = v; \
+ }
+
+#define TEST_ALL(T) \
+ T (vnx2qi, 2) \
+ T (vnx4qi, 4) \
+ T (vnx8qi, 8) \
+ T (vnx16qi, 16) \
+ T (vnx32qi, 32) \
+ T (vnx64qi, 64) \
+ T (vnx128qi, 128) \
+ T (vnx2hi, 2) \
+ T (vnx4hi, 4) \
+ T (vnx8hi, 8) \
+ T (vnx16hi, 16) \
+ T (vnx32hi, 32) \
+ T (vnx64hi, 64) \
+ T (vnx2si, 2) \
+ T (vnx4si, 4) \
+ T (vnx8si, 8) \
+ T (vnx16si, 16) \
+ T (vnx32si, 32) \
+ T (vnx2di, 2) \
+ T (vnx4di, 4) \
+ T (vnx8di, 8) \
+ T (vnx16di, 16) \
+ T (vnx2sf, 2) \
+ T (vnx4sf, 4) \
+ T (vnx8sf, 8) \
+ T (vnx16sf, 16) \
+ T (vnx32sf, 32) \
+ T (vnx2df, 2) \
+ T (vnx4df, 4) \
+ T (vnx8df, 8) \
+ T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*1} 31 } } */
new file mode 100644
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define PERMUTE(TYPE, TYPE2, NUNITS) \
+ __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2, \
+ TYPE2 mask, TYPE *out) \
+ { \
+ TYPE v = __builtin_shuffle (values1, values2, mask); \
+ *(TYPE *) out = v; \
+ }
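+
+/* The selector is a run-time vector and the two value vectors differ, so the
+ general two-gather sequence is used; the second gather is masked, hence the
+ v0.t operand in the scan below. */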
+
+#define TEST_ALL(T) \
+ T (vnx2qi, vnx2qi, 2) \
+ T (vnx4qi, vnx4qi, 4) \
+ T (vnx8qi, vnx8qi, 8) \
+ T (vnx16qi, vnx16qi, 16) \
+ T (vnx32qi, vnx32qi, 32) \
+ T (vnx64qi, vnx64qi, 64) \
+ T (vnx128qi, vnx128qi, 128) \
+ T (vnx2hi, vnx2hi, 2) \
+ T (vnx4hi, vnx4hi, 4) \
+ T (vnx8hi, vnx8hi, 8) \
+ T (vnx16hi, vnx16hi, 16) \
+ T (vnx32hi, vnx32hi, 32) \
+ T (vnx64hi, vnx64hi, 64) \
+ T (vnx2si, vnx2si, 2) \
+ T (vnx4si, vnx4si, 4) \
+ T (vnx8si, vnx8si, 8) \
+ T (vnx16si, vnx16si, 16) \
+ T (vnx32si, vnx32si, 32) \
+ T (vnx2di, vnx2di, 2) \
+ T (vnx4di, vnx4di, 4) \
+ T (vnx8di, vnx8di, 8) \
+ T (vnx16di, vnx16di, 16) \
+ T (vnx2sf, vnx2si, 2) \
+ T (vnx4sf, vnx4si, 4) \
+ T (vnx8sf, vnx8si, 8) \
+ T (vnx16sf, vnx16si, 16) \
+ T (vnx32sf, vnx32si, 32) \
+ T (vnx2df, vnx2di, 2) \
+ T (vnx4df, vnx4di, 4) \
+ T (vnx8df, vnx8di, 8) \
+ T (vnx16df, vnx16di, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+,\s*v0.t} 31 } } */
new file mode 100644
@@ -0,0 +1,70 @@
+#include <stdint.h>
+
+typedef int8_t vnx2qi __attribute__ ((vector_size (2)));
+typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
+typedef int8_t vnx8qi __attribute__ ((vector_size (8)));
+typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
+typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
+typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
+typedef int8_t vnx128qi __attribute__ ((vector_size (128)));
+
+typedef int16_t vnx2hi __attribute__ ((vector_size (4)));
+typedef int16_t vnx4hi __attribute__ ((vector_size (8)));
+typedef int16_t vnx8hi __attribute__ ((vector_size (16)));
+typedef int16_t vnx16hi __attribute__ ((vector_size (32)));
+typedef int16_t vnx32hi __attribute__ ((vector_size (64)));
+typedef int16_t vnx64hi __attribute__ ((vector_size (128)));
+
+typedef int32_t vnx2si __attribute__ ((vector_size (8)));
+typedef int32_t vnx4si __attribute__ ((vector_size (16)));
+typedef int32_t vnx8si __attribute__ ((vector_size (32)));
+typedef int32_t vnx16si __attribute__ ((vector_size (64)));
+typedef int32_t vnx32si __attribute__ ((vector_size (128)));
+
+typedef int64_t vnx2di __attribute__ ((vector_size (16)));
+typedef int64_t vnx4di __attribute__ ((vector_size (32)));
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+typedef float vnx2sf __attribute__ ((vector_size (8)));
+typedef float vnx4sf __attribute__ ((vector_size (16)));
+typedef float vnx8sf __attribute__ ((vector_size (32)));
+typedef float vnx16sf __attribute__ ((vector_size (64)));
+typedef float vnx32sf __attribute__ ((vector_size (128)));
+
+typedef double vnx2df __attribute__ ((vector_size (16)));
+typedef double vnx4df __attribute__ ((vector_size (32)));
+typedef double vnx8df __attribute__ ((vector_size (64)));
+typedef double vnx16df __attribute__ ((vector_size (128)));
+
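+/* Helpers shared by the permutation run tests: INIT_PERMUTE fills the two
+ input vectors with i * NUM1 + NUM2 and i * NUM1 - NUM2 and zero-initializes
+ the output; CHECK_PERMUTE_SINGLE expects every output element to equal VALUE,
+ CHECK_PERMUTE_REVERSE expects the first input reversed, and
+ CHECK_PERMUTE_DOUBLE expects the even-indexed elements of the two inputs
+ concatenated. */
+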
+#define INIT_PERMUTE(NUNITS, NUM1, NUM2, TYPE) \
+ TYPE v_##TYPE##_in1; \
+ TYPE v_##TYPE##_in2; \
+ TYPE v_##TYPE##_out = {0}; \
+ for (int i = 0; i < NUNITS; i++) \
+ { \
+ v_##TYPE##_in1[i] = i * NUM1 + NUM2; \
+ v_##TYPE##_in2[i] = i * NUM1 - NUM2; \
+ }
+
+#define CHECK_PERMUTE_SINGLE(NUNITS, VALUE, TYPE) \
+ for (int i = 0; i < NUNITS; i++) \
+ if (v_##TYPE##_out[i] != VALUE) \
+ __builtin_abort ();
+
+#define CHECK_PERMUTE_REVERSE(NUNITS, TYPE) \
+ for (int i = 0; i < NUNITS; i++) \
+ if (v_##TYPE##_out[i] != v_##TYPE##_in1[NUNITS - 1 - i]) \
+ __builtin_abort ();
+
+#define CHECK_PERMUTE_DOUBLE(NUNITS, TYPE) \
+ for (int i = 0; i < NUNITS; i++) \
+ { \
+ int new_index = i * 2; \
+ if (new_index < NUNITS \
+ && v_##TYPE##_out[i] != v_##TYPE##_in1[new_index]) \
+ __builtin_abort (); \
+ if (new_index >= NUNITS \
+ && v_##TYPE##_out[i] != v_##TYPE##_in2[new_index % NUNITS]) \
+ __builtin_abort (); \
+ }
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-1.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(2, 3, 79, vnx2qi)
+ permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+ CHECK_PERMUTE_SINGLE(2, 3*1+79, vnx2qi)
+ INIT_PERMUTE(4, 2, -69, vnx4qi)
+ permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+ CHECK_PERMUTE_SINGLE(4, 2*1+-69, vnx4qi)
+ INIT_PERMUTE(8, 4, -33, vnx8qi)
+ permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+ CHECK_PERMUTE_SINGLE(8, 4*1+-33, vnx8qi)
+ INIT_PERMUTE(16, -3, 15, vnx16qi)
+ permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+ CHECK_PERMUTE_SINGLE(16, -3*1+15, vnx16qi)
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+ CHECK_PERMUTE_SINGLE(32, -1*1+30, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+ CHECK_PERMUTE_SINGLE(64, -1*1+66, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+ CHECK_PERMUTE_SINGLE(128, -1*1+38, vnx128qi)
+ INIT_PERMUTE(2, 2, 30238, vnx2hi)
+ permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+ CHECK_PERMUTE_SINGLE(2, 2*1+30238, vnx2hi)
+ INIT_PERMUTE(4, -45, -2345, vnx4hi)
+ permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+ CHECK_PERMUTE_SINGLE(4, -45*1+-2345, vnx4hi)
+ INIT_PERMUTE(8, 98, -18415, vnx8hi)
+ permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+ CHECK_PERMUTE_SINGLE(8, 98*1+-18415, vnx8hi)
+ INIT_PERMUTE(16, 56, 3299, vnx16hi)
+ permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+ CHECK_PERMUTE_SINGLE(16, 56*1+3299, vnx16hi)
+ INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+ CHECK_PERMUTE_SINGLE(32, 15641*1+-9156, vnx32hi)
+ INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+ CHECK_PERMUTE_SINGLE(64, -25641*1+8093, vnx64hi)
+ INIT_PERMUTE(2, -428, -15651, vnx2si)
+ permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+ CHECK_PERMUTE_SINGLE(2, -428*1+-15651, vnx2si)
+ INIT_PERMUTE(4, 208, -55651, vnx4si)
+ permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+ CHECK_PERMUTE_SINGLE(4, 208*1+-55651, vnx4si)
+ INIT_PERMUTE(8, 808, 75651, vnx8si)
+ permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+ CHECK_PERMUTE_SINGLE(8, 808*1+75651, vnx8si)
+ INIT_PERMUTE(16, 816, -8941561, vnx16si)
+ permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+ CHECK_PERMUTE_SINGLE(16, 816*1+-8941561, vnx16si)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+ CHECK_PERMUTE_SINGLE(32, -532*1+98416, vnx32si)
+ INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+ permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+ CHECK_PERMUTE_SINGLE(2, -4161*1+9551616, vnx2di)
+ INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+ permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+ CHECK_PERMUTE_SINGLE(4, 7259*1+-15644961, vnx4di)
+ INIT_PERMUTE(8, 351, 9156651, vnx8di)
+ permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+ CHECK_PERMUTE_SINGLE(8, 351*1+9156651, vnx8di)
+ INIT_PERMUTE(16, 11, -816196231, vnx16di)
+ permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+ CHECK_PERMUTE_SINGLE(16, 11*1+-816196231, vnx16di)
+ INIT_PERMUTE(2, 4552, -89, vnx2sf)
+ permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+ CHECK_PERMUTE_SINGLE(2, (4552+-89), vnx2sf)
+ INIT_PERMUTE(4, 685, 7961, vnx4sf)
+ permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+ CHECK_PERMUTE_SINGLE(4, 685+7961, vnx4sf)
+ INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+ permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+ CHECK_PERMUTE_SINGLE(8, 3927*1+16513, vnx8sf)
+ INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+ permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+ CHECK_PERMUTE_SINGLE(16, -68*1+16156571, vnx16sf)
+ INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+ CHECK_PERMUTE_SINGLE(32, 9985*1+1561318, vnx32sf)
+ INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+ permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+ CHECK_PERMUTE_SINGLE(2, -1565.1561*1+-5641565.515, vnx2df)
+ INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+ permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+ CHECK_PERMUTE_SINGLE(4, -189.14897196*1+-15616547.5165574, vnx4df)
+ INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+ permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+ CHECK_PERMUTE_SINGLE(8, 651.158691561*1+-56163.1655411, vnx8df)
+ INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+ permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+ CHECK_PERMUTE_SINGLE(16, 58.91516377*1+251465.81561, vnx16df)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-2.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+ CHECK_PERMUTE_SINGLE(32, -1*31+30, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+ CHECK_PERMUTE_SINGLE(64, -1*31+66, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+ CHECK_PERMUTE_SINGLE(128, -1*31+38, vnx128qi)
+ INIT_PERMUTE(32, 156, -9156, vnx32hi)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+ CHECK_PERMUTE_SINGLE(32, 156*31+-9156, vnx32hi)
+ INIT_PERMUTE(64, -251, 8093, vnx64hi)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+ CHECK_PERMUTE_SINGLE(64, -251*31+8093, vnx64hi)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+ CHECK_PERMUTE_SINGLE(32, -532*31+98416, vnx32si)
+ INIT_PERMUTE(32, 995, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+ CHECK_PERMUTE_SINGLE(32, 995*31+1561318, vnx32sf)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-3.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+ CHECK_PERMUTE_SINGLE(64, -1*55+66, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+ CHECK_PERMUTE_SINGLE(128, -1*55+38, vnx128qi)
+ INIT_PERMUTE(64, -251, 8093, vnx64hi)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+ CHECK_PERMUTE_SINGLE(64, -251*55+8093, vnx64hi)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-4.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(2, 3, 79, vnx2qi)
+ permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2qi)
+ INIT_PERMUTE(4, 2, -69, vnx4qi)
+ permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4qi)
+ INIT_PERMUTE(8, 4, -33, vnx8qi)
+ permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8qi)
+ INIT_PERMUTE(16, -3, 15, vnx16qi)
+ permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16qi)
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+ CHECK_PERMUTE_REVERSE(64, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+ CHECK_PERMUTE_REVERSE(128, vnx128qi)
+ INIT_PERMUTE(2, 2, 30238, vnx2hi)
+ permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2hi)
+ INIT_PERMUTE(4, -45, -2345, vnx4hi)
+ permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4hi)
+ INIT_PERMUTE(8, 98, -18415, vnx8hi)
+ permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8hi)
+ INIT_PERMUTE(16, 56, 3299, vnx16hi)
+ permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16hi)
+ INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32hi)
+ INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+ CHECK_PERMUTE_REVERSE(64, vnx64hi)
+ INIT_PERMUTE(2, -428, -15651, vnx2si)
+ permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2si)
+ INIT_PERMUTE(4, 208, -55651, vnx4si)
+ permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4si)
+ INIT_PERMUTE(8, 808, 75651, vnx8si)
+ permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8si)
+ INIT_PERMUTE(16, 816, -8941561, vnx16si)
+ permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16si)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32si)
+ INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+ permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2di)
+ INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+ permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4di)
+ INIT_PERMUTE(8, 351, 9156651, vnx8di)
+ permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8di)
+ INIT_PERMUTE(16, 11, -816196231, vnx16di)
+ permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16di)
+ INIT_PERMUTE(2, 4552, -89, vnx2sf)
+ permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2sf)
+ INIT_PERMUTE(4, 685, 7961, vnx4sf)
+ permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4sf)
+ INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+ permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8sf)
+ INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+ permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16sf)
+ INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32sf)
+ INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+ permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2df)
+ INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+ permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4df)
+ INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+ permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8df)
+ INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+ permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16df)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,137 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-5.c"
+
+#define MASK_2(X, Y) (Y) - 1 - (X) + (Y), (Y) - 2 - (X) + (Y)
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+#define INIT_MASK(TYPE, NUNITS) \
+ TYPE TYPE##_mask = {MASK_##NUNITS (0, NUNITS)};
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(2, 3, 79, vnx2qi)
+ INIT_MASK (vnx2qi, 2)
+ permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, vnx2qi_mask, &v_vnx2qi_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2qi)
+ INIT_PERMUTE(4, 2, -69, vnx4qi)
+ INIT_MASK (vnx4qi, 4)
+ permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, vnx4qi_mask, &v_vnx4qi_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4qi)
+ INIT_PERMUTE(8, 4, -33, vnx8qi)
+ INIT_MASK (vnx8qi, 8)
+ permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, vnx8qi_mask, &v_vnx8qi_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8qi)
+ INIT_PERMUTE(16, -3, 15, vnx16qi)
+ INIT_MASK (vnx16qi, 16)
+ permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, vnx16qi_mask, &v_vnx16qi_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16qi)
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ INIT_MASK (vnx32qi, 32)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, vnx32qi_mask, &v_vnx32qi_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ INIT_MASK (vnx64qi, 64)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, vnx64qi_mask, &v_vnx64qi_out);
+ CHECK_PERMUTE_REVERSE(64, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ INIT_MASK (vnx128qi, 128)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, vnx128qi_mask, &v_vnx128qi_out);
+ CHECK_PERMUTE_REVERSE(128, vnx128qi)
+ INIT_PERMUTE(2, 2, 30238, vnx2hi)
+ INIT_MASK (vnx2hi, 2)
+ permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, vnx2hi_mask, &v_vnx2hi_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2hi)
+ INIT_PERMUTE(4, -45, -2345, vnx4hi)
+ INIT_MASK (vnx4hi, 4)
+ permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, vnx4hi_mask, &v_vnx4hi_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4hi)
+ INIT_PERMUTE(8, 98, -18415, vnx8hi)
+ INIT_MASK (vnx8hi, 8)
+ permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, vnx8hi_mask, &v_vnx8hi_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8hi)
+ INIT_PERMUTE(16, 56, 3299, vnx16hi)
+ INIT_MASK (vnx16hi, 16)
+ permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, vnx16hi_mask, &v_vnx16hi_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16hi)
+ INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+ INIT_MASK (vnx32hi, 32)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, vnx32hi_mask, &v_vnx32hi_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32hi)
+ INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+ INIT_MASK (vnx64hi, 64)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, vnx64hi_mask, &v_vnx64hi_out);
+ CHECK_PERMUTE_REVERSE(64, vnx64hi)
+ INIT_PERMUTE(2, -428, -15651, vnx2si)
+ INIT_MASK (vnx2si, 2)
+ permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, vnx2si_mask, &v_vnx2si_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2si)
+ INIT_PERMUTE(4, 208, -55651, vnx4si)
+ INIT_MASK (vnx4si, 4)
+ permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, vnx4si_mask, &v_vnx4si_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4si)
+ INIT_PERMUTE(8, 808, 75651, vnx8si)
+ INIT_MASK (vnx8si, 8)
+ permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, vnx8si_mask, &v_vnx8si_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8si)
+ INIT_PERMUTE(16, 816, -8941561, vnx16si)
+ INIT_MASK (vnx16si, 16)
+ permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, vnx16si_mask, &v_vnx16si_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16si)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ INIT_MASK (vnx32si, 32)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, vnx32si_mask, &v_vnx32si_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32si)
+ INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+ INIT_MASK (vnx2di, 2)
+ permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, vnx2di_mask, &v_vnx2di_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2di)
+ INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+ INIT_MASK (vnx4di, 4)
+ permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, vnx4di_mask, &v_vnx4di_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4di)
+ INIT_PERMUTE(8, 351, 9156651, vnx8di)
+ INIT_MASK (vnx8di, 8)
+ permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, vnx8di_mask, &v_vnx8di_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8di)
+ INIT_PERMUTE(16, 11, -816196231, vnx16di)
+ INIT_MASK (vnx16di, 16)
+ permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, vnx16di_mask, &v_vnx16di_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16di)
+ INIT_PERMUTE(2, 4552, -89, vnx2sf)
+ permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, vnx2si_mask, &v_vnx2sf_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2sf)
+ INIT_PERMUTE(4, 685, 7961, vnx4sf)
+ permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, vnx4si_mask, &v_vnx4sf_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4sf)
+ INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+ permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, vnx8si_mask, &v_vnx8sf_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8sf)
+ INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+ permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, vnx16si_mask, &v_vnx16sf_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16sf)
+ INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, vnx32si_mask, &v_vnx32sf_out);
+ CHECK_PERMUTE_REVERSE(32, vnx32sf)
+ INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+ permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, vnx2di_mask, &v_vnx2df_out);
+ CHECK_PERMUTE_REVERSE(2, vnx2df)
+ INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+ permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, vnx4di_mask, &v_vnx4df_out);
+ CHECK_PERMUTE_REVERSE(4, vnx4df)
+ INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+ permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, vnx8di_mask, &v_vnx8df_out);
+ CHECK_PERMUTE_REVERSE(8, vnx8df)
+ INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+ permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, vnx16di_mask, &v_vnx16df_out);
+ CHECK_PERMUTE_REVERSE(16, vnx16df)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-6.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+ INIT_PERMUTE(2, 3, 79, vnx2qi)
+ permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+ CHECK_PERMUTE_SINGLE(2, 3*1-79, vnx2qi)
+ INIT_PERMUTE(4, 2, -69, vnx4qi)
+ permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+ CHECK_PERMUTE_SINGLE(4, 2*1-(-69), vnx4qi)
+ INIT_PERMUTE(8, 4, -33, vnx8qi)
+ permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+ CHECK_PERMUTE_SINGLE(8, 4*1-(-33), vnx8qi)
+ INIT_PERMUTE(16, -3, 15, vnx16qi)
+ permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+ CHECK_PERMUTE_SINGLE(16, -3*1-15, vnx16qi)
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+ CHECK_PERMUTE_SINGLE(32, -1*1-30, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+ CHECK_PERMUTE_SINGLE(64, -1*1-66, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+ CHECK_PERMUTE_SINGLE(128, -1*1-38, vnx128qi)
+ INIT_PERMUTE(2, 2, 30238, vnx2hi)
+ permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+ CHECK_PERMUTE_SINGLE(2, 2*1-30238, vnx2hi)
+ INIT_PERMUTE(4, -45, -2345, vnx4hi)
+ permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+ CHECK_PERMUTE_SINGLE(4, -45*1-(-2345), vnx4hi)
+ INIT_PERMUTE(8, 98, -18415, vnx8hi)
+ permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+ CHECK_PERMUTE_SINGLE(8, 98*1-(-18415), vnx8hi)
+ INIT_PERMUTE(16, 56, 3299, vnx16hi)
+ permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+ CHECK_PERMUTE_SINGLE(16, 56*1-3299, vnx16hi)
+ INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+ CHECK_PERMUTE_SINGLE(32, 15641*1-(-9156), vnx32hi)
+ INIT_PERMUTE(64, -2564, 8093, vnx64hi)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+ CHECK_PERMUTE_SINGLE(64, -2564*1-8093, vnx64hi)
+ INIT_PERMUTE(2, -428, -15651, vnx2si)
+ permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+ CHECK_PERMUTE_SINGLE(2, -428*1-(-15651), vnx2si)
+ INIT_PERMUTE(4, 208, -55651, vnx4si)
+ permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+ CHECK_PERMUTE_SINGLE(4, 208*1-(-55651), vnx4si)
+ INIT_PERMUTE(8, 808, 75651, vnx8si)
+ permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+ CHECK_PERMUTE_SINGLE(8, 808*1-75651, vnx8si)
+ INIT_PERMUTE(16, 816, -8941561, vnx16si)
+ permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+ CHECK_PERMUTE_SINGLE(16, 816*1-(-8941561), vnx16si)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+ CHECK_PERMUTE_SINGLE(32, -532*1-98416, vnx32si)
+ INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+ permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+ CHECK_PERMUTE_SINGLE(2, -4161*1-9551616, vnx2di)
+ INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+ permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+ CHECK_PERMUTE_SINGLE(4, 7259*1-(-15644961), vnx4di)
+ INIT_PERMUTE(8, 351, 9156651, vnx8di)
+ permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+ CHECK_PERMUTE_SINGLE(8, 351*1-9156651, vnx8di)
+ INIT_PERMUTE(16, 11, -816196231, vnx16di)
+ permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+ CHECK_PERMUTE_SINGLE(16, 11*1-(-816196231), vnx16di)
+ INIT_PERMUTE(2, 4552, -89, vnx2sf)
+ permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+ CHECK_PERMUTE_SINGLE(2, (4552-(-89)), vnx2sf)
+ INIT_PERMUTE(4, 685, 7961, vnx4sf)
+ permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+ CHECK_PERMUTE_SINGLE(4, 685-7961, vnx4sf)
+ INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+ permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+ CHECK_PERMUTE_SINGLE(8, 3927*1-16513, vnx8sf)
+ INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+ permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+ CHECK_PERMUTE_SINGLE(16, -68*1-16156571, vnx16sf)
+ INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+ CHECK_PERMUTE_SINGLE(32, 9985*1-1561318, vnx32sf)
+ INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+ permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+ CHECK_PERMUTE_SINGLE(2, -1565.1561*1-(-5641565.515), vnx2df)
+ INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+ permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+ CHECK_PERMUTE_SINGLE(4, -189.14897196*1-(-15616547.5165574), vnx4df)
+ INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+ permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+ CHECK_PERMUTE_SINGLE(8, 651.158691561*1-(-56163.1655411), vnx8df)
+ INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+ permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+ CHECK_PERMUTE_SINGLE(16, 58.91516377*1-251465.81561, vnx16df)
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,135 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O0" } */
+
+#include "perm-7.c"
+
+#define MASK_2(X) X, X + 2
+#define MASK_4(X) MASK_2 (X), MASK_2 (X + 4)
+#define MASK_8(X) MASK_4 (X), MASK_4 (X + 8)
+#define MASK_16(X) MASK_8 (X), MASK_8 (X + 16)
+#define MASK_32(X) MASK_16 (X), MASK_16 (X + 32)
+#define MASK_64(X) MASK_32 (X), MASK_32 (X + 64)
+#define MASK_128(X) MASK_64 (X), MASK_64 (X + 128)
+
+#define INIT_MASK(TYPE, NUNITS) TYPE TYPE##_mask = {MASK_##NUNITS (0)};
+
+int __attribute__ ((optimize (0))) main ()
+{
+ INIT_PERMUTE(2, 3, 79, vnx2qi)
+ INIT_MASK (vnx2qi, 2)
+ permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, vnx2qi_mask, &v_vnx2qi_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2qi)
+ INIT_PERMUTE(4, 2, -69, vnx4qi)
+ INIT_MASK (vnx4qi, 4)
+ permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, vnx4qi_mask, &v_vnx4qi_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4qi)
+ INIT_PERMUTE(8, 4, -33, vnx8qi)
+ INIT_MASK (vnx8qi, 8)
+ permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, vnx8qi_mask, &v_vnx8qi_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8qi)
+ INIT_PERMUTE(16, -3, 15, vnx16qi)
+ INIT_MASK (vnx16qi, 16)
+ permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, vnx16qi_mask, &v_vnx16qi_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16qi)
+ INIT_PERMUTE(32, -1, 30, vnx32qi)
+ INIT_MASK (vnx32qi, 32)
+ permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, vnx32qi_mask, &v_vnx32qi_out);
+ CHECK_PERMUTE_DOUBLE(32, vnx32qi)
+ INIT_PERMUTE(64, -1, 66, vnx64qi)
+ INIT_MASK (vnx64qi, 64)
+ permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, vnx64qi_mask, &v_vnx64qi_out);
+ CHECK_PERMUTE_DOUBLE(64, vnx64qi)
+ INIT_PERMUTE(128, -1, 38, vnx128qi)
+ INIT_MASK (vnx128qi, 128)
+ permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, vnx128qi_mask, &v_vnx128qi_out);
+ CHECK_PERMUTE_DOUBLE(128, vnx128qi)
+ INIT_PERMUTE(2, 2, 30238, vnx2hi)
+ INIT_MASK (vnx2hi, 2)
+ permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, vnx2hi_mask, &v_vnx2hi_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2hi)
+ INIT_PERMUTE(4, -45, -2345, vnx4hi)
+ INIT_MASK (vnx4hi, 4)
+ permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, vnx4hi_mask, &v_vnx4hi_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4hi)
+ INIT_PERMUTE(8, 98, -18415, vnx8hi)
+ INIT_MASK (vnx8hi, 8)
+ permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, vnx8hi_mask, &v_vnx8hi_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8hi)
+ INIT_PERMUTE(16, 56, 3299, vnx16hi)
+ INIT_MASK (vnx16hi, 16)
+ permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, vnx16hi_mask, &v_vnx16hi_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16hi)
+ INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+ INIT_MASK (vnx32hi, 32)
+ permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, vnx32hi_mask, &v_vnx32hi_out);
+ CHECK_PERMUTE_DOUBLE(32, vnx32hi)
+ INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+ INIT_MASK (vnx64hi, 64)
+ permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, vnx64hi_mask, &v_vnx64hi_out);
+ CHECK_PERMUTE_DOUBLE(64, vnx64hi)
+ INIT_PERMUTE(2, -428, -15651, vnx2si)
+ INIT_MASK (vnx2si, 2)
+ permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, vnx2si_mask, &v_vnx2si_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2si)
+ INIT_PERMUTE(4, 208, -55651, vnx4si)
+ INIT_MASK (vnx4si, 4)
+ permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, vnx4si_mask, &v_vnx4si_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4si)
+ INIT_PERMUTE(8, 808, 75651, vnx8si)
+ INIT_MASK (vnx8si, 8)
+ permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, vnx8si_mask, &v_vnx8si_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8si)
+ INIT_PERMUTE(16, 816, -8941561, vnx16si)
+ INIT_MASK (vnx16si, 16)
+ permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, vnx16si_mask, &v_vnx16si_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16si)
+ INIT_PERMUTE(32, -532, 98416, vnx32si)
+ INIT_MASK (vnx32si, 32)
+ permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, vnx32si_mask, &v_vnx32si_out);
+ CHECK_PERMUTE_DOUBLE(32, vnx32si)
+ INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+ INIT_MASK (vnx2di, 2)
+ permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, vnx2di_mask, &v_vnx2di_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2di)
+ INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+ INIT_MASK (vnx4di, 4)
+ permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, vnx4di_mask, &v_vnx4di_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4di)
+ INIT_PERMUTE(8, 351, 9156651, vnx8di)
+ INIT_MASK (vnx8di, 8)
+ permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, vnx8di_mask, &v_vnx8di_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8di)
+ INIT_PERMUTE(16, 11, -816196231, vnx16di)
+ INIT_MASK (vnx16di, 16)
+ permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, vnx16di_mask, &v_vnx16di_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16di)
+ INIT_PERMUTE(2, 4552, -89, vnx2sf)
+ permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, vnx2si_mask, &v_vnx2sf_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2sf)
+ INIT_PERMUTE(4, 685, 7961, vnx4sf)
+ permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, vnx4si_mask, &v_vnx4sf_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4sf)
+ INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+ permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, vnx8si_mask, &v_vnx8sf_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8sf)
+ INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+ permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, vnx16si_mask, &v_vnx16sf_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16sf)
+ INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+ permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, vnx32si_mask, &v_vnx32sf_out);
+ CHECK_PERMUTE_DOUBLE(32, vnx32sf)
+ INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+ permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, vnx2di_mask, &v_vnx2df_out);
+ CHECK_PERMUTE_DOUBLE(2, vnx2df)
+ INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+ permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, vnx4di_mask, &v_vnx4df_out);
+ CHECK_PERMUTE_DOUBLE(4, vnx4df)
+ INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+ permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, vnx8di_mask, &v_vnx8df_out);
+ CHECK_PERMUTE_DOUBLE(8, vnx8df)
+ INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+ permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, vnx16di_mask, &v_vnx16df_out);
+ CHECK_PERMUTE_DOUBLE(16, vnx16df)
+
+ return 0;
+}