[V2] RISC-V: Support RVV permutation auto-vectorization

Message ID 20230601023615.89715-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series [V2] RISC-V: Support RVV permutation auto-vectorization |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai June 1, 2023, 2:36 a.m. UTC
  From: Juzhe-Zhong <juzhe.zhong@rivai.ai>

This patch supports vector permutation for VLS only by vec_perm pattern.
We will support TARGET_VECTORIZE_VEC_PERM_CONST to support VLA permutation
in the future.

Fixed following comments from Robin.
Ok for trunk?

gcc/ChangeLog:

        * config/riscv/autovec.md (vec_perm<mode>): New pattern.
        * config/riscv/predicates.md (vector_perm_operand): New predicate.
        * config/riscv/riscv-protos.h (enum insn_type): New enum.
        (expand_vec_perm): New function.
        * config/riscv/riscv-v.cc (const_vec_all_in_range_p): Ditto.
        (gen_const_vector_dup): Ditto.
        (emit_vlmax_gather_insn): Ditto.
        (emit_vlmax_masked_gather_mu_insn): Ditto.
        (expand_vec_perm): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-1.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-2.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-3.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-5.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-6.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm-7.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-1.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-2.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-3.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-4.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-5.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-6.c: New test.
        * gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-7.c: New test.

---
 gcc/config/riscv/autovec.md                   |  18 +++
 gcc/config/riscv/predicates.md                |   4 +
 gcc/config/riscv/riscv-protos.h               |   2 +
 gcc/config/riscv/riscv-v.cc                   | 153 ++++++++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm-1.c      |  58 +++++++
 .../riscv/rvv/autovec/vls-vlmax/perm-2.c      |  33 ++++
 .../riscv/rvv/autovec/vls-vlmax/perm-3.c      |  29 ++++
 .../riscv/rvv/autovec/vls-vlmax/perm-4.c      |  58 +++++++
 .../riscv/rvv/autovec/vls-vlmax/perm-5.c      |  49 ++++++
 .../riscv/rvv/autovec/vls-vlmax/perm-6.c      |  58 +++++++
 .../riscv/rvv/autovec/vls-vlmax/perm-7.c      |  49 ++++++
 .../riscv/rvv/autovec/vls-vlmax/perm.h        |  70 ++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-1.c  | 104 ++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-2.c  |  32 ++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-3.c  |  20 +++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-4.c  | 104 ++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-5.c  | 137 ++++++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-6.c  | 104 ++++++++++++
 .../riscv/rvv/autovec/vls-vlmax/perm_run-7.c  | 135 ++++++++++++++++
 19 files changed, 1217 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-6.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-7.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-3.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-4.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-5.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-6.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-7.c
  

Comments

Jeff Law June 1, 2023, 6:48 p.m. UTC | #1
On 5/31/23 20:36, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> This patch supports vector permutation for VLS only by vec_perm pattern.
> We will support TARGET_VECTORIZE_VEC_PERM_CONST to support VLA permutation
> in the future.
> 
> Fixed following comments from Robin.
> Ok for trunk?
> 
> gcc/ChangeLog:
> 
>          * config/riscv/autovec.md (vec_perm<mode>): New pattern.
>          * config/riscv/predicates.md (vector_perm_operand): New predicate.
>          * config/riscv/riscv-protos.h (enum insn_type): New enum.
>          (expand_vec_perm): New function.
>          * config/riscv/riscv-v.cc (const_vec_all_in_range_p): Ditto.
>          (gen_const_vector_dup): Ditto.
>          (emit_vlmax_gather_insn): Ditto.
>          (emit_vlmax_masked_gather_mu_insn): Ditto.
>          (expand_vec_perm): Ditto.
OK.
jeff
  
Li, Pan2 via Gcc-patches June 2, 2023, 1:03 a.m. UTC | #2
Committed, thanks Jeff.

Pan

-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Jeff Law via Gcc-patches
Sent: Friday, June 2, 2023 2:49 AM
To: juzhe.zhong@rivai.ai; gcc-patches@gcc.gnu.org
Cc: kito.cheng@gmail.com; kito.cheng@sifive.com; palmer@dabbelt.com; palmer@rivosinc.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH V2] RISC-V: Support RVV permutation auto-vectorization



On 5/31/23 20:36, juzhe.zhong@rivai.ai wrote:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
> 
> This patch supports vector permutation for VLS only by vec_perm pattern.
> We will support TARGET_VECTORIZE_VEC_PERM_CONST to support VLA 
> permutation in the future.
> 
> Fixed following comments from Robin.
> Ok for trunk?
> 
> gcc/ChangeLog:
> 
>          * config/riscv/autovec.md (vec_perm<mode>): New pattern.
>          * config/riscv/predicates.md (vector_perm_operand): New predicate.
>          * config/riscv/riscv-protos.h (enum insn_type): New enum.
>          (expand_vec_perm): New function.
>          * config/riscv/riscv-v.cc (const_vec_all_in_range_p): Ditto.
>          (gen_const_vector_dup): Ditto.
>          (emit_vlmax_gather_insn): Ditto.
>          (emit_vlmax_masked_gather_mu_insn): Ditto.
>          (expand_vec_perm): Ditto.
OK.
jeff
  

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 3a1e1316732..5c3aad7ee44 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -83,6 +83,24 @@ 
   }
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT,FP] permutation
+;; -------------------------------------------------------------------------
+;; This pattern permutes the elements of the input vectors.
+;; -------------------------------------------------------------------------
+
+(define_expand "vec_perm<mode>"
+  [(match_operand:V 0 "register_operand")
+   (match_operand:V 1 "register_operand")
+   (match_operand:V 2 "register_operand")
+   (match_operand:<VINDEX> 3 "vector_perm_operand")]
+  "TARGET_VECTOR && GET_MODE_NUNITS (<MODE>mode).is_constant ()"
+  {
+    riscv_vector::expand_vec_perm (operands);
+    DONE;
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT,FP] Initialize from individual elements
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index f7c4a3f030f..1ed84850e35 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -330,6 +330,10 @@ 
        (and (match_code "const_vector")
             (match_test "riscv_vector::const_vec_all_same_in_range_p (op, 0, 31)"))))
 
+(define_predicate "vector_perm_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_code "const_vector")))
+
 (define_predicate "ltge_operator"
   (match_code "lt,ltu,ge,geu"))
 
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 277845673d4..d032f569a36 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -137,6 +137,7 @@  enum insn_type
   RVV_MISC_OP = 1,
   RVV_UNOP = 2,
   RVV_BINOP = 3,
+  RVV_BINOP_MU = RVV_BINOP + 2,
   RVV_MERGE_OP = 4,
   RVV_CMP_OP = 4,
   RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand.  */
@@ -240,6 +241,7 @@  opt_machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
 void expand_vec_init (rtx, rtx);
 void expand_vcond (rtx *);
+void expand_vec_perm (rtx *);
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum vxrm_field_enum
 {
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b381970140d..1cd3bd3438e 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -259,6 +259,47 @@  const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
 	  && IN_RANGE (INTVAL (elt), minval, maxval));
 }
 
+/* Return true if VEC is a constant in which every element is in the range
+   [MINVAL, MAXVAL].  The elements do not need to have the same value.
+
+   This function also exists in aarch64; we may unify it in the middle-end
+   in the future.  */
+
+static bool
+const_vec_all_in_range_p (rtx vec, HOST_WIDE_INT minval, HOST_WIDE_INT maxval)
+{
+  if (!CONST_VECTOR_P (vec)
+      || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
+    return false;
+
+  int nunits;
+  if (!CONST_VECTOR_STEPPED_P (vec))
+    nunits = const_vector_encoded_nelts (vec);
+  else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
+    return false;
+
+  for (int i = 0; i < nunits; i++)
+    {
+      rtx vec_elem = CONST_VECTOR_ELT (vec, i);
+      if (!CONST_INT_P (vec_elem)
+	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
+	return false;
+    }
+  return true;
+}
+
+/* Return a constant vector of mode MODE in which every element is VAL.
+
+   This function also exists in aarch64; we may unify it in the middle-end
+   in the future.  */
+
+static rtx
+gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
+{
+  rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
+  return gen_const_vec_duplicate (mode, c);
+}
+
 /* Emit a vlmax vsetvl instruction.  This should only be used when
    optimization is disabled or after vsetvl insertion pass.  */
 void
@@ -1927,4 +1968,116 @@  expand_vcond (rtx *ops)
     gen_vcond_mask (data_mode, data_mode, ops[0], ops[1], ops[2], mask));
 }
 
+/* Emit a VLMAX vrgather instruction.  Emit vrgather.vx/vi when SEL is a
+   const duplicate vector; otherwise, emit vrgather.vv.  */
+static void
+emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
+{
+  rtx elt;
+  insn_code icode;
+  machine_mode data_mode = GET_MODE (target);
+  if (const_vec_duplicate_p (sel, &elt))
+    {
+      icode = code_for_pred_gather_scalar (data_mode);
+      sel = elt;
+    }
+  else
+    icode = code_for_pred_gather (data_mode);
+  rtx ops[] = {target, op, sel};
+  emit_vlmax_insn (icode, RVV_BINOP, ops);
+}
+
+static void
+emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
+{
+  rtx elt;
+  insn_code icode;
+  machine_mode data_mode = GET_MODE (target);
+  if (const_vec_duplicate_p (sel, &elt))
+    {
+      icode = code_for_pred_gather_scalar (data_mode);
+      sel = elt;
+    }
+  else
+    icode = code_for_pred_gather (data_mode);
+  rtx ops[] = {target, mask, target, op, sel};
+  emit_vlmax_masked_mu_insn (icode, RVV_BINOP_MU, ops);
+}
+
+/* Implement vec_perm<mode>.  */
+
+void
+expand_vec_perm (rtx *operands)
+{
+  rtx target = operands[0];
+  rtx op0 = operands[1];
+  rtx op1 = operands[2];
+  rtx sel = operands[3];
+  machine_mode data_mode = GET_MODE (target);
+  machine_mode sel_mode = GET_MODE (sel);
+
+  /* Enforced by the pattern condition.  */
+  int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
+
+  /* Check if SEL only references the first values vector.  If each select
+     index is in the range [0, nunits - 1], a single vrgather instruction is
+     enough.  */
+  if (const_vec_all_in_range_p (sel, 0, nunits - 1))
+    {
+      emit_vlmax_gather_insn (target, op0, sel);
+      return;
+    }
+
+  /* Check if the two values vectors are the same or SEL is a duplicate.  */
+  if (rtx_equal_p (op0, op1) || const_vec_duplicate_p (sel))
+    {
+      /* Note: vec_perm indices are supposed to wrap when they go beyond the
+	 size of the two value vectors, i.e. the upper bits of the indices
+	 are effectively ignored.  RVV vrgather instead produces 0 for any
+	 out-of-range indices, so we need to modulo all the vec_perm indices
+	 to ensure they are all in range of [0, nunits - 1].  */
+      rtx max_sel = gen_const_vector_dup (sel_mode, nunits - 1);
+      rtx sel_mod = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0,
+					 OPTAB_DIRECT);
+      emit_vlmax_gather_insn (target, op1, sel_mod);
+      return;
+    }
+
+  /* Note: vec_perm indices are supposed to wrap when they go beyond the
+     size of the two value vectors, i.e. the upper bits of the indices
+     are effectively ignored.  RVV vrgather instead produces 0 for any
+     out-of-range indices, so we need to modulo all the vec_perm indices
+     to ensure they are all in range of [0, 2 * nunits - 1].  */
+  rtx max_sel = gen_const_vector_dup (sel_mode, 2 * nunits - 1);
+  rtx sel_mod
+    = expand_simple_binop (sel_mode, AND, sel, max_sel, NULL, 0, OPTAB_DIRECT);
+
+  /* The following sequence handles the general case of
+     __builtin_shufflevector (vec1, vec2, index...), where each index can be
+     any value in the range [0, 2 * nunits - 1].  */
+  machine_mode mask_mode;
+  mask_mode = get_mask_mode (data_mode).require ();
+  rtx mask = gen_reg_rtx (mask_mode);
+  max_sel = gen_const_vector_dup (sel_mode, nunits);
+
+  /* Step1: generate a mask that selects every element whose index is
+            >= nunits.  */
+  expand_vec_cmp (mask, GEU, sel_mod, max_sel);
+
+  /* Step2: gather the op0 values indexed by sel_mod into target;
+            we don't need to care about the result of the elements
+            whose index >= nunits.  */
+  emit_vlmax_gather_insn (target, op0, sel_mod);
+
+  /* Step3: shift the range from [nunits, max_of_mode] to
+            [0, max_of_mode - nunits].  */
+  rtx tmp = gen_reg_rtx (sel_mode);
+  rtx ops[] = {tmp, sel_mod, max_sel};
+  emit_vlmax_insn (code_for_pred (MINUS, sel_mode), RVV_BINOP, ops);
+
+  /* Step4: gather those into the previously masked-out elements
+	    of target.  */
+  emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
+}
+
 } // namespace riscv_vector
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-1.c
new file mode 100644
index 00000000000..58c2cd8ce23
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-1.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 1, 1
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+#define PERMUTE(TYPE, NUNITS)                                                  \
+  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2,     \
+					       TYPE *out)                      \
+  {                                                                            \
+    TYPE v                                                                     \
+      = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+    *(TYPE *) out = v;                                                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx2qi, 2)                                                                \
+  T (vnx4qi, 4)                                                                \
+  T (vnx8qi, 8)                                                                \
+  T (vnx16qi, 16)                                                              \
+  T (vnx32qi, 32)                                                              \
+  T (vnx64qi, 64)                                                              \
+  T (vnx128qi, 128)                                                            \
+  T (vnx2hi, 2)                                                                \
+  T (vnx4hi, 4)                                                                \
+  T (vnx8hi, 8)                                                                \
+  T (vnx16hi, 16)                                                              \
+  T (vnx32hi, 32)                                                              \
+  T (vnx64hi, 64)                                                              \
+  T (vnx2si, 2)                                                                \
+  T (vnx4si, 4)                                                                \
+  T (vnx8si, 8)                                                                \
+  T (vnx16si, 16)                                                              \
+  T (vnx32si, 32)                                                              \
+  T (vnx2di, 2)                                                                \
+  T (vnx4di, 4)                                                                \
+  T (vnx8di, 8)                                                                \
+  T (vnx16di, 16)                                                              \
+  T (vnx2sf, 2)                                                                \
+  T (vnx4sf, 4)                                                                \
+  T (vnx8sf, 8)                                                                \
+  T (vnx16sf, 16)                                                              \
+  T (vnx32sf, 32)                                                              \
+  T (vnx2df, 2)                                                                \
+  T (vnx4df, 4)                                                                \
+  T (vnx8df, 8)                                                                \
+  T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*1} 31 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-2.c
new file mode 100644
index 00000000000..d88b6461da5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-2.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 31, 31
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+#define PERMUTE(TYPE, NUNITS)                                                  \
+  void permute_##TYPE (TYPE values1, TYPE values2, TYPE *out)                  \
+  {                                                                            \
+    TYPE v                                                                     \
+      = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+    *(TYPE *) out = v;                                                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx32qi, 32)                                                              \
+  T (vnx64qi, 64)                                                              \
+  T (vnx128qi, 128)                                                            \
+  T (vnx32hi, 32)                                                              \
+  T (vnx64hi, 64)                                                              \
+  T (vnx32si, 32)                                                              \
+  T (vnx32sf, 32)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*31} 7 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-3.c
new file mode 100644
index 00000000000..110df490c6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-3.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) 55, 55
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+#define PERMUTE(TYPE, NUNITS)                                                  \
+  void permute_##TYPE (TYPE values1, TYPE values2, TYPE *out)                  \
+  {                                                                            \
+    TYPE v                                                                     \
+      = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+    *(TYPE *) out = v;                                                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx64qi, 64)                                                              \
+  T (vnx128qi, 128)                                                            \
+  T (vnx64hi, 64)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vx\tv[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+} 3 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
new file mode 100644
index 00000000000..179c8274a92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-4.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) (Y) - 1 - (X), (Y) - 2 - (X)
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+#define PERMUTE(TYPE, NUNITS)                                                  \
+  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2,     \
+					       TYPE *out)                      \
+  {                                                                            \
+    TYPE v                                                                     \
+      = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+    *(TYPE *) out = v;                                                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx2qi, 2)                                                                \
+  T (vnx4qi, 4)                                                                \
+  T (vnx8qi, 8)                                                                \
+  T (vnx16qi, 16)                                                              \
+  T (vnx32qi, 32)                                                              \
+  T (vnx64qi, 64)                                                              \
+  T (vnx128qi, 128)                                                            \
+  T (vnx2hi, 2)                                                                \
+  T (vnx4hi, 4)                                                                \
+  T (vnx8hi, 8)                                                                \
+  T (vnx16hi, 16)                                                              \
+  T (vnx32hi, 32)                                                              \
+  T (vnx64hi, 64)                                                              \
+  T (vnx2si, 2)                                                                \
+  T (vnx4si, 4)                                                                \
+  T (vnx8si, 8)                                                                \
+  T (vnx16si, 16)                                                              \
+  T (vnx32si, 32)                                                              \
+  T (vnx2di, 2)                                                                \
+  T (vnx4di, 4)                                                                \
+  T (vnx8di, 8)                                                                \
+  T (vnx16di, 16)                                                              \
+  T (vnx2sf, 2)                                                                \
+  T (vnx4sf, 4)                                                                \
+  T (vnx8sf, 8)                                                                \
+  T (vnx16sf, 16)                                                              \
+  T (vnx32sf, 32)                                                              \
+  T (vnx2df, 2)                                                                \
+  T (vnx4df, 4)                                                                \
+  T (vnx8df, 8)                                                                \
+  T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-5.c
new file mode 100644
index 00000000000..7117a492dc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-5.c
@@ -0,0 +1,49 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define PERMUTE(TYPE, TYPE2, NUNITS)                                           \
+  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2,     \
+					       TYPE2 mask, TYPE *out)          \
+  {                                                                            \
+    TYPE v = __builtin_shuffle (values1, values1, mask);                       \
+    *(TYPE *) out = v;                                                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx2qi, vnx2qi, 2)                                                        \
+  T (vnx4qi, vnx4qi, 4)                                                        \
+  T (vnx8qi, vnx8qi, 8)                                                        \
+  T (vnx16qi, vnx16qi, 16)                                                     \
+  T (vnx32qi, vnx32qi, 32)                                                     \
+  T (vnx64qi, vnx64qi, 64)                                                     \
+  T (vnx128qi, vnx128qi, 128)                                                  \
+  T (vnx2hi, vnx2hi, 2)                                                        \
+  T (vnx4hi, vnx4hi, 4)                                                        \
+  T (vnx8hi, vnx8hi, 8)                                                        \
+  T (vnx16hi, vnx16hi, 16)                                                     \
+  T (vnx32hi, vnx32hi, 32)                                                     \
+  T (vnx64hi, vnx64hi, 64)                                                     \
+  T (vnx2si, vnx2si, 2)                                                        \
+  T (vnx4si, vnx4si, 4)                                                        \
+  T (vnx8si, vnx8si, 8)                                                        \
+  T (vnx16si, vnx16si, 16)                                                     \
+  T (vnx32si, vnx32si, 32)                                                     \
+  T (vnx2di, vnx2di, 2)                                                        \
+  T (vnx4di, vnx4di, 4)                                                        \
+  T (vnx8di, vnx8di, 8)                                                        \
+  T (vnx16di, vnx16di, 16)                                                     \
+  T (vnx2sf, vnx2si, 2)                                                        \
+  T (vnx4sf, vnx4si, 4)                                                        \
+  T (vnx8sf, vnx8si, 8)                                                        \
+  T (vnx16sf, vnx16si, 16)                                                     \
+  T (vnx32sf, vnx32si, 32)                                                     \
+  T (vnx2df, vnx2di, 2)                                                        \
+  T (vnx4df, vnx4di, 4)                                                        \
+  T (vnx8df, vnx8di, 8)                                                        \
+  T (vnx16df, vnx16di, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 31 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-6.c
new file mode 100644
index 00000000000..67b2e6f680e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-6.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+#define MASK_2(X, Y) (Y) + 1, (Y) + 1 /* Two lanes, both index Y + 1.  */
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+/* permute_TYPE stores the MASK_<NUNITS> shuffle of its two inputs to *OUT.  */
+#define PERMUTE(TYPE, NUNITS)                                                  \
+  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2,     \
+					       TYPE *out)                      \
+  {                                                                            \
+    *out                                                                       \
+      = __builtin_shufflevector (values1, values2, MASK_##NUNITS (0, NUNITS)); \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx2qi, 2)                                                                \
+  T (vnx4qi, 4)                                                                \
+  T (vnx8qi, 8)                                                                \
+  T (vnx16qi, 16)                                                              \
+  T (vnx32qi, 32)                                                              \
+  T (vnx64qi, 64)                                                              \
+  T (vnx128qi, 128)                                                            \
+  T (vnx2hi, 2)                                                                \
+  T (vnx4hi, 4)                                                                \
+  T (vnx8hi, 8)                                                                \
+  T (vnx16hi, 16)                                                              \
+  T (vnx32hi, 32)                                                              \
+  T (vnx64hi, 64)                                                              \
+  T (vnx2si, 2)                                                                \
+  T (vnx4si, 4)                                                                \
+  T (vnx8si, 8)                                                                \
+  T (vnx16si, 16)                                                              \
+  T (vnx32si, 32)                                                              \
+  T (vnx2di, 2)                                                                \
+  T (vnx4di, 4)                                                                \
+  T (vnx8di, 8)                                                                \
+  T (vnx16di, 16)                                                              \
+  T (vnx2sf, 2)                                                                \
+  T (vnx4sf, 4)                                                                \
+  T (vnx8sf, 8)                                                                \
+  T (vnx16sf, 16)                                                              \
+  T (vnx32sf, 32)                                                              \
+  T (vnx2df, 2)                                                                \
+  T (vnx4df, 4)                                                                \
+  T (vnx8df, 8)                                                                \
+  T (vnx16df, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vi\tv[0-9]+,\s*v[0-9]+,\s*1} 31 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-7.c
new file mode 100644
index 00000000000..0ac98287254
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm-7.c
@@ -0,0 +1,49 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv64gcv -mabi=lp64d" } */
+
+#include "perm.h"
+
+/* permute_TYPE stores __builtin_shuffle (values1, values2, mask) to *OUT.  */
+#define PERMUTE(TYPE, TYPE2, NUNITS)                                           \
+  __attribute__ ((noipa)) void permute_##TYPE (TYPE values1, TYPE values2,     \
+					       TYPE2 mask, TYPE *out)          \
+  {                                                                            \
+    *out = __builtin_shuffle (values1, values2, mask);                         \
+  }
+
+#define TEST_ALL(T)                                                            \
+  T (vnx2qi, vnx2qi, 2)                                                        \
+  T (vnx4qi, vnx4qi, 4)                                                        \
+  T (vnx8qi, vnx8qi, 8)                                                        \
+  T (vnx16qi, vnx16qi, 16)                                                     \
+  T (vnx32qi, vnx32qi, 32)                                                     \
+  T (vnx64qi, vnx64qi, 64)                                                     \
+  T (vnx128qi, vnx128qi, 128)                                                  \
+  T (vnx2hi, vnx2hi, 2)                                                        \
+  T (vnx4hi, vnx4hi, 4)                                                        \
+  T (vnx8hi, vnx8hi, 8)                                                        \
+  T (vnx16hi, vnx16hi, 16)                                                     \
+  T (vnx32hi, vnx32hi, 32)                                                     \
+  T (vnx64hi, vnx64hi, 64)                                                     \
+  T (vnx2si, vnx2si, 2)                                                        \
+  T (vnx4si, vnx4si, 4)                                                        \
+  T (vnx8si, vnx8si, 8)                                                        \
+  T (vnx16si, vnx16si, 16)                                                     \
+  T (vnx32si, vnx32si, 32)                                                     \
+  T (vnx2di, vnx2di, 2)                                                        \
+  T (vnx4di, vnx4di, 4)                                                        \
+  T (vnx8di, vnx8di, 8)                                                        \
+  T (vnx16di, vnx16di, 16)                                                     \
+  T (vnx2sf, vnx2si, 2)                                                        \
+  T (vnx4sf, vnx4si, 4)                                                        \
+  T (vnx8sf, vnx8si, 8)                                                        \
+  T (vnx16sf, vnx16si, 16)                                                     \
+  T (vnx32sf, vnx32si, 32)                                                     \
+  T (vnx2df, vnx2di, 2)                                                        \
+  T (vnx4df, vnx4di, 4)                                                        \
+  T (vnx8df, vnx8di, 8)                                                        \
+  T (vnx16df, vnx16di, 16)
+
+TEST_ALL (PERMUTE)
+
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+,\s*v0\.t} 31 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h
new file mode 100644
index 00000000000..18cb4af059b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm.h
@@ -0,0 +1,70 @@ 
+#include <stdint.h>
+
+typedef int8_t vnx2qi __attribute__ ((vector_size (2)));
+typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
+typedef int8_t vnx8qi __attribute__ ((vector_size (8)));
+typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
+typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
+typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
+typedef int8_t vnx128qi __attribute__ ((vector_size (128)));
+
+typedef int16_t vnx2hi __attribute__ ((vector_size (4)));
+typedef int16_t vnx4hi __attribute__ ((vector_size (8)));
+typedef int16_t vnx8hi __attribute__ ((vector_size (16)));
+typedef int16_t vnx16hi __attribute__ ((vector_size (32)));
+typedef int16_t vnx32hi __attribute__ ((vector_size (64)));
+typedef int16_t vnx64hi __attribute__ ((vector_size (128)));
+
+typedef int32_t vnx2si __attribute__ ((vector_size (8)));
+typedef int32_t vnx4si __attribute__ ((vector_size (16)));
+typedef int32_t vnx8si __attribute__ ((vector_size (32)));
+typedef int32_t vnx16si __attribute__ ((vector_size (64)));
+typedef int32_t vnx32si __attribute__ ((vector_size (128)));
+
+typedef int64_t vnx2di __attribute__ ((vector_size (16)));
+typedef int64_t vnx4di __attribute__ ((vector_size (32)));
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+typedef float vnx2sf __attribute__ ((vector_size (8)));
+typedef float vnx4sf __attribute__ ((vector_size (16)));
+typedef float vnx8sf __attribute__ ((vector_size (32)));
+typedef float vnx16sf __attribute__ ((vector_size (64)));
+typedef float vnx32sf __attribute__ ((vector_size (128)));
+
+typedef double vnx2df __attribute__ ((vector_size (16)));
+typedef double vnx4df __attribute__ ((vector_size (32)));
+typedef double vnx8df __attribute__ ((vector_size (64)));
+typedef double vnx16df __attribute__ ((vector_size (128)));
+
+/* Declare v_TYPE_in1/in2/out and fill in1/in2 with i * NUM1 +/- NUM2.  */
+#define INIT_PERMUTE(NUNITS, NUM1, NUM2, TYPE)                                 \
+  TYPE v_##TYPE##_in1, v_##TYPE##_in2;                                         \
+  TYPE v_##TYPE##_out = {0};                                                   \
+  for (int i = 0; i < (NUNITS); i++)                                           \
+    {                                                                          \
+      v_##TYPE##_in1[i] = i * (NUM1) + (NUM2);                                 \
+      v_##TYPE##_in2[i] = i * (NUM1) - (NUM2);                                 \
+    }
+
+/* Abort unless every element of v_TYPE_out equals VALUE.  */
+#define CHECK_PERMUTE_SINGLE(NUNITS, VALUE, TYPE)                              \
+  for (int i = 0; i < (NUNITS); i++)                                           \
+    if (v_##TYPE##_out[i] != (VALUE)) __builtin_abort ();
+
+#define CHECK_PERMUTE_REVERSE(NUNITS, TYPE)                                    \
+  for (int i = 0; i < NUNITS; i++)                                             \
+    if (v_##TYPE##_out[i] != v_##TYPE##_in1[NUNITS - 1 - i])                   \
+      __builtin_abort ();
+
+/* Abort unless v_TYPE_out[i] is element 2 * i of v_TYPE_in1 followed by
+   v_TYPE_in2.  */
+#define CHECK_PERMUTE_DOUBLE(NUNITS, TYPE)                                     \
+  for (int i = 0; i < (NUNITS); i++)                                           \
+    {                                                                          \
+      int new_index = i * 2;                                                   \
+      if (new_index < (NUNITS)                                                 \
+	  ? v_##TYPE##_out[i] != v_##TYPE##_in1[new_index]                     \
+	  : v_##TYPE##_out[i] != v_##TYPE##_in2[new_index % (NUNITS)])         \
+	__builtin_abort ();                                                    \
+    }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-1.c
new file mode 100644
index 00000000000..cb216a9543c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-1.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-1.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+  INIT_PERMUTE(2, 3, 79, vnx2qi)
+  permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+  CHECK_PERMUTE_SINGLE(2, 3*1+79, vnx2qi)
+  INIT_PERMUTE(4, 2, -69, vnx4qi)
+  permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+  CHECK_PERMUTE_SINGLE(4, 2*1+-69, vnx4qi)
+  INIT_PERMUTE(8, 4, -33, vnx8qi)
+  permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+  CHECK_PERMUTE_SINGLE(8, 4*1+-33, vnx8qi)
+  INIT_PERMUTE(16, -3, 15, vnx16qi)
+  permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+  CHECK_PERMUTE_SINGLE(16, -3*1+15, vnx16qi)
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+  CHECK_PERMUTE_SINGLE(32, -1*1+30, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+  CHECK_PERMUTE_SINGLE(64, -1*1+66, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+  CHECK_PERMUTE_SINGLE(128, -1*1+38, vnx128qi)
+  INIT_PERMUTE(2, 2, 30238, vnx2hi)
+  permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+  CHECK_PERMUTE_SINGLE(2, 2*1+30238, vnx2hi)
+  INIT_PERMUTE(4, -45, -2345, vnx4hi)
+  permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+  CHECK_PERMUTE_SINGLE(4, -45*1+-2345, vnx4hi)
+  INIT_PERMUTE(8, 98, -18415, vnx8hi)
+  permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+  CHECK_PERMUTE_SINGLE(8, 98*1+-18415, vnx8hi)
+  INIT_PERMUTE(16, 56, 3299, vnx16hi)
+  permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+  CHECK_PERMUTE_SINGLE(16, 56*1+3299, vnx16hi)
+  INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+  CHECK_PERMUTE_SINGLE(32, 15641*1+-9156, vnx32hi)
+  INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+  CHECK_PERMUTE_SINGLE(64, -25641*1+8093, vnx64hi)
+  INIT_PERMUTE(2, -428, -15651, vnx2si)
+  permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+  CHECK_PERMUTE_SINGLE(2, -428*1+-15651, vnx2si)
+  INIT_PERMUTE(4, 208, -55651, vnx4si)
+  permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+  CHECK_PERMUTE_SINGLE(4, 208*1+-55651, vnx4si)
+  INIT_PERMUTE(8, 808, 75651, vnx8si)
+  permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+  CHECK_PERMUTE_SINGLE(8, 808*1+75651, vnx8si)
+  INIT_PERMUTE(16, 816, -8941561, vnx16si)
+  permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+  CHECK_PERMUTE_SINGLE(16, 816*1+-8941561, vnx16si)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+  CHECK_PERMUTE_SINGLE(32, -532*1+98416, vnx32si)
+  INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+  permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+  CHECK_PERMUTE_SINGLE(2, -4161*1+9551616, vnx2di)
+  INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+  permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+  CHECK_PERMUTE_SINGLE(4, 7259*1+-15644961, vnx4di)
+  INIT_PERMUTE(8, 351, 9156651, vnx8di)
+  permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+  CHECK_PERMUTE_SINGLE(8, 351*1+9156651, vnx8di)
+  INIT_PERMUTE(16, 11, -816196231,vnx16di)
+  permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+  CHECK_PERMUTE_SINGLE(16, 11*1+-816196231, vnx16di)
+  INIT_PERMUTE(2, 4552, -89, vnx2sf)
+  permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+  CHECK_PERMUTE_SINGLE(2, (4552+-89), vnx2sf)
+  INIT_PERMUTE(4, 685, 7961, vnx4sf)
+  permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+  CHECK_PERMUTE_SINGLE(4, 685+7961, vnx4sf)
+  INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+  permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+  CHECK_PERMUTE_SINGLE(8, 3927*1+16513, vnx8sf)
+  INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+  permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+  CHECK_PERMUTE_SINGLE(16, -68*1+16156571, vnx16sf)
+  INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+  CHECK_PERMUTE_SINGLE(32, 9985*1+1561318, vnx32sf)
+  INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+  permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+  CHECK_PERMUTE_SINGLE(2, -1565.1561*1+-5641565.515, vnx2df)
+  INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+  permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+  CHECK_PERMUTE_SINGLE(4, -189.14897196*1+-15616547.5165574, vnx4df)
+  INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+  permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+  CHECK_PERMUTE_SINGLE(8, 651.158691561*1+-56163.1655411, vnx8df)
+  INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+  permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+  CHECK_PERMUTE_SINGLE(16, 58.91516377*1+251465.81561, vnx16df)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-2.c
new file mode 100644
index 00000000000..1b51b315ad1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-2.c
@@ -0,0 +1,32 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-2.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+  CHECK_PERMUTE_SINGLE(32, -1*31+30, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+  CHECK_PERMUTE_SINGLE(64, -1*31+66, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+  CHECK_PERMUTE_SINGLE(128, -1*31+38, vnx128qi)
+  INIT_PERMUTE(32, 156, -9156, vnx32hi)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+  CHECK_PERMUTE_SINGLE(32, 156*31+-9156, vnx32hi)
+  INIT_PERMUTE(64, -251, 8093, vnx64hi)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+  CHECK_PERMUTE_SINGLE(64, -251*31+8093, vnx64hi)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+  CHECK_PERMUTE_SINGLE(32, -532*31+98416, vnx32si)
+  INIT_PERMUTE(32, 995, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+  CHECK_PERMUTE_SINGLE(32, 995*31+1561318, vnx32sf)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-3.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-3.c
new file mode 100644
index 00000000000..4cae7f4f1a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-3.c
@@ -0,0 +1,20 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-3.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+  CHECK_PERMUTE_SINGLE(64, -1*55+66, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+  CHECK_PERMUTE_SINGLE(128, -1*55+38, vnx128qi)
+  INIT_PERMUTE(64, -251, 8093, vnx64hi)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+  CHECK_PERMUTE_SINGLE(64, -251*55+8093, vnx64hi)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-4.c
new file mode 100644
index 00000000000..e60b19fab68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-4.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-4.c"
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+  INIT_PERMUTE(2, 3, 79, vnx2qi)
+  permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2qi)
+  INIT_PERMUTE(4, 2, -69, vnx4qi)
+  permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4qi)
+  INIT_PERMUTE(8, 4, -33, vnx8qi)
+  permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8qi)
+  INIT_PERMUTE(16, -3, 15, vnx16qi)
+  permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16qi)
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+  CHECK_PERMUTE_REVERSE(64, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+  CHECK_PERMUTE_REVERSE(128, vnx128qi)
+  INIT_PERMUTE(2, 2, 30238, vnx2hi)
+  permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2hi)
+  INIT_PERMUTE(4, -45, -2345, vnx4hi)
+  permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4hi)
+  INIT_PERMUTE(8, 98, -18415, vnx8hi)
+  permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8hi)
+  INIT_PERMUTE(16, 56, 3299, vnx16hi)
+  permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16hi)
+  INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32hi)
+  INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+  CHECK_PERMUTE_REVERSE(64, vnx64hi)
+  INIT_PERMUTE(2, -428, -15651, vnx2si)
+  permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2si)
+  INIT_PERMUTE(4, 208, -55651, vnx4si)
+  permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4si)
+  INIT_PERMUTE(8, 808, 75651, vnx8si)
+  permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8si)
+  INIT_PERMUTE(16, 816, -8941561, vnx16si)
+  permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16si)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32si)
+  INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+  permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2di)
+  INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+  permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4di)
+  INIT_PERMUTE(8, 351, 9156651, vnx8di)
+  permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8di)
+  INIT_PERMUTE(16, 11, -816196231,vnx16di)
+  permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16di)
+  INIT_PERMUTE(2, 4552, -89, vnx2sf)
+  permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2sf)
+  INIT_PERMUTE(4, 685, 7961, vnx4sf)
+  permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4sf)
+  INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+  permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8sf)
+  INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+  permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16sf)
+  INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32sf)
+  INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+  permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2df)
+  INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+  permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4df)
+  INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+  permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8df)
+  INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+  permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16df)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-5.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-5.c
new file mode 100644
index 00000000000..b61990915b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-5.c
@@ -0,0 +1,137 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-5.c"
+
+#define MASK_2(X, Y) (Y) - 1 - (X) + (Y), (Y) -2 - (X) + (Y)
+#define MASK_4(X, Y) MASK_2 (X, Y), MASK_2 (X + 2, Y)
+#define MASK_8(X, Y) MASK_4 (X, Y), MASK_4 (X + 4, Y)
+#define MASK_16(X, Y) MASK_8 (X, Y), MASK_8 (X + 8, Y)
+#define MASK_32(X, Y) MASK_16 (X, Y), MASK_16 (X + 16, Y)
+#define MASK_64(X, Y) MASK_32 (X, Y), MASK_32 (X + 32, Y)
+#define MASK_128(X, Y) MASK_64 (X, Y), MASK_64 (X + 64, Y)
+
+/* Declare TYPE_mask, initialised from the MASK_<NUNITS> index list.  */
+#define INIT_MASK(TYPE, NUNITS) TYPE TYPE##_mask = {MASK_##NUNITS (0, NUNITS)};
+
+int __attribute__ ((optimize (0)))
+main ()
+{
+  INIT_PERMUTE(2, 3, 79, vnx2qi)
+  INIT_MASK (vnx2qi, 2)
+  permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, vnx2qi_mask, &v_vnx2qi_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2qi)
+  INIT_PERMUTE(4, 2, -69, vnx4qi)
+  INIT_MASK (vnx4qi, 4)
+  permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, vnx4qi_mask, &v_vnx4qi_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4qi)
+  INIT_PERMUTE(8, 4, -33, vnx8qi)
+  INIT_MASK (vnx8qi, 8)
+  permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, vnx8qi_mask, &v_vnx8qi_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8qi)
+  INIT_PERMUTE(16, -3, 15, vnx16qi)
+  INIT_MASK (vnx16qi, 16)
+  permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, vnx16qi_mask, &v_vnx16qi_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16qi)
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  INIT_MASK (vnx32qi, 32)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, vnx32qi_mask, &v_vnx32qi_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  INIT_MASK (vnx64qi, 64)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, vnx64qi_mask, &v_vnx64qi_out);
+  CHECK_PERMUTE_REVERSE(64, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  INIT_MASK (vnx128qi, 128)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, vnx128qi_mask, &v_vnx128qi_out);
+  CHECK_PERMUTE_REVERSE(128, vnx128qi)
+  INIT_PERMUTE(2, 2, 30238, vnx2hi)
+  INIT_MASK (vnx2hi, 2)
+  permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, vnx2hi_mask, &v_vnx2hi_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2hi)
+  INIT_PERMUTE(4, -45, -2345, vnx4hi)
+  INIT_MASK (vnx4hi, 4)
+  permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, vnx4hi_mask, &v_vnx4hi_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4hi)
+  INIT_PERMUTE(8, 98, -18415, vnx8hi)
+  INIT_MASK (vnx8hi, 8)
+  permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, vnx8hi_mask, &v_vnx8hi_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8hi)
+  INIT_PERMUTE(16, 56, 3299, vnx16hi)
+  INIT_MASK (vnx16hi, 16)
+  permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, vnx16hi_mask, &v_vnx16hi_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16hi)
+  INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+  INIT_MASK (vnx32hi, 32)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, vnx32hi_mask, &v_vnx32hi_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32hi)
+  INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+  INIT_MASK (vnx64hi, 64)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, vnx64hi_mask, &v_vnx64hi_out);
+  CHECK_PERMUTE_REVERSE(64, vnx64hi)
+  INIT_PERMUTE(2, -428, -15651, vnx2si)
+  INIT_MASK (vnx2si, 2)
+  permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, vnx2si_mask, &v_vnx2si_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2si)
+  INIT_PERMUTE(4, 208, -55651, vnx4si)
+  INIT_MASK (vnx4si, 4)
+  permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, vnx4si_mask, &v_vnx4si_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4si)
+  INIT_PERMUTE(8, 808, 75651, vnx8si)
+  INIT_MASK (vnx8si, 8)
+  permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, vnx8si_mask, &v_vnx8si_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8si)
+  INIT_PERMUTE(16, 816, -8941561, vnx16si)
+  INIT_MASK (vnx16si, 16)
+  permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, vnx16si_mask, &v_vnx16si_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16si)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  INIT_MASK (vnx32si, 32)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, vnx32si_mask, &v_vnx32si_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32si)
+  INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+  INIT_MASK (vnx2di, 2)
+  permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, vnx2di_mask, &v_vnx2di_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2di)
+  INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+  INIT_MASK (vnx4di, 4)
+  permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, vnx4di_mask, &v_vnx4di_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4di)
+  INIT_PERMUTE(8, 351, 9156651, vnx8di)
+  INIT_MASK (vnx8di, 8)
+  permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, vnx8di_mask, &v_vnx8di_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8di)
+  INIT_PERMUTE(16, 11, -816196231,vnx16di)
+  INIT_MASK (vnx16di, 16)
+  permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, vnx16di_mask, &v_vnx16di_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16di)
+  INIT_PERMUTE(2, 4552, -89, vnx2sf)
+  permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, vnx2si_mask, &v_vnx2sf_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2sf)
+  INIT_PERMUTE(4, 685, 7961, vnx4sf)
+  permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, vnx4si_mask, &v_vnx4sf_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4sf)
+  INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+  permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, vnx8si_mask, &v_vnx8sf_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8sf)
+  INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+  permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, vnx16si_mask, &v_vnx16sf_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16sf)
+  INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, vnx32si_mask, &v_vnx32sf_out);
+  CHECK_PERMUTE_REVERSE(32, vnx32sf)
+  INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+  permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, vnx2di_mask, &v_vnx2df_out);
+  CHECK_PERMUTE_REVERSE(2, vnx2df)
+  INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+  permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, vnx4di_mask, &v_vnx4df_out);
+  CHECK_PERMUTE_REVERSE(4, vnx4df)
+  INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+  permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, vnx8di_mask, &v_vnx8df_out);
+  CHECK_PERMUTE_REVERSE(8, vnx8df)
+  INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+  permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, vnx16di_mask, &v_vnx16df_out);
+  CHECK_PERMUTE_REVERSE(16, vnx16df)
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-6.c
new file mode 100644
index 00000000000..b23df90f0ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-6.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "perm-6.c"
+
+int __attribute__ ((optimize (0))) /* main carries optimize (0); dg-options above requests -O3.  */
+main () /* Run-time check of the permute_<mode> functions pulled in from perm-6.c.  */
+{ /* Per mode: INIT_PERMUTE (nunits, NUM1, NUM2, mode), call permute_<mode>, then CHECK_PERMUTE_SINGLE.  */
+  INIT_PERMUTE(2, 3, 79, vnx2qi) /* NOTE(review): presumably declares v_vnx2qi_in1/in2/out used below; macro is defined outside this file -- confirm in perm.h.  */
+  permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, &v_vnx2qi_out);
+  CHECK_PERMUTE_SINGLE(2, 3*1-79, vnx2qi) /* Expected value is written as NUM1*1-NUM2 of the matching INIT_PERMUTE, for every mode.  */
+  INIT_PERMUTE(4, 2, -69, vnx4qi)
+  permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, &v_vnx4qi_out);
+  CHECK_PERMUTE_SINGLE(4, 2*1-(-69), vnx4qi)
+  INIT_PERMUTE(8, 4, -33, vnx8qi)
+  permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, &v_vnx8qi_out);
+  CHECK_PERMUTE_SINGLE(8, 4*1-(-33), vnx8qi)
+  INIT_PERMUTE(16, -3, 15, vnx16qi)
+  permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, &v_vnx16qi_out);
+  CHECK_PERMUTE_SINGLE(16, -3*1-15, vnx16qi)
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, &v_vnx32qi_out);
+  CHECK_PERMUTE_SINGLE(32, -1*1-30, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, &v_vnx64qi_out);
+  CHECK_PERMUTE_SINGLE(64, -1*1-66, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, &v_vnx128qi_out);
+  CHECK_PERMUTE_SINGLE(128, -1*1-38, vnx128qi)
+  INIT_PERMUTE(2, 2, 30238, vnx2hi)
+  permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, &v_vnx2hi_out);
+  CHECK_PERMUTE_SINGLE(2, 2*1-30238, vnx2hi)
+  INIT_PERMUTE(4, -45, -2345, vnx4hi)
+  permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, &v_vnx4hi_out);
+  CHECK_PERMUTE_SINGLE(4, -45*1-(-2345), vnx4hi)
+  INIT_PERMUTE(8, 98, -18415, vnx8hi)
+  permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, &v_vnx8hi_out);
+  CHECK_PERMUTE_SINGLE(8, 98*1-(-18415), vnx8hi)
+  INIT_PERMUTE(16, 56, 3299, vnx16hi)
+  permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, &v_vnx16hi_out);
+  CHECK_PERMUTE_SINGLE(16, 56*1-3299, vnx16hi)
+  INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, &v_vnx32hi_out);
+  CHECK_PERMUTE_SINGLE(32, 15641*1-(-9156), vnx32hi)
+  INIT_PERMUTE(64, -2564, 8093, vnx64hi)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, &v_vnx64hi_out);
+  CHECK_PERMUTE_SINGLE(64, -2564*1-8093, vnx64hi)
+  INIT_PERMUTE(2, -428, -15651, vnx2si)
+  permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, &v_vnx2si_out);
+  CHECK_PERMUTE_SINGLE(2, -428*1-(-15651), vnx2si)
+  INIT_PERMUTE(4, 208, -55651, vnx4si)
+  permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, &v_vnx4si_out);
+  CHECK_PERMUTE_SINGLE(4, 208*1-(-55651), vnx4si)
+  INIT_PERMUTE(8, 808, 75651, vnx8si)
+  permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, &v_vnx8si_out);
+  CHECK_PERMUTE_SINGLE(8, 808*1-75651, vnx8si)
+  INIT_PERMUTE(16, 816, -8941561, vnx16si)
+  permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, &v_vnx16si_out);
+  CHECK_PERMUTE_SINGLE(16, 816*1-(-8941561), vnx16si)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, &v_vnx32si_out);
+  CHECK_PERMUTE_SINGLE(32, -532*1-98416, vnx32si)
+  INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+  permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, &v_vnx2di_out);
+  CHECK_PERMUTE_SINGLE(2, -4161*1-9551616, vnx2di)
+  INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+  permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, &v_vnx4di_out);
+  CHECK_PERMUTE_SINGLE(4, 7259*1-(-15644961), vnx4di)
+  INIT_PERMUTE(8, 351, 9156651, vnx8di)
+  permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, &v_vnx8di_out);
+  CHECK_PERMUTE_SINGLE(8, 351*1-9156651, vnx8di)
+  INIT_PERMUTE(16, 11, -816196231,vnx16di)
+  permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, &v_vnx16di_out);
+  CHECK_PERMUTE_SINGLE(16, 11*1-(-816196231), vnx16di)
+  INIT_PERMUTE(2, 4552, -89, vnx2sf)
+  permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, &v_vnx2sf_out);
+  CHECK_PERMUTE_SINGLE(2, (4552-(-89)), vnx2sf) /* Same NUM1-NUM2 value as elsewhere, spelled without the "*1".  */
+  INIT_PERMUTE(4, 685, 7961, vnx4sf)
+  permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, &v_vnx4sf_out);
+  CHECK_PERMUTE_SINGLE(4, 685-7961, vnx4sf) /* Likewise written without the "*1".  */
+  INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+  permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, &v_vnx8sf_out);
+  CHECK_PERMUTE_SINGLE(8, 3927*1-16513, vnx8sf)
+  INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+  permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, &v_vnx16sf_out);
+  CHECK_PERMUTE_SINGLE(16, -68*1-16156571, vnx16sf)
+  INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, &v_vnx32sf_out);
+  CHECK_PERMUTE_SINGLE(32, 9985*1-1561318, vnx32sf)
+  INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+  permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, &v_vnx2df_out);
+  CHECK_PERMUTE_SINGLE(2, -1565.1561*1-(-5641565.515), vnx2df)
+  INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+  permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, &v_vnx4df_out);
+  CHECK_PERMUTE_SINGLE(4, -189.14897196*1-(-15616547.5165574), vnx4df)
+  INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+  permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, &v_vnx8df_out);
+  CHECK_PERMUTE_SINGLE(8, 651.158691561*1-(-56163.1655411), vnx8df)
+  INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+  permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, &v_vnx16df_out);
+  CHECK_PERMUTE_SINGLE(16, 58.91516377*1-251465.81561, vnx16df)
+
+  return 0; /* NOTE(review): reached only if no check above stopped the program; presumably CHECK_PERMUTE_SINGLE aborts on mismatch -- confirm in perm.h.  */
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-7.c
new file mode 100644
index 00000000000..d935d36bf69
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/perm_run-7.c
@@ -0,0 +1,135 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O0" } */
+
+#include "perm-7.c"
+
+#define MASK_2(BASE) (BASE), (BASE) + 2 /* MASK_<N> (BASE) lists the N even-spaced indices BASE, BASE+2, ..., BASE+2*N-2.  */
+#define MASK_4(BASE) MASK_2 (BASE), MASK_2 ((BASE) + 4) /* BASE is parenthesized throughout so an expression argument keeps its value.  */
+#define MASK_8(BASE) MASK_4 (BASE), MASK_4 ((BASE) + 8)
+#define MASK_16(BASE) MASK_8 (BASE), MASK_8 ((BASE) + 16)
+#define MASK_32(BASE) MASK_16 (BASE), MASK_16 ((BASE) + 32)
+#define MASK_64(BASE) MASK_32 (BASE), MASK_32 ((BASE) + 64)
+#define MASK_128(BASE) MASK_64 (BASE), MASK_64 ((BASE) + 128)
+/* Declare TYPE_mask initialized with the NUNITS indices 0, 2, ..., 2*NUNITS-2.  */
+#define INIT_MASK(TYPE, NUNITS) TYPE TYPE##_mask = {MASK_##NUNITS (0)};
+
+int __attribute__ ((optimize (0))) main () /* Run-time check of the permute_<mode> functions pulled in from perm-7.c; main carries optimize (0).  */
+{ /* Per integer mode: INIT_PERMUTE, INIT_MASK (declares <mode>_mask), permute_<mode> with that mask, then CHECK_PERMUTE_DOUBLE.  */
+  INIT_PERMUTE(2, 3, 79, vnx2qi) /* NOTE(review): presumably declares v_vnx2qi_in1/in2/out used below; macro is defined outside this file -- confirm in perm.h.  */
+  INIT_MASK (vnx2qi, 2) /* Declares vnx2qi_mask = {MASK_2 (0)}, i.e. indices 0 and 2.  */
+  permute_vnx2qi (v_vnx2qi_in1, v_vnx2qi_in2, vnx2qi_mask, &v_vnx2qi_out);
+  CHECK_PERMUTE_DOUBLE(2, vnx2qi)
+  INIT_PERMUTE(4, 2, -69, vnx4qi)
+  INIT_MASK (vnx4qi, 4)
+  permute_vnx4qi (v_vnx4qi_in1, v_vnx4qi_in2, vnx4qi_mask, &v_vnx4qi_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4qi)
+  INIT_PERMUTE(8, 4, -33, vnx8qi)
+  INIT_MASK (vnx8qi, 8)
+  permute_vnx8qi (v_vnx8qi_in1, v_vnx8qi_in2, vnx8qi_mask, &v_vnx8qi_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8qi)
+  INIT_PERMUTE(16, -3, 15, vnx16qi)
+  INIT_MASK (vnx16qi, 16)
+  permute_vnx16qi (v_vnx16qi_in1, v_vnx16qi_in2, vnx16qi_mask, &v_vnx16qi_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16qi)
+  INIT_PERMUTE(32, -1, 30, vnx32qi)
+  INIT_MASK (vnx32qi, 32)
+  permute_vnx32qi (v_vnx32qi_in1, v_vnx32qi_in2, vnx32qi_mask, &v_vnx32qi_out);
+  CHECK_PERMUTE_DOUBLE(32, vnx32qi)
+  INIT_PERMUTE(64, -1, 66, vnx64qi)
+  INIT_MASK (vnx64qi, 64)
+  permute_vnx64qi (v_vnx64qi_in1, v_vnx64qi_in2, vnx64qi_mask, &v_vnx64qi_out);
+  CHECK_PERMUTE_DOUBLE(64, vnx64qi)
+  INIT_PERMUTE(128, -1, 38, vnx128qi)
+  INIT_MASK (vnx128qi, 128)
+  permute_vnx128qi (v_vnx128qi_in1, v_vnx128qi_in2, vnx128qi_mask, &v_vnx128qi_out);
+  CHECK_PERMUTE_DOUBLE(128, vnx128qi)
+  INIT_PERMUTE(2, 2, 30238, vnx2hi)
+  INIT_MASK (vnx2hi, 2)
+  permute_vnx2hi (v_vnx2hi_in1, v_vnx2hi_in2, vnx2hi_mask, &v_vnx2hi_out);
+  CHECK_PERMUTE_DOUBLE(2, vnx2hi)
+  INIT_PERMUTE(4, -45, -2345, vnx4hi)
+  INIT_MASK (vnx4hi, 4)
+  permute_vnx4hi (v_vnx4hi_in1, v_vnx4hi_in2, vnx4hi_mask, &v_vnx4hi_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4hi)
+  INIT_PERMUTE(8, 98, -18415, vnx8hi)
+  INIT_MASK (vnx8hi, 8)
+  permute_vnx8hi (v_vnx8hi_in1, v_vnx8hi_in2, vnx8hi_mask, &v_vnx8hi_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8hi)
+  INIT_PERMUTE(16, 56, 3299, vnx16hi)
+  INIT_MASK (vnx16hi, 16)
+  permute_vnx16hi (v_vnx16hi_in1, v_vnx16hi_in2, vnx16hi_mask, &v_vnx16hi_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16hi)
+  INIT_PERMUTE(32, 15641, -9156, vnx32hi)
+  INIT_MASK (vnx32hi, 32)
+  permute_vnx32hi (v_vnx32hi_in1, v_vnx32hi_in2, vnx32hi_mask, &v_vnx32hi_out);
+  CHECK_PERMUTE_DOUBLE(32, vnx32hi)
+  INIT_PERMUTE(64, -25641, 8093, vnx64hi)
+  INIT_MASK (vnx64hi, 64)
+  permute_vnx64hi (v_vnx64hi_in1, v_vnx64hi_in2, vnx64hi_mask, &v_vnx64hi_out);
+  CHECK_PERMUTE_DOUBLE(64, vnx64hi)
+  INIT_PERMUTE(2, -428, -15651, vnx2si)
+  INIT_MASK (vnx2si, 2)
+  permute_vnx2si (v_vnx2si_in1, v_vnx2si_in2, vnx2si_mask, &v_vnx2si_out);
+  CHECK_PERMUTE_DOUBLE(2, vnx2si)
+  INIT_PERMUTE(4, 208, -55651, vnx4si)
+  INIT_MASK (vnx4si, 4)
+  permute_vnx4si (v_vnx4si_in1, v_vnx4si_in2, vnx4si_mask, &v_vnx4si_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4si)
+  INIT_PERMUTE(8, 808, 75651, vnx8si)
+  INIT_MASK (vnx8si, 8)
+  permute_vnx8si (v_vnx8si_in1, v_vnx8si_in2, vnx8si_mask, &v_vnx8si_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8si)
+  INIT_PERMUTE(16, 816, -8941561, vnx16si)
+  INIT_MASK (vnx16si, 16)
+  permute_vnx16si (v_vnx16si_in1, v_vnx16si_in2, vnx16si_mask, &v_vnx16si_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16si)
+  INIT_PERMUTE(32, -532, 98416, vnx32si)
+  INIT_MASK (vnx32si, 32)
+  permute_vnx32si (v_vnx32si_in1, v_vnx32si_in2, vnx32si_mask, &v_vnx32si_out);
+  CHECK_PERMUTE_DOUBLE(32, vnx32si)
+  INIT_PERMUTE(2, -4161, 9551616, vnx2di)
+  INIT_MASK (vnx2di, 2)
+  permute_vnx2di (v_vnx2di_in1, v_vnx2di_in2, vnx2di_mask, &v_vnx2di_out);
+  CHECK_PERMUTE_DOUBLE(2, vnx2di)
+  INIT_PERMUTE(4, 7259, -15644961, vnx4di)
+  INIT_MASK (vnx4di, 4)
+  permute_vnx4di (v_vnx4di_in1, v_vnx4di_in2, vnx4di_mask, &v_vnx4di_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4di)
+  INIT_PERMUTE(8, 351, 9156651, vnx8di)
+  INIT_MASK (vnx8di, 8)
+  permute_vnx8di (v_vnx8di_in1, v_vnx8di_in2, vnx8di_mask, &v_vnx8di_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8di)
+  INIT_PERMUTE(16, 11, -816196231,vnx16di)
+  INIT_MASK (vnx16di, 16)
+  permute_vnx16di (v_vnx16di_in1, v_vnx16di_in2, vnx16di_mask, &v_vnx16di_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16di)
+  INIT_PERMUTE(2, 4552, -89, vnx2sf)
+  permute_vnx2sf (v_vnx2sf_in1, v_vnx2sf_in2, vnx2si_mask, &v_vnx2sf_out); /* No INIT_MASK for the sf modes: they pass the si mask with the same element count declared above.  */
+  CHECK_PERMUTE_DOUBLE(2, vnx2sf)
+  INIT_PERMUTE(4, 685, 7961, vnx4sf)
+  permute_vnx4sf (v_vnx4sf_in1, v_vnx4sf_in2, vnx4si_mask, &v_vnx4sf_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4sf)
+  INIT_PERMUTE(8, 3927, 16513, vnx8sf)
+  permute_vnx8sf (v_vnx8sf_in1, v_vnx8sf_in2, vnx8si_mask, &v_vnx8sf_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8sf)
+  INIT_PERMUTE(16, -68, 16156571, vnx16sf)
+  permute_vnx16sf (v_vnx16sf_in1, v_vnx16sf_in2, vnx16si_mask, &v_vnx16sf_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16sf)
+  INIT_PERMUTE(32, 9985, 1561318, vnx32sf)
+  permute_vnx32sf (v_vnx32sf_in1, v_vnx32sf_in2, vnx32si_mask, &v_vnx32sf_out);
+  CHECK_PERMUTE_DOUBLE(32, vnx32sf)
+  INIT_PERMUTE(2, -1565.1561, -5641565.515, vnx2df)
+  permute_vnx2df (v_vnx2df_in1, v_vnx2df_in2, vnx2di_mask, &v_vnx2df_out); /* Likewise the df modes pass the di mask with the same element count.  */
+  CHECK_PERMUTE_DOUBLE(2, vnx2df)
+  INIT_PERMUTE(4, -189.14897196, -15616547.5165574, vnx4df)
+  permute_vnx4df (v_vnx4df_in1, v_vnx4df_in2, vnx4di_mask, &v_vnx4df_out);
+  CHECK_PERMUTE_DOUBLE(4, vnx4df)
+  INIT_PERMUTE(8, 651.158691561, -56163.1655411, vnx8df)
+  permute_vnx8df (v_vnx8df_in1, v_vnx8df_in2, vnx8di_mask, &v_vnx8df_out);
+  CHECK_PERMUTE_DOUBLE(8, vnx8df)
+  INIT_PERMUTE(16, 58.91516377, 251465.81561, vnx16df)
+  permute_vnx16df (v_vnx16df_in1, v_vnx16df_in2, vnx16di_mask, &v_vnx16df_out);
+  CHECK_PERMUTE_DOUBLE(16, vnx16df)
+
+  return 0; /* NOTE(review): reached only if no check above stopped the program; presumably CHECK_PERMUTE_DOUBLE aborts on mismatch -- confirm in perm.h.  */
+}