i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm

Message ID 20230418065223.3862113-1-lin1.hu@intel.com
State Accepted
Headers
Series i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Hu, Lin1 via Gcc-patches April 18, 2023, 6:52 a.m. UTC
  Hi, all

This patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
It has been regression-tested on x86_64-pc-linux-gnu. OK for trunk?

Thanks.
Lin

vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm take 3 clock cycles.
We can optimize them to vblend or vmovaps when there is no cross-lane movement.

gcc/ChangeLog:

	* config/i386/sse.md: Modify insn vperm{i,f}
	and vshuf{i,f}.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
	* gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
	* gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
	* gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
	* gcc.target/i386/opt-vperm-vshuf-1.c: New test.
	* gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
	* gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
---
 gcc/config/i386/sse.md                        | 36 ++++++++--
 .../gcc.target/i386/avx512vl-vshuff32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshuff64x2-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi64x2-1.c   |  2 +-
 .../gcc.target/i386/opt-vperm-vshuf-1.c       | 51 ++++++++++++++
 .../gcc.target/i386/opt-vperm-vshuf-2.c       | 68 +++++++++++++++++++
 .../gcc.target/i386/opt-vperm-vshuf-3.c       | 63 +++++++++++++++++
 8 files changed, 218 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
  

Comments

Hongtao Liu April 19, 2023, 1:44 a.m. UTC | #1
On Tue, Apr 18, 2023 at 2:52 PM Hu, Lin1 via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi, all
>
> The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
> And it has regtested on x86_64-pc-linux-gnu. OK for trunk?
Ok.
>
> Thanks.
> Lin
>
> vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk.
> We can optimize them to vblend, vmovaps when there's no cross-lane.
>
> gcc/ChangeLog:
>
>         * config/i386/sse.md: Modify insn vperm{i,f}
>         and vshuf{i,f}.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
>         * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
>         * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
>         * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
>         * gcc.target/i386/opt-vperm-vshuf-1.c: New test.
>         * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
>         * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
> ---
>  gcc/config/i386/sse.md                        | 36 ++++++++--
>  .../gcc.target/i386/avx512vl-vshuff32x4-1.c   |  2 +-
>  .../gcc.target/i386/avx512vl-vshuff64x2-1.c   |  2 +-
>  .../gcc.target/i386/avx512vl-vshufi32x4-1.c   |  2 +-
>  .../gcc.target/i386/avx512vl-vshufi64x2-1.c   |  2 +-
>  .../gcc.target/i386/opt-vperm-vshuf-1.c       | 51 ++++++++++++++
>  .../gcc.target/i386/opt-vperm-vshuf-2.c       | 68 +++++++++++++++++++
>  .../gcc.target/i386/opt-vperm-vshuf-3.c       | 63 +++++++++++++++++
>  8 files changed, 218 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 513960e8f33..5b6b2427460 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -18437,6 +18437,8 @@
>    mask = INTVAL (operands[3]) / 2;
>    mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
>    operands[3] = GEN_INT (mask);
> +  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
> +    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
>    return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
>  }
>    [(set_attr "type" "sselog")
> @@ -18595,6 +18597,9 @@
>    mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
>    operands[3] = GEN_INT (mask);
>
> +  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
> +    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +
>    return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
>  }
>    [(set_attr "type" "sselog")
> @@ -25663,7 +25668,28 @@
>            (match_operand:SI 3 "const_0_to_255_operand")]
>           UNSPEC_VPERMTI))]
>    "TARGET_AVX2"
> -  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> +  {
> +    int mask = INTVAL (operands[3]);
> +    if ((mask & 0xbb) == 16)
> +      {
> +       if (rtx_equal_p (operands[0], operands[1]))
> +         return "";
> +       else
> +         return "vmovaps\t{%1, %0|%0, %1}";
> +      }
> +    if ((mask & 0xbb) == 50)
> +      {
> +       if (rtx_equal_p (operands[0], operands[2]))
> +         return "";
> +       else
> +         return "vmovaps\t{%2, %0|%0, %2}";
> +      }
> +    if ((mask & 0xbb) == 18)
> +      return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> +    if ((mask & 0xbb) == 48)
> +      return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +    return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
> +  }
>    [(set_attr "type" "sselog")
>     (set_attr "prefix" "vex")
>     (set_attr "mode" "OI")])
> @@ -26226,9 +26252,11 @@
>     && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
>  {
>    int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
> -  if (mask == 0x12)
> -    return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
> -  if (mask == 0x20)
> +  if ((mask & 0xbb) == 0x12)
> +    return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> +  if ((mask & 0xbb) == 0x30)
> +    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +  if ((mask & 0xbb) == 0x20)
>      return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
>    operands[3] = GEN_INT (mask);
>    return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> index 6c2fb2f184a..02aecf4edce 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
>  void extern
>  avx512vl_test (void)
>  {
> -  x = _mm256_shuffle_f32x4 (x, x, 2);
> +  x = _mm256_shuffle_f32x4 (x, x, 3);
>    x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
>    x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> index 1191b400134..563ded5d9df 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
>  void extern
>  avx512vl_test (void)
>  {
> -  x = _mm256_shuffle_f64x2 (x, x, 2);
> +  x = _mm256_shuffle_f64x2 (x, x, 3);
>    x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
>    x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> index ef9a441e7a5..e89c4140d37 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
>  void extern
>  avx512vl_test (void)
>  {
> -  x = _mm256_shuffle_i32x4 (x, x, 2);
> +  x = _mm256_shuffle_i32x4 (x, x, 3);
>    x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2);
>    x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> index 0bd117e85d4..8e8e47eda38 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
>  void extern
>  avx512vl_test (void)
>  {
> -  x = _mm256_shuffle_i64x2 (x, x, 2);
> +  x = _mm256_shuffle_i64x2 (x, x, 3);
>    x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2);
>    x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
> new file mode 100644
> index 00000000000..1ee00b6b4a1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-times "vmovaps" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */
> +
> +#include<x86intrin.h>
> +
> +/* Vpermi128/Vpermf128 */
> +__m256i
> +perm0 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 50);
> +}
> +
> +__m256i
> +perm1 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 18);
> +}
> +
> +__m256i
> +perm2 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 48);
> +}
> +
> +/* vshuf{i,f}{32x4,64x2} ymm .*/
> +__m256i
> +shuff0 (__m256i a, __m256i b)
> +{
> +  return _mm256_shuffle_i32x4(a, b, 2);
> +}
> +
> +__m256
> +shuff1 (__m256 a, __m256 b)
> +{
> +  return _mm256_shuffle_f32x4(a, b, 2);
> +}
> +
> +__m256i
> +shuff2 (__m256i a, __m256i b)
> +{
> +  return _mm256_shuffle_i64x2(a, b, 2);
> +}
> +
> +__m256d
> +shuff3 (__m256d a, __m256d b)
> +{
> +  return _mm256_shuffle_f64x2(a, b, 2);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
> new file mode 100644
> index 00000000000..9775072b97a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vblendps" } } */
> +/* { dg-final { scan-assembler-not "vperm2i128" } } */
> +/* { dg-final { scan-assembler-not "vperm2f128" } } */
> +
> +#include<x86intrin.h>
> +
> +__m256i
> +perm0 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 16);
> +}
> +
> +__m256d
> +perm1 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 16);
> +}
> +
> +__m256
> +perm2 (__m256 a, __m256 b)
> +{
> +  return _mm256_permute2f128_ps (a, b, 16);
> +}
> +
> +__m256i
> +perm3 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2f128_si256 (a, b, 16);
> +}
> +
> +__m256i
> +perm4 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 20);
> +}
> +
> +__m256d
> +perm5 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 20);
> +}
> +
> +__m256i
> +perm6 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 80);
> +}
> +
> +__m256d
> +perm7 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 80);
> +}
> +
> +__m256i
> +perm8 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2x128_si256 (a, b, 84);
> +}
> +
> +__m256d
> +perm9 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 84);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
> new file mode 100644
> index 00000000000..a330b14caca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
> @@ -0,0 +1,63 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-times "vmov..." 3 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */
> +/* { dg-final { scan-assembler-not "vperm2f128" } } */
> +
> +#include<x86intrin.h>
> +
> +/* Vpermf128 */
> +__m256
> +perm0 (__m256 a, __m256 b)
> +{
> +  return _mm256_permute2f128_ps (a, b, 50);
> +}
> +
> +__m256
> +perm1 (__m256 a, __m256 b)
> +{
> +  return _mm256_permute2f128_ps (a, b, 18);
> +}
> +
> +__m256
> +perm2 (__m256 a, __m256 b)
> +{
> +  return _mm256_permute2f128_ps (a, b, 48);
> +}
> +
> +__m256i
> +perm3 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2f128_si256 (a, b, 50);
> +}
> +
> +__m256i
> +perm4 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2f128_si256 (a, b, 18);
> +}
> +
> +__m256i
> +perm5 (__m256i a, __m256i b)
> +{
> +  return _mm256_permute2f128_si256 (a, b, 48);
> +}
> +
> +__m256d
> +perm6 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 50);
> +}
> +
> +__m256d
> +perm7 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 18);
> +}
> +
> +__m256d
> +perm8 (__m256d a, __m256d b)
> +{
> +  return _mm256_permute2f128_pd (a, b, 48);
> +}
> --
> 2.31.1
>
  

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 513960e8f33..5b6b2427460 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18437,6 +18437,8 @@ 
   mask = INTVAL (operands[3]) / 2;
   mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
   operands[3] = GEN_INT (mask);
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
   return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -18595,6 +18597,9 @@ 
   mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
   operands[3] = GEN_INT (mask);
 
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
   return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -25663,7 +25668,28 @@ 
 	   (match_operand:SI 3 "const_0_to_255_operand")]
 	  UNSPEC_VPERMTI))]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  {
+    int mask = INTVAL (operands[3]);
+    if ((mask & 0xbb) == 16)
+      {
+	if (rtx_equal_p (operands[0], operands[1]))
+	  return "";
+	else
+	  return "vmovaps\t{%1, %0|%0, %1}";
+      }
+    if ((mask & 0xbb) == 50)
+      {
+	if (rtx_equal_p (operands[0], operands[2]))
+	  return "";
+	else
+	  return "vmovaps\t{%2, %0|%0, %2}";
+      }
+    if ((mask & 0xbb) == 18)
+      return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+    if ((mask & 0xbb) == 48)
+      return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+    return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+  }
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
@@ -26226,9 +26252,11 @@ 
    && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
-  if (mask == 0x12)
-    return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
-  if (mask == 0x20)
+  if ((mask & 0xbb) == 0x12)
+    return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+  if ((mask & 0xbb) == 0x30)
+    return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if ((mask & 0xbb) == 0x20)
     return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
   return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
index 6c2fb2f184a..02aecf4edce 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
@@ -12,7 +12,7 @@  volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f32x4 (x, x, 2);
+  x = _mm256_shuffle_f32x4 (x, x, 3);
   x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
index 1191b400134..563ded5d9df 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
@@ -12,7 +12,7 @@  volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f64x2 (x, x, 2);
+  x = _mm256_shuffle_f64x2 (x, x, 3);
   x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
index ef9a441e7a5..e89c4140d37 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
@@ -12,7 +12,7 @@  volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_i32x4 (x, x, 2);
+  x = _mm256_shuffle_i32x4 (x, x, 3);
   x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
index 0bd117e85d4..8e8e47eda38 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
@@ -12,7 +12,7 @@  volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_i64x2 (x, x, 2);
+  x = _mm256_shuffle_i64x2 (x, x, 3);
   x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
new file mode 100644
index 00000000000..1ee00b6b4a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
@@ -0,0 +1,51 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmovaps" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */
+
+#include<x86intrin.h>
+
+/* Vpermi128/Vpermf128 */
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 50);
+}
+
+__m256i
+perm1 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 18);
+}
+
+__m256i
+perm2 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 48);
+}
+
+/* vshuf{i,f}{32x4,64x2} ymm .*/
+__m256i
+shuff0 (__m256i a, __m256i b)
+{
+  return _mm256_shuffle_i32x4(a, b, 2);
+}
+
+__m256
+shuff1 (__m256 a, __m256 b)
+{
+  return _mm256_shuffle_f32x4(a, b, 2);
+}
+
+__m256i
+shuff2 (__m256i a, __m256i b)
+{
+  return _mm256_shuffle_i64x2(a, b, 2);
+}
+
+__m256d
+shuff3 (__m256d a, __m256d b)
+{
+  return _mm256_shuffle_f64x2(a, b, 2);
+}
diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
new file mode 100644
index 00000000000..9775072b97a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
@@ -0,0 +1,68 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vblendps" } } */
+/* { dg-final { scan-assembler-not "vperm2i128" } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 16);
+}
+
+__m256d
+perm1 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 16);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 16);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 16);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 20);
+}
+
+__m256d
+perm5 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 20);
+}
+
+__m256i
+perm6 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 80);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 80);
+}
+
+__m256i
+perm8 (__m256i a, __m256i b)
+{
+  return _mm256_permute2x128_si256 (a, b, 84);
+}
+
+__m256d
+perm9 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 84);
+}
diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
new file mode 100644
index 00000000000..a330b14caca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
@@ -0,0 +1,63 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmov..." 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+/* Vpermf128 */
+__m256
+perm0 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 50);
+}
+
+__m256
+perm1 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 18);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+  return _mm256_permute2f128_ps (a, b, 48);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 50);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 18);
+}
+
+__m256i
+perm5 (__m256i a, __m256i b)
+{
+  return _mm256_permute2f128_si256 (a, b, 48);
+}
+
+__m256d
+perm6 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 50);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 18);
+}
+
+__m256d
+perm8 (__m256d a, __m256d b)
+{
+  return _mm256_permute2f128_pd (a, b, 48);
+}