i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm
Checks
Commit Message
Hi, all
The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
And it has regtested on x86_64-pc-linux-gnu. OK for trunk?
Thanks.
Lin
vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk.
We can optimze them to vblend, vmovaps when there's no cross-lane.
gcc/ChangeLog:
* config/i386/sse.md: Modify insn vperm{i,f}
and vshuf{i,f}.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
* gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-1.c: New test.
* gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
---
gcc/config/i386/sse.md | 36 ++++++++--
.../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +-
.../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +-
.../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +-
.../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +-
.../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++++++++++++++
.../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++++++++++++++++++
.../gcc.target/i386/opt-vperm-vshuf-3.c | 63 +++++++++++++++++
8 files changed, 218 insertions(+), 8 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
Comments
On Tue, Apr 18, 2023 at 2:52 PM Hu, Lin1 via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi, all
>
> The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
> And it has regtested on x86_64-pc-linux-gnu. OK for trunk?
Ok.
>
> Thanks.
> Lin
>
> vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm are 3 clk.
> We can optimze them to vblend, vmovaps when there's no cross-lane.
>
> gcc/ChangeLog:
>
> * config/i386/sse.md: Modify insn vperm{i,f}
> and vshuf{i,f}.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
> * gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
> * gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
> * gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
> * gcc.target/i386/opt-vperm-vshuf-1.c: New test.
> * gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
> * gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
> ---
> gcc/config/i386/sse.md | 36 ++++++++--
> .../gcc.target/i386/avx512vl-vshuff32x4-1.c | 2 +-
> .../gcc.target/i386/avx512vl-vshuff64x2-1.c | 2 +-
> .../gcc.target/i386/avx512vl-vshufi32x4-1.c | 2 +-
> .../gcc.target/i386/avx512vl-vshufi64x2-1.c | 2 +-
> .../gcc.target/i386/opt-vperm-vshuf-1.c | 51 ++++++++++++++
> .../gcc.target/i386/opt-vperm-vshuf-2.c | 68 +++++++++++++++++++
> .../gcc.target/i386/opt-vperm-vshuf-3.c | 63 +++++++++++++++++
> 8 files changed, 218 insertions(+), 8 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 513960e8f33..5b6b2427460 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -18437,6 +18437,8 @@
> mask = INTVAL (operands[3]) / 2;
> mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
> operands[3] = GEN_INT (mask);
> + if (INTVAL (operands[3]) == 2 && !<mask_applied>)
> + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
> }
> [(set_attr "type" "sselog")
> @@ -18595,6 +18597,9 @@
> mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
> operands[3] = GEN_INT (mask);
>
> + if (INTVAL (operands[3]) == 2 && !<mask_applied>)
> + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> +
> return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
> }
> [(set_attr "type" "sselog")
> @@ -25663,7 +25668,28 @@
> (match_operand:SI 3 "const_0_to_255_operand")]
> UNSPEC_VPERMTI))]
> "TARGET_AVX2"
> - "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
> + {
> + int mask = INTVAL (operands[3]);
> + if ((mask & 0xbb) == 16)
> + {
> + if (rtx_equal_p (operands[0], operands[1]))
> + return "";
> + else
> + return "vmovaps\t{%1, %0|%0, %1}";
> + }
> + if ((mask & 0xbb) == 50)
> + {
> + if (rtx_equal_p (operands[0], operands[2]))
> + return "";
> + else
> + return "vmovaps\t{%2, %0|%0, %2}";
> + }
> + if ((mask & 0xbb) == 18)
> + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> + if ((mask & 0xbb) == 48)
> + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> + return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
> + }
> [(set_attr "type" "sselog")
> (set_attr "prefix" "vex")
> (set_attr "mode" "OI")])
> @@ -26226,9 +26252,11 @@
> && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
> {
> int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
> - if (mask == 0x12)
> - return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
> - if (mask == 0x20)
> + if ((mask & 0xbb) == 0x12)
> + return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
> + if ((mask & 0xbb) == 0x30)
> + return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
> + if ((mask & 0xbb) == 0x20)
> return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
> operands[3] = GEN_INT (mask);
> return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> index 6c2fb2f184a..02aecf4edce 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
> void extern
> avx512vl_test (void)
> {
> - x = _mm256_shuffle_f32x4 (x, x, 2);
> + x = _mm256_shuffle_f32x4 (x, x, 3);
> x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
> x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> index 1191b400134..563ded5d9df 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
> void extern
> avx512vl_test (void)
> {
> - x = _mm256_shuffle_f64x2 (x, x, 2);
> + x = _mm256_shuffle_f64x2 (x, x, 3);
> x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
> x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> index ef9a441e7a5..e89c4140d37 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi32x4-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
> void extern
> avx512vl_test (void)
> {
> - x = _mm256_shuffle_i32x4 (x, x, 2);
> + x = _mm256_shuffle_i32x4 (x, x, 3);
> x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2);
> x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> index 0bd117e85d4..8e8e47eda38 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshufi64x2-1.c
> @@ -12,7 +12,7 @@ volatile __mmask8 m;
> void extern
> avx512vl_test (void)
> {
> - x = _mm256_shuffle_i64x2 (x, x, 2);
> + x = _mm256_shuffle_i64x2 (x, x, 3);
> x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2);
> x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2);
> }
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
> new file mode 100644
> index 00000000000..1ee00b6b4a1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-times "vmovaps" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */
> +
> +#include<x86intrin.h>
> +
> +/* Vpermi128/Vpermf128 */
> +__m256i
> +perm0 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 50);
> +}
> +
> +__m256i
> +perm1 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 18);
> +}
> +
> +__m256i
> +perm2 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 48);
> +}
> +
> +/* vshuf{i,f}{32x4,64x2} ymm .*/
> +__m256i
> +shuff0 (__m256i a, __m256i b)
> +{
> + return _mm256_shuffle_i32x4(a, b, 2);
> +}
> +
> +__m256
> +shuff1 (__m256 a, __m256 b)
> +{
> + return _mm256_shuffle_f32x4(a, b, 2);
> +}
> +
> +__m256i
> +shuff2 (__m256i a, __m256i b)
> +{
> + return _mm256_shuffle_i64x2(a, b, 2);
> +}
> +
> +__m256d
> +shuff3 (__m256d a, __m256d b)
> +{
> + return _mm256_shuffle_f64x2(a, b, 2);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
> new file mode 100644
> index 00000000000..9775072b97a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
> @@ -0,0 +1,68 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vblendps" } } */
> +/* { dg-final { scan-assembler-not "vperm2i128" } } */
> +/* { dg-final { scan-assembler-not "vperm2f128" } } */
> +
> +#include<x86intrin.h>
> +
> +__m256i
> +perm0 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 16);
> +}
> +
> +__m256d
> +perm1 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 16);
> +}
> +
> +__m256
> +perm2 (__m256 a, __m256 b)
> +{
> + return _mm256_permute2f128_ps (a, b, 16);
> +}
> +
> +__m256i
> +perm3 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2f128_si256 (a, b, 16);
> +}
> +
> +__m256i
> +perm4 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 20);
> +}
> +
> +__m256d
> +perm5 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 20);
> +}
> +
> +__m256i
> +perm6 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 80);
> +}
> +
> +__m256d
> +perm7 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 80);
> +}
> +
> +__m256i
> +perm8 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2x128_si256 (a, b, 84);
> +}
> +
> +__m256d
> +perm9 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 84);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
> new file mode 100644
> index 00000000000..a330b14caca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c
> @@ -0,0 +1,63 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=sapphirerapids" } */
> +/* { dg-final { scan-assembler-times "vmov..." 3 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */
> +/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */
> +/* { dg-final { scan-assembler-not "vperm2f128" } } */
> +
> +#include<x86intrin.h>
> +
> +/* Vpermf128 */
> +__m256
> +perm0 (__m256 a, __m256 b)
> +{
> + return _mm256_permute2f128_ps (a, b, 50);
> +}
> +
> +__m256
> +perm1 (__m256 a, __m256 b)
> +{
> + return _mm256_permute2f128_ps (a, b, 18);
> +}
> +
> +__m256
> +perm2 (__m256 a, __m256 b)
> +{
> + return _mm256_permute2f128_ps (a, b, 48);
> +}
> +
> +__m256i
> +perm3 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2f128_si256 (a, b, 50);
> +}
> +
> +__m256i
> +perm4 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2f128_si256 (a, b, 18);
> +}
> +
> +__m256i
> +perm5 (__m256i a, __m256i b)
> +{
> + return _mm256_permute2f128_si256 (a, b, 48);
> +}
> +
> +__m256d
> +perm6 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 50);
> +}
> +
> +__m256d
> +perm7 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 18);
> +}
> +
> +__m256d
> +perm8 (__m256d a, __m256d b)
> +{
> + return _mm256_permute2f128_pd (a, b, 48);
> +}
> --
> 2.31.1
>
@@ -18437,6 +18437,8 @@
mask = INTVAL (operands[3]) / 2;
mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
return "vshuf<shuffletype>64x2\t{%3, %2, %1, %0<mask_operand7>|%0<mask_operand7>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
@@ -18595,6 +18597,9 @@
mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
operands[3] = GEN_INT (mask);
+ if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
return "vshuf<shuffletype>32x4\t{%3, %2, %1, %0<mask_operand11>|%0<mask_operand11>, %1, %2, %3}";
}
[(set_attr "type" "sselog")
@@ -25663,7 +25668,28 @@
(match_operand:SI 3 "const_0_to_255_operand")]
UNSPEC_VPERMTI))]
"TARGET_AVX2"
- "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ {
+ int mask = INTVAL (operands[3]);
+ if ((mask & 0xbb) == 16)
+ {
+ if (rtx_equal_p (operands[0], operands[1]))
+ return "";
+ else
+ return "vmovaps\t{%1, %0|%0, %1}";
+ }
+ if ((mask & 0xbb) == 50)
+ {
+ if (rtx_equal_p (operands[0], operands[2]))
+ return "";
+ else
+ return "vmovaps\t{%2, %0|%0, %2}";
+ }
+ if ((mask & 0xbb) == 18)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 48)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+ }
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
@@ -26226,9 +26252,11 @@
&& avx_vperm2f128_parallel (operands[3], <MODE>mode)"
{
int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
- if (mask == 0x12)
- return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
- if (mask == 0x20)
+ if ((mask & 0xbb) == 0x12)
+ return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+ if ((mask & 0xbb) == 0x30)
+ return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+ if ((mask & 0xbb) == 0x20)
return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
operands[3] = GEN_INT (mask);
return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
@@ -12,7 +12,7 @@ volatile __mmask8 m;
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_f32x4 (x, x, 2);
+ x = _mm256_shuffle_f32x4 (x, x, 3);
x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
}
@@ -12,7 +12,7 @@ volatile __mmask8 m;
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_f64x2 (x, x, 2);
+ x = _mm256_shuffle_f64x2 (x, x, 3);
x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
}
@@ -12,7 +12,7 @@ volatile __mmask8 m;
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_i32x4 (x, x, 2);
+ x = _mm256_shuffle_i32x4 (x, x, 3);
x = _mm256_mask_shuffle_i32x4 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_i32x4 (m, x, x, 2);
}
@@ -12,7 +12,7 @@ volatile __mmask8 m;
void extern
avx512vl_test (void)
{
- x = _mm256_shuffle_i64x2 (x, x, 2);
+ x = _mm256_shuffle_i64x2 (x, x, 3);
x = _mm256_mask_shuffle_i64x2 (x, m, x, x, 2);
x = _mm256_maskz_shuffle_i64x2 (m, x, x, 2);
}
new file mode 100644
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmovaps" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 1 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 5 } } */
+
+#include<x86intrin.h>
+
+/* Vpermi128/Vpermf128 */
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 50);
+}
+
+__m256i
+perm1 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 18);
+}
+
+__m256i
+perm2 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 48);
+}
+
+/* vshuf{i,f}{32x4,64x2} ymm .*/
+__m256i
+shuff0 (__m256i a, __m256i b)
+{
+ return _mm256_shuffle_i32x4(a, b, 2);
+}
+
+__m256
+shuff1 (__m256 a, __m256 b)
+{
+ return _mm256_shuffle_f32x4(a, b, 2);
+}
+
+__m256i
+shuff2 (__m256i a, __m256i b)
+{
+ return _mm256_shuffle_i64x2(a, b, 2);
+}
+
+__m256d
+shuff3 (__m256d a, __m256d b)
+{
+ return _mm256_shuffle_f64x2(a, b, 2);
+}
new file mode 100644
@@ -0,0 +1,68 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vblendps" } } */
+/* { dg-final { scan-assembler-not "vperm2i128" } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+__m256i
+perm0 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 16);
+}
+
+__m256d
+perm1 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 16);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+ return _mm256_permute2f128_ps (a, b, 16);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+ return _mm256_permute2f128_si256 (a, b, 16);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 20);
+}
+
+__m256d
+perm5 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 20);
+}
+
+__m256i
+perm6 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 80);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 80);
+}
+
+__m256i
+perm8 (__m256i a, __m256i b)
+{
+ return _mm256_permute2x128_si256 (a, b, 84);
+}
+
+__m256d
+perm9 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 84);
+}
new file mode 100644
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=sapphirerapids" } */
+/* { dg-final { scan-assembler-times "vmov..." 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$15" 3 } } */
+/* { dg-final { scan-assembler-times "vblendps\t\\\$240" 3 } } */
+/* { dg-final { scan-assembler-not "vperm2f128" } } */
+
+#include<x86intrin.h>
+
+/* Vpermf128 */
+__m256
+perm0 (__m256 a, __m256 b)
+{
+ return _mm256_permute2f128_ps (a, b, 50);
+}
+
+__m256
+perm1 (__m256 a, __m256 b)
+{
+ return _mm256_permute2f128_ps (a, b, 18);
+}
+
+__m256
+perm2 (__m256 a, __m256 b)
+{
+ return _mm256_permute2f128_ps (a, b, 48);
+}
+
+__m256i
+perm3 (__m256i a, __m256i b)
+{
+ return _mm256_permute2f128_si256 (a, b, 50);
+}
+
+__m256i
+perm4 (__m256i a, __m256i b)
+{
+ return _mm256_permute2f128_si256 (a, b, 18);
+}
+
+__m256i
+perm5 (__m256i a, __m256i b)
+{
+ return _mm256_permute2f128_si256 (a, b, 48);
+}
+
+__m256d
+perm6 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 50);
+}
+
+__m256d
+perm7 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 18);
+}
+
+__m256d
+perm8 (__m256d a, __m256d b)
+{
+ return _mm256_permute2f128_pd (a, b, 48);
+}