Lower complex type moves to enable vectorization of complex type loads and stores.
Commit Message
> My original comments still stand (it feels like this should be more generic).
> Can we go the way lowering complex loads/stores first? A large part
> of the testcases
> added by the patch should pass after that.
This is the patch as suggested; one additional change is the handling of
COMPLEX_CST for the rhs. It will enable vectorization for pr106010-8a.c.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
2022-07-20 Richard Biener <richard.guenther@gmail.com>
Hongtao Liu <hongtao.liu@intel.com>
gcc/ChangeLog:
PR tree-optimization/106010
* tree-complex.cc (init_dont_simulate_again): Lower complex
type move.
(expand_complex_move): Also expand COMPLEX_CST for rhs.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr106010-1a.c: New test.
* gcc.target/i386/pr106010-1b.c: New test.
* gcc.target/i386/pr106010-1c.c: New test.
* gcc.target/i386/pr106010-2a.c: New test.
* gcc.target/i386/pr106010-2b.c: New test.
* gcc.target/i386/pr106010-2c.c: New test.
* gcc.target/i386/pr106010-3a.c: New test.
* gcc.target/i386/pr106010-3b.c: New test.
* gcc.target/i386/pr106010-3c.c: New test.
* gcc.target/i386/pr106010-4a.c: New test.
* gcc.target/i386/pr106010-4b.c: New test.
* gcc.target/i386/pr106010-4c.c: New test.
* gcc.target/i386/pr106010-5a.c: New test.
* gcc.target/i386/pr106010-5b.c: New test.
* gcc.target/i386/pr106010-5c.c: New test.
* gcc.target/i386/pr106010-6a.c: New test.
* gcc.target/i386/pr106010-6b.c: New test.
* gcc.target/i386/pr106010-6c.c: New test.
* gcc.target/i386/pr106010-7a.c: New test.
* gcc.target/i386/pr106010-7b.c: New test.
* gcc.target/i386/pr106010-7c.c: New test.
* gcc.target/i386/pr106010-8a.c: New test.
* gcc.target/i386/pr106010-8b.c: New test.
* gcc.target/i386/pr106010-8c.c: New test.
* gcc.target/i386/pr106010-9a.c: New test.
* gcc.target/i386/pr106010-9b.c: New test.
* gcc.target/i386/pr106010-9c.c: New test.
* gcc.target/i386/pr106010-9d.c: New test.
---
gcc/testsuite/gcc.target/i386/pr106010-1a.c | 58 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-1b.c | 63 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-1c.c | 41 +++++
gcc/testsuite/gcc.target/i386/pr106010-2a.c | 82 ++++++++++
gcc/testsuite/gcc.target/i386/pr106010-2b.c | 62 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-2c.c | 47 ++++++
gcc/testsuite/gcc.target/i386/pr106010-3a.c | 80 ++++++++++
gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
gcc/testsuite/gcc.target/i386/pr106010-3c.c | 69 +++++++++
gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
gcc/testsuite/gcc.target/i386/pr106010-4b.c | 67 +++++++++
gcc/testsuite/gcc.target/i386/pr106010-4c.c | 54 +++++++
gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
gcc/testsuite/gcc.target/i386/pr106010-5b.c | 80 ++++++++++
gcc/testsuite/gcc.target/i386/pr106010-5c.c | 62 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr106010-6c.c | 80 ++++++++++
gcc/testsuite/gcc.target/i386/pr106010-7a.c | 58 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-7b.c | 63 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-7c.c | 41 +++++
gcc/testsuite/gcc.target/i386/pr106010-8a.c | 58 ++++++++
gcc/testsuite/gcc.target/i386/pr106010-8b.c | 53 +++++++
gcc/testsuite/gcc.target/i386/pr106010-8c.c | 38 +++++
gcc/testsuite/gcc.target/i386/pr106010-9a.c | 89 +++++++++++
gcc/testsuite/gcc.target/i386/pr106010-9b.c | 90 +++++++++++
gcc/testsuite/gcc.target/i386/pr106010-9c.c | 90 +++++++++++
gcc/testsuite/gcc.target/i386/pr106010-9d.c | 92 ++++++++++++
gcc/tree-complex.cc | 9 +-
29 files changed, 2141 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
Comments
On Wed, Jul 20, 2022 at 4:46 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> > My original comments still stand (it feels like this should be more generic).
> > Can we go the way lowering complex loads/stores first? A large part
> > of the testcases
> > added by the patch should pass after that.
>
> This is the patch as suggested; one additional change is the handling of
> COMPLEX_CST for the rhs. It will enable vectorization for pr106010-8a.c.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
OK.
Are there any cases left that your vectorizer patch handles but this one does not?
Thanks,
Richard.
> 2022-07-20 Richard Biener <richard.guenther@gmail.com>
> Hongtao Liu <hongtao.liu@intel.com>
>
> gcc/ChangeLog:
>
> PR tree-optimization/106010
> * tree-complex.cc (init_dont_simulate_again): Lower complex
> type move.
> (expand_complex_move): Also expand COMPLEX_CST for rhs.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr106010-1a.c: New test.
> * gcc.target/i386/pr106010-1b.c: New test.
> * gcc.target/i386/pr106010-1c.c: New test.
> * gcc.target/i386/pr106010-2a.c: New test.
> * gcc.target/i386/pr106010-2b.c: New test.
> * gcc.target/i386/pr106010-2c.c: New test.
> * gcc.target/i386/pr106010-3a.c: New test.
> * gcc.target/i386/pr106010-3b.c: New test.
> * gcc.target/i386/pr106010-3c.c: New test.
> * gcc.target/i386/pr106010-4a.c: New test.
> * gcc.target/i386/pr106010-4b.c: New test.
> * gcc.target/i386/pr106010-4c.c: New test.
> * gcc.target/i386/pr106010-5a.c: New test.
> * gcc.target/i386/pr106010-5b.c: New test.
> * gcc.target/i386/pr106010-5c.c: New test.
> * gcc.target/i386/pr106010-6a.c: New test.
> * gcc.target/i386/pr106010-6b.c: New test.
> * gcc.target/i386/pr106010-6c.c: New test.
> * gcc.target/i386/pr106010-7a.c: New test.
> * gcc.target/i386/pr106010-7b.c: New test.
> * gcc.target/i386/pr106010-7c.c: New test.
> * gcc.target/i386/pr106010-8a.c: New test.
> * gcc.target/i386/pr106010-8b.c: New test.
> * gcc.target/i386/pr106010-8c.c: New test.
> * gcc.target/i386/pr106010-9a.c: New test.
> * gcc.target/i386/pr106010-9b.c: New test.
> * gcc.target/i386/pr106010-9c.c: New test.
> * gcc.target/i386/pr106010-9d.c: New test.
> ---
> gcc/testsuite/gcc.target/i386/pr106010-1a.c | 58 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-1b.c | 63 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-1c.c | 41 +++++
> gcc/testsuite/gcc.target/i386/pr106010-2a.c | 82 ++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-2b.c | 62 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-2c.c | 47 ++++++
> gcc/testsuite/gcc.target/i386/pr106010-3a.c | 80 ++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-3c.c | 69 +++++++++
> gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-4b.c | 67 +++++++++
> gcc/testsuite/gcc.target/i386/pr106010-4c.c | 54 +++++++
> gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-5b.c | 80 ++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-5c.c | 62 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-6c.c | 80 ++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-7a.c | 58 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-7b.c | 63 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-7c.c | 41 +++++
> gcc/testsuite/gcc.target/i386/pr106010-8a.c | 58 ++++++++
> gcc/testsuite/gcc.target/i386/pr106010-8b.c | 53 +++++++
> gcc/testsuite/gcc.target/i386/pr106010-8c.c | 38 +++++
> gcc/testsuite/gcc.target/i386/pr106010-9a.c | 89 +++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-9b.c | 90 +++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-9c.c | 90 +++++++++++
> gcc/testsuite/gcc.target/i386/pr106010-9d.c | 92 ++++++++++++
> gcc/tree-complex.cc | 9 +-
> 29 files changed, 2141 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> new file mode 100644
> index 00000000000..b608f484934
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> new file mode 100644
> index 00000000000..0f377c3a548
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-1a.c"
> +
> +void
> +avx_test (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> + _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> + _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> + _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> + _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> + _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> + char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> + for (int i = 0; i != 2 * N * sizeof (double); i++)
> + p_init[i] = i;
> +
> + memcpy (pd_src, p_init, 2 * N * sizeof (double));
> + memcpy (ps_src, p_init, 2 * N * sizeof (float));
> + memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> + memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> + memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> + memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> +
> + foo_pd (pd_dst, pd_src);
> + foo_ps (ps_dst, ps_src);
> + foo_epi64 (epi64_dst, epi64_src);
> + foo_epi32 (epi32_dst, epi32_src);
> + foo_epi16 (epi16_dst, epi16_src);
> + foo_epi8 (epi8_dst, epi8_src);
> + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> new file mode 100644
> index 00000000000..f07e9fb2d3d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[i];
> +}
> +
> +static void
> +do_test (void)
> +{
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> + char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> + for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> + p_init[i] = i;
> +
> + memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> + foo_ph (ph_dst, ph_src);
> + if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> + __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> new file mode 100644
> index 00000000000..d2e2f8d4f43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> @@ -0,0 +1,82 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> + a[3] = b[3];
> +
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> + a[3] = b[3];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> + a[3] = b[3];
> + a[4] = b[4];
> + a[5] = b[5];
> + a[6] = b[6];
> + a[7] = b[7];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> + a[3] = b[3];
> + a[4] = b[4];
> + a[5] = b[5];
> + a[6] = b[6];
> + a[7] = b[7];
> + a[8] = b[8];
> + a[9] = b[9];
> + a[10] = b[10];
> + a[11] = b[11];
> + a[12] = b[12];
> + a[13] = b[13];
> + a[14] = b[14];
> + a[15] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> new file mode 100644
> index 00000000000..ac360752693
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-2a.c"
> +
> +void
> +avx_test (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (32);
> + _Complex double* pd_dst = (_Complex double*) malloc (32);
> + _Complex float* ps_src = (_Complex float*) malloc (32);
> + _Complex float* ps_dst = (_Complex float*) malloc (32);
> + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> + _Complex int* epi32_src = (_Complex int*) malloc (32);
> + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> + _Complex short* epi16_src = (_Complex short*) malloc (32);
> + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> + _Complex char* epi8_src = (_Complex char*) malloc (32);
> + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> + char* p = (char* ) malloc (32);
> +
> + __builtin_memset (pd_dst, 0, 32);
> + __builtin_memset (ps_dst, 0, 32);
> + __builtin_memset (epi64_dst, 0, 32);
> + __builtin_memset (epi32_dst, 0, 32);
> + __builtin_memset (epi16_dst, 0, 32);
> + __builtin_memset (epi8_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i;
> + __builtin_memcpy (pd_src, p, 32);
> + __builtin_memcpy (ps_src, p, 32);
> + __builtin_memcpy (epi64_src, p, 32);
> + __builtin_memcpy (epi32_src, p, 32);
> + __builtin_memcpy (epi16_src, p, 32);
> + __builtin_memcpy (epi8_src, p, 32);
> +
> + foo_pd (pd_dst, pd_src);
> + foo_ps (ps_dst, ps_src);
> + foo_epi64 (epi64_dst, epi64_src);
> + foo_epi32 (epi32_dst, epi32_src);
> + foo_epi16 (epi16_dst, epi16_src);
> + foo_epi8 (epi8_dst, epi8_src);
> + if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> new file mode 100644
> index 00000000000..a002f209ec9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> + a[0] = b[0];
> + a[1] = b[1];
> + a[2] = b[2];
> + a[3] = b[3];
> + a[4] = b[4];
> + a[5] = b[5];
> + a[6] = b[6];
> + a[7] = b[7];
> +}
> +
> +void
> +do_test (void)
> +{
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> + char* p = (char* ) malloc (32);
> +
> + __builtin_memset (ph_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i;
> + __builtin_memcpy (ph_src, p, 32);
> +
> + foo_ph (ph_dst, ph_src);
> + if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> new file mode 100644
> index 00000000000..c1b64b56b1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> @@ -0,0 +1,80 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> + a[0] = b[1];
> + a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> + a[0] = b[1];
> + a[1] = b[0];
> + a[2] = b[3];
> + a[3] = b[2];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> + a[0] = b[1];
> + a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> + a[0] = b[3];
> + a[1] = b[2];
> + a[2] = b[1];
> + a[3] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> + a[0] = b[7];
> + a[1] = b[6];
> + a[2] = b[5];
> + a[3] = b[4];
> + a[4] = b[3];
> + a[5] = b[2];
> + a[6] = b[1];
> + a[7] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> + a[0] = b[7];
> + a[1] = b[6];
> + a[2] = b[5];
> + a[3] = b[4];
> + a[4] = b[3];
> + a[5] = b[2];
> + a[6] = b[1];
> + a[7] = b[0];
> + a[8] = b[15];
> + a[9] = b[14];
> + a[10] = b[13];
> + a[11] = b[12];
> + a[12] = b[11];
> + a[13] = b[10];
> + a[14] = b[9];
> + a[15] = b[8];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> new file mode 100644
> index 00000000000..e4fa3f3a541
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> @@ -0,0 +1,126 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-3a.c"
> +
> +void
> +avx2_test (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (32);
> + _Complex double* pd_dst = (_Complex double*) malloc (32);
> + _Complex double* pd_exp = (_Complex double*) malloc (32);
> + _Complex float* ps_src = (_Complex float*) malloc (32);
> + _Complex float* ps_dst = (_Complex float*) malloc (32);
> + _Complex float* ps_exp = (_Complex float*) malloc (32);
> + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> + _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> + _Complex int* epi32_src = (_Complex int*) malloc (32);
> + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> + _Complex int* epi32_exp = (_Complex int*) malloc (32);
> + _Complex short* epi16_src = (_Complex short*) malloc (32);
> + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> + _Complex short* epi16_exp = (_Complex short*) malloc (32);
> + _Complex char* epi8_src = (_Complex char*) malloc (32);
> + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> + _Complex char* epi8_exp = (_Complex char*) malloc (32);
> + char* p = (char* ) malloc (32);
> + char* q = (char* ) malloc (32);
> +
> + __builtin_memset (pd_dst, 0, 32);
> + __builtin_memset (ps_dst, 0, 32);
> + __builtin_memset (epi64_dst, 0, 32);
> + __builtin_memset (epi32_dst, 0, 32);
> + __builtin_memset (epi16_dst, 0, 32);
> + __builtin_memset (epi8_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i;
> + __builtin_memcpy (pd_src, p, 32);
> + __builtin_memcpy (ps_src, p, 32);
> + __builtin_memcpy (epi64_src, p, 32);
> + __builtin_memcpy (epi32_src, p, 32);
> + __builtin_memcpy (epi16_src, p, 32);
> + __builtin_memcpy (epi8_src, p, 32);
> +
> + for (int i = 0; i != 16; i++)
> + {
> + p[i] = i + 16;
> + p[i + 16] = i;
> + }
> + __builtin_memcpy (pd_exp, p, 32);
> + __builtin_memcpy (epi64_exp, p, 32);
> +
> + for (int i = 0; i != 8; i++)
> + {
> + p[i] = i + 8;
> + p[i + 8] = i;
> + p[i + 16] = i + 24;
> + p[i + 24] = i + 16;
> + q[i] = i + 24;
> + q[i + 8] = i + 16;
> + q[i + 16] = i + 8;
> + q[i + 24] = i;
> + }
> + __builtin_memcpy (ps_exp, p, 32);
> + __builtin_memcpy (epi32_exp, q, 32);
> +
> +
> + for (int i = 0; i != 4; i++)
> + {
> + q[i] = i + 28;
> + q[i + 4] = i + 24;
> + q[i + 8] = i + 20;
> + q[i + 12] = i + 16;
> + q[i + 16] = i + 12;
> + q[i + 20] = i + 8;
> + q[i + 24] = i + 4;
> + q[i + 28] = i;
> + }
> + __builtin_memcpy (epi16_exp, q, 32);
> +
> + for (int i = 0; i != 2; i++)
> + {
> + q[i] = i + 14;
> + q[i + 2] = i + 12;
> + q[i + 4] = i + 10;
> + q[i + 6] = i + 8;
> + q[i + 8] = i + 6;
> + q[i + 10] = i + 4;
> + q[i + 12] = i + 2;
> + q[i + 14] = i;
> + q[i + 16] = i + 30;
> + q[i + 18] = i + 28;
> + q[i + 20] = i + 26;
> + q[i + 22] = i + 24;
> + q[i + 24] = i + 22;
> + q[i + 26] = i + 20;
> + q[i + 28] = i + 18;
> + q[i + 30] = i + 16;
> + }
> + __builtin_memcpy (epi8_exp, q, 32);
> +
> + foo_pd (pd_dst, pd_src);
> + foo_ps (ps_dst, ps_src);
> + foo_epi64 (epi64_dst, epi64_src);
> + foo_epi32 (epi32_dst, epi32_src);
> + foo_epi16 (epi16_dst, epi16_src);
> + foo_epi8 (epi8_dst, epi8_src);
> + if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> new file mode 100644
> index 00000000000..5a5a3d4b992
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> @@ -0,0 +1,69 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{ /* a[0..7] = b[1], b[0], b[4], b[3], b[7], b[6], b[2], b[5]: an irregular permutation of eight complex elements. */
> + a[0] = b[1];
> + a[1] = b[0];
> + a[2] = b[4];
> + a[3] = b[3];
> + a[4] = b[7];
> + a[5] = b[6];
> + a[6] = b[2];
> + a[7] = b[5];
> +}
> +
> +void
> +do_test (void)
> +{ /* Runs foo_ph on a 32-byte pattern and aborts unless the result matches the hand-built expected image. */
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> + char* p = (char* ) malloc (32);
> + char* q = (char* ) malloc (32); /* NOTE(review): q is filled in the loop below but never read afterwards. */
> +
> + __builtin_memset (ph_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i; /* source byte k holds the value k */
> + __builtin_memcpy (ph_src, p, 32);
> +
> + for (int i = 0; i != 4; i++) /* p is reused here: it now becomes the expected image, in 4-byte groups */
> + {
> + p[i] = i + 4;
> + p[i + 4] = i;
> + p[i + 8] = i + 16;
> + p[i + 12] = i + 12;
> + p[i + 16] = i + 28;
> + p[i + 20] = i + 24;
> + p[i + 24] = i + 8;
> + p[i + 28] = i + 20;
> + q[i] = i + 28;
> + q[i + 4] = i + 24;
> + q[i + 8] = i + 20;
> + q[i + 12] = i + 16;
> + q[i + 16] = i + 12;
> + q[i + 20] = i + 8;
> + q[i + 24] = i + 4;
> + q[i + 28] = i;
> + }
> + __builtin_memcpy (ph_exp, p, 32);
> +
> + foo_ph (ph_dst, ph_src);
> + if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> new file mode 100644
> index 00000000000..b7b0b532bb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a,
> + _Complex double b1,
> + _Complex double b2)
> +{ /* Stores the by-value arguments b1, b2 to a[0], a[1]. */
> + a[0] = b1;
> + a[1] = b2;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a,
> + _Complex float b1, _Complex float b2,
> + _Complex float b3, _Complex float b4)
> +{ /* Stores the by-value arguments b1..b4 to a[0..3] in order. */
> + a[0] = b1;
> + a[1] = b2;
> + a[2] = b3;
> + a[3] = b4;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a,
> + _Complex long long b1,
> + _Complex long long b2)
> +{ /* Stores the by-value arguments b1, b2 to a[0], a[1]. */
> + a[0] = b1;
> + a[1] = b2;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a,
> + _Complex int b1, _Complex int b2,
> + _Complex int b3, _Complex int b4)
> +{ /* Stores the by-value arguments b1..b4 to a[0..3] in order. */
> + a[0] = b1;
> + a[1] = b2;
> + a[2] = b3;
> + a[3] = b4;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a,
> + _Complex short b1, _Complex short b2,
> + _Complex short b3, _Complex short b4,
> + _Complex short b5, _Complex short b6,
> + _Complex short b7,_Complex short b8)
> +{ /* Stores the by-value arguments b1..b8 to a[0..7] in order. */
> + a[0] = b1;
> + a[1] = b2;
> + a[2] = b3;
> + a[3] = b4;
> + a[4] = b5;
> + a[5] = b6;
> + a[6] = b7;
> + a[7] = b8;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a,
> + _Complex char b1, _Complex char b2,
> + _Complex char b3, _Complex char b4,
> + _Complex char b5, _Complex char b6,
> + _Complex char b7,_Complex char b8,
> + _Complex char b9, _Complex char b10,
> + _Complex char b11, _Complex char b12,
> + _Complex char b13, _Complex char b14,
> + _Complex char b15,_Complex char b16)
> +{ /* Stores the by-value arguments b1..b16 to a[0..15] in order. */
> + a[0] = b1;
> + a[1] = b2;
> + a[2] = b3;
> + a[3] = b4;
> + a[4] = b5;
> + a[5] = b6;
> + a[6] = b7;
> + a[7] = b8;
> + a[8] = b9;
> + a[9] = b10;
> + a[10] = b11;
> + a[11] = b12;
> + a[12] = b13;
> + a[13] = b14;
> + a[14] = b15;
> + a[15] = b16;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> new file mode 100644
> index 00000000000..e2e79508c4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-4a.c"
> +
> +void
> +avx_test (void)
> +{ /* Passes the elements of each 32-byte source by value to the foo_* kernels and requires dst == src bytewise. */
> + _Complex double* pd_src = (_Complex double*) malloc (32);
> + _Complex double* pd_dst = (_Complex double*) malloc (32);
> + _Complex float* ps_src = (_Complex float*) malloc (32);
> + _Complex float* ps_dst = (_Complex float*) malloc (32);
> + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> + _Complex int* epi32_src = (_Complex int*) malloc (32);
> + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> + _Complex short* epi16_src = (_Complex short*) malloc (32);
> + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> + _Complex char* epi8_src = (_Complex char*) malloc (32);
> + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> + char* p = (char* ) malloc (32);
> +
> + __builtin_memset (pd_dst, 0, 32);
> + __builtin_memset (ps_dst, 0, 32);
> + __builtin_memset (epi64_dst, 0, 32);
> + __builtin_memset (epi32_dst, 0, 32);
> + __builtin_memset (epi16_dst, 0, 32);
> + __builtin_memset (epi8_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i; /* byte k holds the value k; the same image seeds every source buffer */
> + __builtin_memcpy (pd_src, p, 32);
> + __builtin_memcpy (ps_src, p, 32);
> + __builtin_memcpy (epi64_src, p, 32);
> + __builtin_memcpy (epi32_src, p, 32);
> + __builtin_memcpy (epi16_src, p, 32);
> + __builtin_memcpy (epi8_src, p, 32);
> +
> + foo_pd (pd_dst, pd_src[0], pd_src[1]);
> + foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> + foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> + foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> + foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> + epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> + foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> + epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> + epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> + epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> +
> + if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> new file mode 100644
> index 00000000000..8e02aefe3b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a,
> + _Complex _Float16 b1, _Complex _Float16 b2,
> + _Complex _Float16 b3, _Complex _Float16 b4,
> + _Complex _Float16 b5, _Complex _Float16 b6,
> + _Complex _Float16 b7,_Complex _Float16 b8)
> +{ /* Stores the by-value arguments b1..b8 to a[0..7] in order. */
> + a[0] = b1;
> + a[1] = b2;
> + a[2] = b3;
> + a[3] = b4;
> + a[4] = b5;
> + a[5] = b6;
> + a[6] = b7;
> + a[7] = b8;
> +}
> +
> +void
> +do_test (void)
> +{ /* Passes the eight elements of a 32-byte source by value to foo_ph and requires dst == src bytewise. */
> +
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +
> + char* p = (char* ) malloc (32);
> +
> + __builtin_memset (ph_dst, 0, 32);
> +
> + for (int i = 0; i != 32; i++)
> + p[i] = i; /* byte k holds the value k */
> +
> + __builtin_memcpy (ph_src, p, 32);
> +
> + foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> + ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> +
> + if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> + __builtin_abort ();
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> new file mode 100644
> index 00000000000..9d4a6f9846b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> @@ -0,0 +1,117 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{ /* a[0..1] = b[2..3] and a[2..3] = b[0..1]: swaps the two halves of a four-element block. */
> + a[0] = b[2];
> + a[1] = b[3];
> + a[2] = b[0];
> + a[3] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{ /* a[0..3] = b[4..7] and a[4..7] = b[0..3]: swaps the two halves of an eight-element block. */
> + a[0] = b[4];
> + a[1] = b[5];
> + a[2] = b[6];
> + a[3] = b[7];
> + a[4] = b[0];
> + a[5] = b[1];
> + a[6] = b[2];
> + a[7] = b[3];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{ /* a[0..1] = b[2..3] and a[2..3] = b[0..1]: swaps the two halves of a four-element block. */
> + a[0] = b[2];
> + a[1] = b[3];
> + a[2] = b[0];
> + a[3] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{ /* a[0..3] = b[4..7] and a[4..7] = b[0..3]: swaps the two halves of an eight-element block. */
> + a[0] = b[4];
> + a[1] = b[5];
> + a[2] = b[6];
> + a[3] = b[7];
> + a[4] = b[0];
> + a[5] = b[1];
> + a[6] = b[2];
> + a[7] = b[3];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{ /* a[0..7] = b[8..15] and a[8..15] = b[0..7]: swaps the two halves of a sixteen-element block. */
> + a[0] = b[8];
> + a[1] = b[9];
> + a[2] = b[10];
> + a[3] = b[11];
> + a[4] = b[12];
> + a[5] = b[13];
> + a[6] = b[14];
> + a[7] = b[15];
> + a[8] = b[0];
> + a[9] = b[1];
> + a[10] = b[2];
> + a[11] = b[3];
> + a[12] = b[4];
> + a[13] = b[5];
> + a[14] = b[6];
> + a[15] = b[7];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{ /* a[0..15] = b[16..31] and a[16..31] = b[0..15]: swaps the two halves of a 32-element block. */
> + a[0] = b[16];
> + a[1] = b[17];
> + a[2] = b[18];
> + a[3] = b[19];
> + a[4] = b[20];
> + a[5] = b[21];
> + a[6] = b[22];
> + a[7] = b[23];
> + a[8] = b[24];
> + a[9] = b[25];
> + a[10] = b[26];
> + a[11] = b[27];
> + a[12] = b[28];
> + a[13] = b[29];
> + a[14] = b[30];
> + a[15] = b[31];
> + a[16] = b[0];
> + a[17] = b[1];
> + a[18] = b[2];
> + a[19] = b[3];
> + a[20] = b[4];
> + a[21] = b[5];
> + a[22] = b[6];
> + a[23] = b[7];
> + a[24] = b[8];
> + a[25] = b[9];
> + a[26] = b[10];
> + a[27] = b[11];
> + a[28] = b[12];
> + a[29] = b[13];
> + a[30] = b[14];
> + a[31] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> new file mode 100644
> index 00000000000..d5c6ebeb5cf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-5a.c"
> +
> +void
> +avx_test (void)
> +{ /* Runs every foo_* on a 64-byte pattern and aborts unless each result equals the pattern with its 32-byte halves swapped. */
> + _Complex double* pd_src = (_Complex double*) malloc (64);
> + _Complex double* pd_dst = (_Complex double*) malloc (64);
> + _Complex double* pd_exp = (_Complex double*) malloc (64);
> + _Complex float* ps_src = (_Complex float*) malloc (64);
> + _Complex float* ps_dst = (_Complex float*) malloc (64);
> + _Complex float* ps_exp = (_Complex float*) malloc (64);
> + _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> + _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> + _Complex int* epi32_src = (_Complex int*) malloc (64);
> + _Complex int* epi32_dst = (_Complex int*) malloc (64);
> + _Complex int* epi32_exp = (_Complex int*) malloc (64);
> + _Complex short* epi16_src = (_Complex short*) malloc (64);
> + _Complex short* epi16_dst = (_Complex short*) malloc (64);
> + _Complex short* epi16_exp = (_Complex short*) malloc (64);
> + _Complex char* epi8_src = (_Complex char*) malloc (64);
> + _Complex char* epi8_dst = (_Complex char*) malloc (64);
> + _Complex char* epi8_exp = (_Complex char*) malloc (64);
> + char* p = (char* ) malloc (64);
> + char* q = (char* ) malloc (64);
> +
> + __builtin_memset (pd_dst, 0, 64);
> + __builtin_memset (ps_dst, 0, 64);
> + __builtin_memset (epi64_dst, 0, 64);
> + __builtin_memset (epi32_dst, 0, 64);
> + __builtin_memset (epi16_dst, 0, 64);
> + __builtin_memset (epi8_dst, 0, 64);
> +
> + for (int i = 0; i != 64; i++)
> + {
> + p[i] = i; /* source image: byte k holds k */
> + q[i] = (i + 32) % 64; /* expected image: p rotated by 32 bytes, i.e. halves swapped */
> + }
> + __builtin_memcpy (pd_src, p, 64);
> + __builtin_memcpy (ps_src, p, 64);
> + __builtin_memcpy (epi64_src, p, 64);
> + __builtin_memcpy (epi32_src, p, 64);
> + __builtin_memcpy (epi16_src, p, 64);
> + __builtin_memcpy (epi8_src, p, 64);
> +
> + __builtin_memcpy (pd_exp, q, 64);
> + __builtin_memcpy (ps_exp, q, 64);
> + __builtin_memcpy (epi64_exp, q, 64);
> + __builtin_memcpy (epi32_exp, q, 64);
> + __builtin_memcpy (epi16_exp, q, 64);
> + __builtin_memcpy (epi8_exp, q, 64);
> +
> + foo_pd (pd_dst, pd_src);
> + foo_ps (ps_dst, ps_src);
> + foo_epi64 (epi64_dst, epi64_src);
> + foo_epi32 (epi32_dst, epi32_src);
> + foo_epi16 (epi16_dst, epi16_src);
> + foo_epi8 (epi8_dst, epi8_src);
> +
> + if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> new file mode 100644
> index 00000000000..9ce4e6dd5c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{ /* a[0..7] = b[8..15] and a[8..15] = b[0..7]: swaps the two halves of a sixteen-element block. */
> + a[0] = b[8];
> + a[1] = b[9];
> + a[2] = b[10];
> + a[3] = b[11];
> + a[4] = b[12];
> + a[5] = b[13];
> + a[6] = b[14];
> + a[7] = b[15];
> + a[8] = b[0];
> + a[9] = b[1];
> + a[10] = b[2];
> + a[11] = b[3];
> + a[12] = b[4];
> + a[13] = b[5];
> + a[14] = b[6];
> + a[15] = b[7];
> +}
> +
> +void
> +do_test (void)
> +{ /* Runs foo_ph on a 64-byte pattern and aborts unless the result equals the pattern with its 32-byte halves swapped. */
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> + char* p = (char* ) malloc (64);
> + char* q = (char* ) malloc (64);
> +
> + __builtin_memset (ph_dst, 0, 64);
> +
> + for (int i = 0; i != 64; i++)
> + {
> + p[i] = i; /* source image: byte k holds k */
> + q[i] = (i + 32) % 64; /* expected image: p rotated by 32 bytes */
> + }
> + __builtin_memcpy (ph_src, p, 64);
> +
> + __builtin_memcpy (ph_exp, q, 64);
> +
> + foo_ph (ph_dst, ph_src);
> +
> + if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> new file mode 100644
> index 00000000000..65a90d03684
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> @@ -0,0 +1,115 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{ /* a[i] = b[3 - i] for i in 0..3: reverses the element order. */
> + a[0] = b[3];
> + a[1] = b[2];
> + a[2] = b[1];
> + a[3] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{ /* a[i] = b[7 - i] for i in 0..7: reverses the element order. */
> + a[0] = b[7];
> + a[1] = b[6];
> + a[2] = b[5];
> + a[3] = b[4];
> + a[4] = b[3];
> + a[5] = b[2];
> + a[6] = b[1];
> + a[7] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{ /* a[i] = b[3 - i] for i in 0..3: reverses the element order. */
> + a[0] = b[3];
> + a[1] = b[2];
> + a[2] = b[1];
> + a[3] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{ /* a[i] = b[7 - i] for i in 0..7: reverses the element order. */
> + a[0] = b[7];
> + a[1] = b[6];
> + a[2] = b[5];
> + a[3] = b[4];
> + a[4] = b[3];
> + a[5] = b[2];
> + a[6] = b[1];
> + a[7] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{ /* a[i] = b[15 - i] for i in 0..15: reverses the element order. */
> + a[0] = b[15];
> + a[1] = b[14];
> + a[2] = b[13];
> + a[3] = b[12];
> + a[4] = b[11];
> + a[5] = b[10];
> + a[6] = b[9];
> + a[7] = b[8];
> + a[8] = b[7];
> + a[9] = b[6];
> + a[10] = b[5];
> + a[11] = b[4];
> + a[12] = b[3];
> + a[13] = b[2];
> + a[14] = b[1];
> + a[15] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{ /* a[i] = b[31 - i] for i in 0..31: reverses the element order. */
> + a[0] = b[31];
> + a[1] = b[30];
> + a[2] = b[29];
> + a[3] = b[28];
> + a[4] = b[27];
> + a[5] = b[26];
> + a[6] = b[25];
> + a[7] = b[24];
> + a[8] = b[23];
> + a[9] = b[22];
> + a[10] = b[21];
> + a[11] = b[20];
> + a[12] = b[19];
> + a[13] = b[18];
> + a[14] = b[17];
> + a[15] = b[16];
> + a[16] = b[15];
> + a[17] = b[14];
> + a[18] = b[13];
> + a[19] = b[12];
> + a[20] = b[11];
> + a[21] = b[10];
> + a[22] = b[9];
> + a[23] = b[8];
> + a[24] = b[7];
> + a[25] = b[6];
> + a[26] = b[5];
> + a[27] = b[4];
> + a[28] = b[3];
> + a[29] = b[2];
> + a[30] = b[1];
> + a[31] = b[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> new file mode 100644
> index 00000000000..1c5bb020939
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> @@ -0,0 +1,157 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-6a.c"
> +
> +void
> +avx2_test (void)
> +{ /* Runs every foo_* on a 64-byte pattern and aborts unless each result equals the pattern reversed in element-sized chunks. */
> + _Complex double* pd_src = (_Complex double*) malloc (64);
> + _Complex double* pd_dst = (_Complex double*) malloc (64);
> + _Complex double* pd_exp = (_Complex double*) malloc (64);
> + _Complex float* ps_src = (_Complex float*) malloc (64);
> + _Complex float* ps_dst = (_Complex float*) malloc (64);
> + _Complex float* ps_exp = (_Complex float*) malloc (64);
> + _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> + _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> + _Complex int* epi32_src = (_Complex int*) malloc (64);
> + _Complex int* epi32_dst = (_Complex int*) malloc (64);
> + _Complex int* epi32_exp = (_Complex int*) malloc (64);
> + _Complex short* epi16_src = (_Complex short*) malloc (64);
> + _Complex short* epi16_dst = (_Complex short*) malloc (64);
> + _Complex short* epi16_exp = (_Complex short*) malloc (64);
> + _Complex char* epi8_src = (_Complex char*) malloc (64);
> + _Complex char* epi8_dst = (_Complex char*) malloc (64);
> + _Complex char* epi8_exp = (_Complex char*) malloc (64);
> + char* p = (char* ) malloc (64);
> + char* q = (char* ) malloc (64);
> +
> + __builtin_memset (pd_dst, 0, 64);
> + __builtin_memset (ps_dst, 0, 64);
> + __builtin_memset (epi64_dst, 0, 64);
> + __builtin_memset (epi32_dst, 0, 64);
> + __builtin_memset (epi16_dst, 0, 64);
> + __builtin_memset (epi8_dst, 0, 64);
> +
> + for (int i = 0; i != 64; i++)
> + p[i] = i; /* source image: byte k holds k */
> +
> + __builtin_memcpy (pd_src, p, 64);
> + __builtin_memcpy (ps_src, p, 64);
> + __builtin_memcpy (epi64_src, p, 64);
> + __builtin_memcpy (epi32_src, p, 64);
> + __builtin_memcpy (epi16_src, p, 64);
> + __builtin_memcpy (epi8_src, p, 64);
> +
> +
> + for (int i = 0; i != 16; i++) /* q = p with its four 16-byte chunks reversed (pd and epi64 expectation) */
> + {
> + q[i] = i + 48;
> + q[i + 16] = i + 32;
> + q[i + 32] = i + 16;
> + q[i + 48] = i;
> + }
> +
> + __builtin_memcpy (pd_exp, q, 64);
> + __builtin_memcpy (epi64_exp, q, 64);
> +
> + for (int i = 0; i != 8; i++) /* q = p with its eight 8-byte chunks reversed (ps and epi32 expectation) */
> + {
> + q[i] = i + 56;
> + q[i + 8] = i + 48;
> + q[i + 16] = i + 40;
> + q[i + 24] = i + 32;
> + q[i + 32] = i + 24;
> + q[i + 40] = i + 16;
> + q[i + 48] = i + 8;
> + q[i + 56] = i;
> + }
> +
> + __builtin_memcpy (ps_exp, q, 64);
> + __builtin_memcpy (epi32_exp, q, 64);
> +
> + for (int i = 0; i != 4; i++) /* q = p with its sixteen 4-byte chunks reversed (epi16 expectation) */
> + {
> + q[i] = i + 60;
> + q[i + 4] = i + 56;
> + q[i + 8] = i + 52;
> + q[i + 12] = i + 48;
> + q[i + 16] = i + 44;
> + q[i + 20] = i + 40;
> + q[i + 24] = i + 36;
> + q[i + 28] = i + 32;
> + q[i + 32] = i + 28;
> + q[i + 36] = i + 24;
> + q[i + 40] = i + 20;
> + q[i + 44] = i + 16;
> + q[i + 48] = i + 12;
> + q[i + 52] = i + 8;
> + q[i + 56] = i + 4;
> + q[i + 60] = i;
> + }
> +
> + __builtin_memcpy (epi16_exp, q, 64);
> +
> + for (int i = 0; i != 2; i++) /* q = p with its thirty-two 2-byte chunks reversed (epi8 expectation) */
> + {
> + q[i] = i + 62;
> + q[i + 2] = i + 60;
> + q[i + 4] = i + 58;
> + q[i + 6] = i + 56;
> + q[i + 8] = i + 54;
> + q[i + 10] = i + 52;
> + q[i + 12] = i + 50;
> + q[i + 14] = i + 48;
> + q[i + 16] = i + 46;
> + q[i + 18] = i + 44;
> + q[i + 20] = i + 42;
> + q[i + 22] = i + 40;
> + q[i + 24] = i + 38;
> + q[i + 26] = i + 36;
> + q[i + 28] = i + 34;
> + q[i + 30] = i + 32;
> + q[i + 32] = i + 30;
> + q[i + 34] = i + 28;
> + q[i + 36] = i + 26;
> + q[i + 38] = i + 24;
> + q[i + 40] = i + 22;
> + q[i + 42] = i + 20;
> + q[i + 44] = i + 18;
> + q[i + 46] = i + 16;
> + q[i + 48] = i + 14;
> + q[i + 50] = i + 12;
> + q[i + 52] = i + 10;
> + q[i + 54] = i + 8;
> + q[i + 56] = i + 6;
> + q[i + 58] = i + 4;
> + q[i + 60] = i + 2;
> + q[i + 62] = i;
> + }
> + __builtin_memcpy (epi8_exp, q, 64);
> +
> + foo_pd (pd_dst, pd_src);
> + foo_ps (ps_dst, ps_src);
> + foo_epi64 (epi64_dst, epi64_src);
> + foo_epi32 (epi32_dst, epi32_src);
> + foo_epi16 (epi16_dst, epi16_src);
> + foo_epi8 (epi8_dst, epi8_src);
> +
> + if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> new file mode 100644
> index 00000000000..b859d884a7f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{ /* a[i] = b[15 - i] for i in 0..15: reverses the element order. */
> + a[0] = b[15];
> + a[1] = b[14];
> + a[2] = b[13];
> + a[3] = b[12];
> + a[4] = b[11];
> + a[5] = b[10];
> + a[6] = b[9];
> + a[7] = b[8];
> + a[8] = b[7];
> + a[9] = b[6];
> + a[10] = b[5];
> + a[11] = b[4];
> + a[12] = b[3];
> + a[13] = b[2];
> + a[14] = b[1];
> + a[15] = b[0];
> +}
> +
> +void
> +do_test (void)
> +{ /* Runs foo_ph on a 64-byte pattern and aborts unless the result equals the pattern with its 4-byte chunks reversed. */
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> + char* p = (char* ) malloc (64);
> + char* q = (char* ) malloc (64);
> +
> + __builtin_memset (ph_dst, 0, 64);
> +
> + for (int i = 0; i != 64; i++)
> + p[i] = i; /* source image: byte k holds k */
> +
> + __builtin_memcpy (ph_src, p, 64);
> +
> + for (int i = 0; i != 4; i++) /* q = p with its sixteen 4-byte chunks in reverse order */
> + {
> + q[i] = i + 60;
> + q[i + 4] = i + 56;
> + q[i + 8] = i + 52;
> + q[i + 12] = i + 48;
> + q[i + 16] = i + 44;
> + q[i + 20] = i + 40;
> + q[i + 24] = i + 36;
> + q[i + 28] = i + 32;
> + q[i + 32] = i + 28;
> + q[i + 36] = i + 24;
> + q[i + 40] = i + 20;
> + q[i + 44] = i + 16;
> + q[i + 48] = i + 12;
> + q[i + 52] = i + 8;
> + q[i + 56] = i + 4;
> + q[i + 60] = i;
> + }
> +
> + __builtin_memcpy (ph_exp, q, 64);
> +
> + foo_ph (ph_dst, ph_src);
> +
> + if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> new file mode 100644
> index 00000000000..2ea01fac927
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char b)
> +{ /* Stores the by-value complex b into each of a[0..N-1]. */
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> new file mode 100644
> index 00000000000..26482cc10f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-7a.c"
> +
> +void
> +avx_test (void)
> +{ /* Runtime check for pr106010-7a.c: each foo_* must store one complex value into all N elements of its destination. */
> + _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> + _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> + _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> + _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> + _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> + _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> + char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> + for (int i = 0; i != 2 * N * sizeof (double); i++)
> + p_init[i] = i % 2 + 3; /* two-byte repeating pattern, so every element of each *_src is bytewise identical */
> +
> + memcpy (pd_src, p_init, 2 * N * sizeof (double));
> + memcpy (ps_src, p_init, 2 * N * sizeof (float)); /* seed the reference buffers; the destinations stay zeroed until foo_* runs */
> + memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> + memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> + memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> + memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> +
> + foo_pd (pd_dst, pd_src[0]);
> + foo_ps (ps_dst, ps_src[0]);
> + foo_epi64 (epi64_dst, epi64_src[0]);
> + foo_epi32 (epi32_dst, epi32_src[0]);
> + foo_epi16 (epi16_dst, epi16_src[0]);
> + foo_epi8 (epi8_dst, epi8_src[0]);
> + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* dst must now equal the uniform src image */
> + __builtin_abort ();
> + if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> + __builtin_abort ();
> + if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> + __builtin_abort ();
> +
> + return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> new file mode 100644
> index 00000000000..7f4056a5ecc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b;
> +}
> +
> +static void
> +do_test (void)
> +{
> + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> + char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> + for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> + p_init[i] = i % 2 + 3;
> +
> + memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> + foo_ph (ph_dst, ph_src[0]);
> + if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> + __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> new file mode 100644
> index 00000000000..11054b60d30
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1.0 + 2.0i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1.0f + 2.0fi;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1 + 2i;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> new file mode 100644
> index 00000000000..6bb0073b691
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-8a.c"
> +
> +void
> +avx_test (void)
> +{
> + _Complex double pd_src = 1.0 + 2.0i;
> + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> + _Complex float ps_src = 1.0 + 2.0i;
> + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> + _Complex long long epi64_src = 1 + 2i;
> + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> + _Complex int epi32_src = 1 + 2i;
> + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> + _Complex short epi16_src = 1 + 2i;
> + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> + _Complex char epi8_src = 1 + 2i;
> + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +
> + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> + foo_pd (pd_dst);
> + foo_ps (ps_dst);
> + foo_epi64 (epi64_dst);
> + foo_epi32 (epi32_dst);
> + foo_epi16 (epi16_dst);
> + foo_epi8 (epi8_dst);
> + for (int i = 0 ; i != N; i++)
> + {
> + if (pd_dst[i] != pd_src)
> + __builtin_abort ();
> + if (ps_dst[i] != ps_src)
> + __builtin_abort ();
> + if (epi64_dst[i] != epi64_src)
> + __builtin_abort ();
> + if (epi32_dst[i] != epi32_src)
> + __builtin_abort ();
> + if (epi16_dst[i] != epi16_src)
> + __builtin_abort ();
> + if (epi8_dst[i] != epi8_src)
> + __builtin_abort ();
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> new file mode 100644
> index 00000000000..61ae131829d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> @@ -0,0 +1,38 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = 1.0f16 + 2.0f16i;
> +}
> +
> +static void
> +do_test (void)
> +{
> + _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +
> + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> + foo_ph (ph_dst);
> + for (int i = 0; i != N; i++)
> + {
> + if (ph_dst[i] != ph_src)
> + __builtin_abort ();
> + }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> new file mode 100644
> index 00000000000..e922f7b5400
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> @@ -0,0 +1,89 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +
> +typedef struct { _Complex double c; double a1; double a2;}
> + cdf;
> +typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
> + cdf2;
> +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
> + cdf3;
> +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
> + cdf4;
> +
> +#define N 100
> +/* VMAT_ELEMENTWISE. */
> +void
> +__attribute__((noipa))
> +foo (cdf* a, cdf* __restrict b)
> +{
> + for (int i = 0; i < N; ++i)
> + {
> + a[i].c = b[i].c;
> + a[i].a1 = b[i].a1;
> + a[i].a2 = b[i].a2;
> + }
> +}
> +
> +/* VMAT_CONTIGUOUS_PERMUTE. */
> +void
> +__attribute__((noipa))
> +foo1 (cdf2* a, cdf2* __restrict b)
> +{
> + for (int i = 0; i < N; ++i)
> + {
> + a[i].c = b[i].c;
> + a[i].a1 = b[i].a1;
> + a[i].a2 = b[i].a2;
> + a[i].a3 = b[i].a3;
> + a[i].a4 = b[i].a4;
> + }
> +}
> +
> +/* VMAT_CONTIGUOUS. */
> +void
> +__attribute__((noipa))
> +foo2 (cdf3* a, cdf3* __restrict b)
> +{
> + for (int i = 0; i < N; ++i)
> + {
> + a[i].c1 = b[i].c1;
> + a[i].c2 = b[i].c2;
> + a[i].a1 = b[i].a1;
> + a[i].a2 = b[i].a2;
> + a[i].a3 = b[i].a3;
> + a[i].a4 = b[i].a4;
> + }
> +}
> +
> +/* VMAT_STRIDED_SLP. */
> +void
> +__attribute__((noipa))
> +foo3 (cdf4* a, cdf4* __restrict b)
> +{
> + for (int i = 0; i < N; ++i)
> + {
> + a[i].c1 = b[i].c1;
> + a[i].c2 = b[i].c2;
> + a[i].a1 = b[i].a1;
> + a[i].a2 = b[i].a2;
> + }
> +}
> +
> +/* VMAT_CONTIGUOUS_REVERSE. */
> +void
> +__attribute__((noipa))
> +foo4 (_Complex double* a, _Complex double* __restrict b)
> +{
> + for (int i = 0; i != N; i++)
> + a[i] = b[N-i-1];
> +}
> +
> +/* VMAT_CONTIGUOUS_DOWN. */
> +void
> +__attribute__((noipa))
> +foo5 (_Complex double* a, _Complex double* __restrict b)
> +{
> + for (int i = 0; i != N; i++)
> + a[N-i-1] = b[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> new file mode 100644
> index 00000000000..e220445e6e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> @@ -0,0 +1,90 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include <string.h>
> +#include "sse2-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +sse2_test (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> + char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> + for (int i = 0; i != N * sizeof (cdf3); i++)
> + p_init[i] = i;
> +
> + memcpy (cdf_src, p_init, N * sizeof (cdf));
> + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> + for (int i = 0; i != 2 * N * sizeof (double); i++)
> + p_init[i] = i % 16;
> + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> + foo (cdf_dst, cdf_src);
> + foo1 (cdf2_dst, cdf2_src);
> + foo2 (cdf3_dst, cdf3_src);
> + foo3 (cdf4_dst, cdf4_src);
> + foo4 (pd_dst, pd_src);
> + foo5 (pd_dst2, pd_src2);
> + for (int i = 0; i != N; i++)
> + {
> + p_init[(N - i - 1) * 16] = i * 16;
> + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> + }
> + memcpy (pd_src, p_init, N * 16);
> +
> + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> + __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> new file mode 100644
> index 00000000000..ff51f6195b7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> @@ -0,0 +1,90 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include <string.h>
> +#include "avx2-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +avx2_test (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> + char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> + for (int i = 0; i != N * sizeof (cdf3); i++)
> + p_init[i] = i;
> +
> + memcpy (cdf_src, p_init, N * sizeof (cdf));
> + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> + for (int i = 0; i != 2 * N * sizeof (double); i++)
> + p_init[i] = i % 16;
> + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> + foo (cdf_dst, cdf_src);
> + foo1 (cdf2_dst, cdf2_src);
> + foo2 (cdf3_dst, cdf3_src);
> + foo3 (cdf4_dst, cdf4_src);
> + foo4 (pd_dst, pd_src);
> + foo5 (pd_dst2, pd_src2);
> + for (int i = 0; i != N; i++)
> + {
> + p_init[(N - i - 1) * 16] = i * 16;
> + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> + }
> + memcpy (pd_src, p_init, N * 16);
> +
> + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> + __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> new file mode 100644
> index 00000000000..d4d8f1dd722
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> @@ -0,0 +1,92 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include <string.h>
> +#include <stdlib.h>
> +#define AVX512F
> +#include "avx512-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +test_512 (void)
> +{
> + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> + char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> + for (int i = 0; i != N * sizeof (cdf3); i++)
> + p_init[i] = i;
> +
> + memcpy (cdf_src, p_init, N * sizeof (cdf));
> + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> + for (int i = 0; i != 2 * N * sizeof (double); i++)
> + p_init[i] = i % 16;
> + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> + foo (cdf_dst, cdf_src);
> + foo1 (cdf2_dst, cdf2_src);
> + foo2 (cdf3_dst, cdf3_src);
> + foo3 (cdf4_dst, cdf4_src);
> + foo4 (pd_dst, pd_src);
> + foo5 (pd_dst2, pd_src2);
> + for (int i = 0; i != N; i++)
> + {
> + p_init[(N - i - 1) * 16] = i * 16;
> + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> + }
> + memcpy (pd_src, p_init, N * 16);
> +
> + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> + __builtin_abort ();
> +
> + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> + __builtin_abort ();
> +}
> diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> index 61950a0f099..ea9df6114a1 100644
> --- a/gcc/tree-complex.cc
> +++ b/gcc/tree-complex.cc
> @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
> break;
>
> default:
> + /* When expand_complex_move would trigger, make sure we
> + perform lowering even when there is no actual complex
> + operation. This helps consistency and vectorization. */
> + if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> + saw_a_complex_op = true;
> break;
> }
>
> @@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
> update_complex_assignment (gsi, r, i);
> }
> }
> - else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
> + else if (rhs
> + && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
> + && !TREE_SIDE_EFFECTS (lhs))
> {
> tree x;
> gimple *t;
> --
> 2.18.1
>
On Wed, Jul 20, 2022 at 4:00 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, Jul 20, 2022 at 4:46 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > > My original comments still stand (it feels like this should be more generic).
> > > Can we go the way lowering complex loads/stores first? A large part
> > > of the testcases
> > > added by the patch should pass after that.
> >
> > This is the patch as suggested, one additional change is handling COMPLEX_CST
> > for rhs. And it will enable vectorization for pr106010-8a.c.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> OK.
>
> Are there any cases left that your vectorizer patch handles but this one does not?
No.
>
> Thanks,
> Richard.
>
> > 2022-07-20 Richard Biener <richard.guenther@gmail.com>
> > Hongtao Liu <hongtao.liu@intel.com>
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/106010
> > * tree-complex.cc (init_dont_simulate_again): Lower complex
> > type move.
> > (expand_complex_move): Also expand COMPLEX_CST for rhs.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr106010-1a.c: New test.
> > * gcc.target/i386/pr106010-1b.c: New test.
> > * gcc.target/i386/pr106010-1c.c: New test.
> > * gcc.target/i386/pr106010-2a.c: New test.
> > * gcc.target/i386/pr106010-2b.c: New test.
> > * gcc.target/i386/pr106010-2c.c: New test.
> > * gcc.target/i386/pr106010-3a.c: New test.
> > * gcc.target/i386/pr106010-3b.c: New test.
> > * gcc.target/i386/pr106010-3c.c: New test.
> > * gcc.target/i386/pr106010-4a.c: New test.
> > * gcc.target/i386/pr106010-4b.c: New test.
> > * gcc.target/i386/pr106010-4c.c: New test.
> > * gcc.target/i386/pr106010-5a.c: New test.
> > * gcc.target/i386/pr106010-5b.c: New test.
> > * gcc.target/i386/pr106010-5c.c: New test.
> > * gcc.target/i386/pr106010-6a.c: New test.
> > * gcc.target/i386/pr106010-6b.c: New test.
> > * gcc.target/i386/pr106010-6c.c: New test.
> > * gcc.target/i386/pr106010-7a.c: New test.
> > * gcc.target/i386/pr106010-7b.c: New test.
> > * gcc.target/i386/pr106010-7c.c: New test.
> > * gcc.target/i386/pr106010-8a.c: New test.
> > * gcc.target/i386/pr106010-8b.c: New test.
> > * gcc.target/i386/pr106010-8c.c: New test.
> > * gcc.target/i386/pr106010-9a.c: New test.
> > * gcc.target/i386/pr106010-9b.c: New test.
> > * gcc.target/i386/pr106010-9c.c: New test.
> > * gcc.target/i386/pr106010-9d.c: New test.
> > ---
> > gcc/testsuite/gcc.target/i386/pr106010-1a.c | 58 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-1b.c | 63 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-1c.c | 41 +++++
> > gcc/testsuite/gcc.target/i386/pr106010-2a.c | 82 ++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-2b.c | 62 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-2c.c | 47 ++++++
> > gcc/testsuite/gcc.target/i386/pr106010-3a.c | 80 ++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-3c.c | 69 +++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-4b.c | 67 +++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-4c.c | 54 +++++++
> > gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-5b.c | 80 ++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-5c.c | 62 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-6c.c | 80 ++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-7a.c | 58 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-7b.c | 63 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-7c.c | 41 +++++
> > gcc/testsuite/gcc.target/i386/pr106010-8a.c | 58 ++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-8b.c | 53 +++++++
> > gcc/testsuite/gcc.target/i386/pr106010-8c.c | 38 +++++
> > gcc/testsuite/gcc.target/i386/pr106010-9a.c | 89 +++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-9b.c | 90 +++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-9c.c | 90 +++++++++++
> > gcc/testsuite/gcc.target/i386/pr106010-9d.c | 92 ++++++++++++
> > gcc/tree-complex.cc | 9 +-
> > 29 files changed, 2141 insertions(+), 1 deletion(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > new file mode 100644
> > index 00000000000..b608f484934
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > new file mode 100644
> > index 00000000000..0f377c3a548
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-1a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > + _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > + _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > + _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > + _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > + char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +
> > + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > + for (int i = 0; i != 2 * N * sizeof (double); i++)
> > + p_init[i] = i;
> > +
> > + memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > + memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > + memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > + memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > + memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > + memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > + foo_pd (pd_dst, pd_src);
> > + foo_ps (ps_dst, ps_src);
> > + foo_epi64 (epi64_dst, epi64_src);
> > + foo_epi32 (epi32_dst, epi32_src);
> > + foo_epi16 (epi16_dst, epi16_src);
> > + foo_epi8 (epi8_dst, epi8_src);
> > + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > new file mode 100644
> > index 00000000000..f07e9fb2d3d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[i];
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > + char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > + for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > + p_init[i] = i;
> > +
> > + memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +
> > + foo_ph (ph_dst, ph_src);
> > + if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > + __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > new file mode 100644
> > index 00000000000..d2e2f8d4f43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > @@ -0,0 +1,82 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > + a[2] = b[2];
> > + a[3] = b[3];
> > +
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > + a[2] = b[2];
> > + a[3] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > + a[2] = b[2];
> > + a[3] = b[3];
> > + a[4] = b[4];
> > + a[5] = b[5];
> > + a[6] = b[6];
> > + a[7] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > + a[2] = b[2];
> > + a[3] = b[3];
> > + a[4] = b[4];
> > + a[5] = b[5];
> > + a[6] = b[6];
> > + a[7] = b[7];
> > + a[8] = b[8];
> > + a[9] = b[9];
> > + a[10] = b[10];
> > + a[11] = b[11];
> > + a[12] = b[12];
> > + a[13] = b[13];
> > + a[14] = b[14];
> > + a[15] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > new file mode 100644
> > index 00000000000..ac360752693
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-2a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (32);
> > + _Complex double* pd_dst = (_Complex double*) malloc (32);
> > + _Complex float* ps_src = (_Complex float*) malloc (32);
> > + _Complex float* ps_dst = (_Complex float*) malloc (32);
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > + _Complex int* epi32_src = (_Complex int*) malloc (32);
> > + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > + _Complex short* epi16_src = (_Complex short*) malloc (32);
> > + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > + _Complex char* epi8_src = (_Complex char*) malloc (32);
> > + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > + char* p = (char* ) malloc (32);
> > +
> > + __builtin_memset (pd_dst, 0, 32);
> > + __builtin_memset (ps_dst, 0, 32);
> > + __builtin_memset (epi64_dst, 0, 32);
> > + __builtin_memset (epi32_dst, 0, 32);
> > + __builtin_memset (epi16_dst, 0, 32);
> > + __builtin_memset (epi8_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > + __builtin_memcpy (pd_src, p, 32);
> > + __builtin_memcpy (ps_src, p, 32);
> > + __builtin_memcpy (epi64_src, p, 32);
> > + __builtin_memcpy (epi32_src, p, 32);
> > + __builtin_memcpy (epi16_src, p, 32);
> > + __builtin_memcpy (epi8_src, p, 32);
> > +
> > + foo_pd (pd_dst, pd_src);
> > + foo_ps (ps_dst, ps_src);
> > + foo_epi64 (epi64_dst, epi64_src);
> > + foo_epi32 (epi32_dst, epi32_src);
> > + foo_epi16 (epi16_dst, epi16_src);
> > + foo_epi8 (epi8_dst, epi8_src);
> > + if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > new file mode 100644
> > index 00000000000..a002f209ec9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > @@ -0,0 +1,47 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > + a[0] = b[0];
> > + a[1] = b[1];
> > + a[2] = b[2];
> > + a[3] = b[3];
> > + a[4] = b[4];
> > + a[5] = b[5];
> > + a[6] = b[6];
> > + a[7] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > + char* p = (char* ) malloc (32);
> > +
> > + __builtin_memset (ph_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > + __builtin_memcpy (ph_src, p, 32);
> > +
> > + foo_ph (ph_dst, ph_src);
> > + if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > new file mode 100644
> > index 00000000000..c1b64b56b1c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + a[0] = b[1];
> > + a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > + a[0] = b[1];
> > + a[1] = b[0];
> > + a[2] = b[3];
> > + a[3] = b[2];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > + a[0] = b[1];
> > + a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > + a[0] = b[3];
> > + a[1] = b[2];
> > + a[2] = b[1];
> > + a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > + a[0] = b[7];
> > + a[1] = b[6];
> > + a[2] = b[5];
> > + a[3] = b[4];
> > + a[4] = b[3];
> > + a[5] = b[2];
> > + a[6] = b[1];
> > + a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > + a[0] = b[7];
> > + a[1] = b[6];
> > + a[2] = b[5];
> > + a[3] = b[4];
> > + a[4] = b[3];
> > + a[5] = b[2];
> > + a[6] = b[1];
> > + a[7] = b[0];
> > + a[8] = b[15];
> > + a[9] = b[14];
> > + a[10] = b[13];
> > + a[11] = b[12];
> > + a[12] = b[11];
> > + a[13] = b[10];
> > + a[14] = b[9];
> > + a[15] = b[8];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > new file mode 100644
> > index 00000000000..e4fa3f3a541
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > @@ -0,0 +1,126 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-3a.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (32);
> > + _Complex double* pd_dst = (_Complex double*) malloc (32);
> > + _Complex double* pd_exp = (_Complex double*) malloc (32);
> > + _Complex float* ps_src = (_Complex float*) malloc (32);
> > + _Complex float* ps_dst = (_Complex float*) malloc (32);
> > + _Complex float* ps_exp = (_Complex float*) malloc (32);
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > + _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > + _Complex int* epi32_src = (_Complex int*) malloc (32);
> > + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > + _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > + _Complex short* epi16_src = (_Complex short*) malloc (32);
> > + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > + _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > + _Complex char* epi8_src = (_Complex char*) malloc (32);
> > + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > + _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > + char* p = (char* ) malloc (32);
> > + char* q = (char* ) malloc (32);
> > +
> > + __builtin_memset (pd_dst, 0, 32);
> > + __builtin_memset (ps_dst, 0, 32);
> > + __builtin_memset (epi64_dst, 0, 32);
> > + __builtin_memset (epi32_dst, 0, 32);
> > + __builtin_memset (epi16_dst, 0, 32);
> > + __builtin_memset (epi8_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > + __builtin_memcpy (pd_src, p, 32);
> > + __builtin_memcpy (ps_src, p, 32);
> > + __builtin_memcpy (epi64_src, p, 32);
> > + __builtin_memcpy (epi32_src, p, 32);
> > + __builtin_memcpy (epi16_src, p, 32);
> > + __builtin_memcpy (epi8_src, p, 32);
> > +
> > + for (int i = 0; i != 16; i++)
> > + {
> > + p[i] = i + 16;
> > + p[i + 16] = i;
> > + }
> > + __builtin_memcpy (pd_exp, p, 32);
> > + __builtin_memcpy (epi64_exp, p, 32);
> > +
> > + for (int i = 0; i != 8; i++)
> > + {
> > + p[i] = i + 8;
> > + p[i + 8] = i;
> > + p[i + 16] = i + 24;
> > + p[i + 24] = i + 16;
> > + q[i] = i + 24;
> > + q[i + 8] = i + 16;
> > + q[i + 16] = i + 8;
> > + q[i + 24] = i;
> > + }
> > + __builtin_memcpy (ps_exp, p, 32);
> > + __builtin_memcpy (epi32_exp, q, 32);
> > +
> > +
> > + for (int i = 0; i != 4; i++)
> > + {
> > + q[i] = i + 28;
> > + q[i + 4] = i + 24;
> > + q[i + 8] = i + 20;
> > + q[i + 12] = i + 16;
> > + q[i + 16] = i + 12;
> > + q[i + 20] = i + 8;
> > + q[i + 24] = i + 4;
> > + q[i + 28] = i;
> > + }
> > + __builtin_memcpy (epi16_exp, q, 32);
> > +
> > + for (int i = 0; i != 2; i++)
> > + {
> > + q[i] = i + 14;
> > + q[i + 2] = i + 12;
> > + q[i + 4] = i + 10;
> > + q[i + 6] = i + 8;
> > + q[i + 8] = i + 6;
> > + q[i + 10] = i + 4;
> > + q[i + 12] = i + 2;
> > + q[i + 14] = i;
> > + q[i + 16] = i + 30;
> > + q[i + 18] = i + 28;
> > + q[i + 20] = i + 26;
> > + q[i + 22] = i + 24;
> > + q[i + 24] = i + 22;
> > + q[i + 26] = i + 20;
> > + q[i + 28] = i + 18;
> > + q[i + 30] = i + 16;
> > + }
> > + __builtin_memcpy (epi8_exp, q, 32);
> > +
> > + foo_pd (pd_dst, pd_src);
> > + foo_ps (ps_dst, ps_src);
> > + foo_epi64 (epi64_dst, epi64_src);
> > + foo_epi32 (epi32_dst, epi32_src);
> > + foo_epi16 (epi16_dst, epi16_src);
> > + foo_epi8 (epi8_dst, epi8_src);
> > + if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > new file mode 100644
> > index 00000000000..5a5a3d4b992
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > + a[0] = b[1];
> > + a[1] = b[0];
> > + a[2] = b[4];
> > + a[3] = b[3];
> > + a[4] = b[7];
> > + a[5] = b[6];
> > + a[6] = b[2];
> > + a[7] = b[5];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > + char* p = (char* ) malloc (32);
> > + char* q = (char* ) malloc (32);
> > +
> > + __builtin_memset (ph_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > + __builtin_memcpy (ph_src, p, 32);
> > +
> > + for (int i = 0; i != 4; i++)
> > + {
> > + p[i] = i + 4;
> > + p[i + 4] = i;
> > + p[i + 8] = i + 16;
> > + p[i + 12] = i + 12;
> > + p[i + 16] = i + 28;
> > + p[i + 20] = i + 24;
> > + p[i + 24] = i + 8;
> > + p[i + 28] = i + 20;
> > + q[i] = i + 28;
> > + q[i + 4] = i + 24;
> > + q[i + 8] = i + 20;
> > + q[i + 12] = i + 16;
> > + q[i + 16] = i + 12;
> > + q[i + 20] = i + 8;
> > + q[i + 24] = i + 4;
> > + q[i + 28] = i;
> > + }
> > + __builtin_memcpy (ph_exp, p, 32);
> > +
> > + foo_ph (ph_dst, ph_src);
> > + if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > new file mode 100644
> > index 00000000000..b7b0b532bb1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > @@ -0,0 +1,101 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a,
> > + _Complex double b1,
> > + _Complex double b2)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a,
> > + _Complex float b1, _Complex float b2,
> > + _Complex float b3, _Complex float b4)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > + a[2] = b3;
> > + a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a,
> > + _Complex long long b1,
> > + _Complex long long b2)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a,
> > + _Complex int b1, _Complex int b2,
> > + _Complex int b3, _Complex int b4)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > + a[2] = b3;
> > + a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a,
> > + _Complex short b1, _Complex short b2,
> > + _Complex short b3, _Complex short b4,
> > + _Complex short b5, _Complex short b6,
> > + _Complex short b7,_Complex short b8)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > + a[2] = b3;
> > + a[3] = b4;
> > + a[4] = b5;
> > + a[5] = b6;
> > + a[6] = b7;
> > + a[7] = b8;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a,
> > + _Complex char b1, _Complex char b2,
> > + _Complex char b3, _Complex char b4,
> > + _Complex char b5, _Complex char b6,
> > + _Complex char b7,_Complex char b8,
> > + _Complex char b9, _Complex char b10,
> > + _Complex char b11, _Complex char b12,
> > + _Complex char b13, _Complex char b14,
> > + _Complex char b15,_Complex char b16)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > + a[2] = b3;
> > + a[3] = b4;
> > + a[4] = b5;
> > + a[5] = b6;
> > + a[6] = b7;
> > + a[7] = b8;
> > + a[8] = b9;
> > + a[9] = b10;
> > + a[10] = b11;
> > + a[11] = b12;
> > + a[12] = b13;
> > + a[13] = b14;
> > + a[14] = b15;
> > + a[15] = b16;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > new file mode 100644
> > index 00000000000..e2e79508c4b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > @@ -0,0 +1,67 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-4a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (32);
> > + _Complex double* pd_dst = (_Complex double*) malloc (32);
> > + _Complex float* ps_src = (_Complex float*) malloc (32);
> > + _Complex float* ps_dst = (_Complex float*) malloc (32);
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > + _Complex int* epi32_src = (_Complex int*) malloc (32);
> > + _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > + _Complex short* epi16_src = (_Complex short*) malloc (32);
> > + _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > + _Complex char* epi8_src = (_Complex char*) malloc (32);
> > + _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > + char* p = (char* ) malloc (32);
> > +
> > + __builtin_memset (pd_dst, 0, 32);
> > + __builtin_memset (ps_dst, 0, 32);
> > + __builtin_memset (epi64_dst, 0, 32);
> > + __builtin_memset (epi32_dst, 0, 32);
> > + __builtin_memset (epi16_dst, 0, 32);
> > + __builtin_memset (epi8_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > + __builtin_memcpy (pd_src, p, 32);
> > + __builtin_memcpy (ps_src, p, 32);
> > + __builtin_memcpy (epi64_src, p, 32);
> > + __builtin_memcpy (epi32_src, p, 32);
> > + __builtin_memcpy (epi16_src, p, 32);
> > + __builtin_memcpy (epi8_src, p, 32);
> > +
> > + foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > + foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > + foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > + foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > + foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > + epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > + foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > + epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > + epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > + epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > new file mode 100644
> > index 00000000000..8e02aefe3b5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > @@ -0,0 +1,54 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a,
> > + _Complex _Float16 b1, _Complex _Float16 b2,
> > + _Complex _Float16 b3, _Complex _Float16 b4,
> > + _Complex _Float16 b5, _Complex _Float16 b6,
> > + _Complex _Float16 b7,_Complex _Float16 b8)
> > +{
> > + a[0] = b1;
> > + a[1] = b2;
> > + a[2] = b3;
> > + a[3] = b4;
> > + a[4] = b5;
> > + a[5] = b6;
> > + a[6] = b7;
> > + a[7] = b8;
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +
> > + char* p = (char* ) malloc (32);
> > +
> > + __builtin_memset (ph_dst, 0, 32);
> > +
> > + for (int i = 0; i != 32; i++)
> > + p[i] = i;
> > +
> > + __builtin_memcpy (ph_src, p, 32);
> > +
> > + foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > + ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > +
> > + if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > + __builtin_abort ();
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > new file mode 100644
> > index 00000000000..9d4a6f9846b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > @@ -0,0 +1,117 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + a[0] = b[2];
> > + a[1] = b[3];
> > + a[2] = b[0];
> > + a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > + a[0] = b[4];
> > + a[1] = b[5];
> > + a[2] = b[6];
> > + a[3] = b[7];
> > + a[4] = b[0];
> > + a[5] = b[1];
> > + a[6] = b[2];
> > + a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > + a[0] = b[2];
> > + a[1] = b[3];
> > + a[2] = b[0];
> > + a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > + a[0] = b[4];
> > + a[1] = b[5];
> > + a[2] = b[6];
> > + a[3] = b[7];
> > + a[4] = b[0];
> > + a[5] = b[1];
> > + a[6] = b[2];
> > + a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > + a[0] = b[8];
> > + a[1] = b[9];
> > + a[2] = b[10];
> > + a[3] = b[11];
> > + a[4] = b[12];
> > + a[5] = b[13];
> > + a[6] = b[14];
> > + a[7] = b[15];
> > + a[8] = b[0];
> > + a[9] = b[1];
> > + a[10] = b[2];
> > + a[11] = b[3];
> > + a[12] = b[4];
> > + a[13] = b[5];
> > + a[14] = b[6];
> > + a[15] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > + a[0] = b[16];
> > + a[1] = b[17];
> > + a[2] = b[18];
> > + a[3] = b[19];
> > + a[4] = b[20];
> > + a[5] = b[21];
> > + a[6] = b[22];
> > + a[7] = b[23];
> > + a[8] = b[24];
> > + a[9] = b[25];
> > + a[10] = b[26];
> > + a[11] = b[27];
> > + a[12] = b[28];
> > + a[13] = b[29];
> > + a[14] = b[30];
> > + a[15] = b[31];
> > + a[16] = b[0];
> > + a[17] = b[1];
> > + a[18] = b[2];
> > + a[19] = b[3];
> > + a[20] = b[4];
> > + a[21] = b[5];
> > + a[22] = b[6];
> > + a[23] = b[7];
> > + a[24] = b[8];
> > + a[25] = b[9];
> > + a[26] = b[10];
> > + a[27] = b[11];
> > + a[28] = b[12];
> > + a[29] = b[13];
> > + a[30] = b[14];
> > + a[31] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > new file mode 100644
> > index 00000000000..d5c6ebeb5cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-5a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (64);
> > + _Complex double* pd_dst = (_Complex double*) malloc (64);
> > + _Complex double* pd_exp = (_Complex double*) malloc (64);
> > + _Complex float* ps_src = (_Complex float*) malloc (64);
> > + _Complex float* ps_dst = (_Complex float*) malloc (64);
> > + _Complex float* ps_exp = (_Complex float*) malloc (64);
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > + _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > + _Complex int* epi32_src = (_Complex int*) malloc (64);
> > + _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > + _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > + _Complex short* epi16_src = (_Complex short*) malloc (64);
> > + _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > + _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > + _Complex char* epi8_src = (_Complex char*) malloc (64);
> > + _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > + _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > + char* p = (char* ) malloc (64);
> > + char* q = (char* ) malloc (64);
> > +
> > + __builtin_memset (pd_dst, 0, 64);
> > + __builtin_memset (ps_dst, 0, 64);
> > + __builtin_memset (epi64_dst, 0, 64);
> > + __builtin_memset (epi32_dst, 0, 64);
> > + __builtin_memset (epi16_dst, 0, 64);
> > + __builtin_memset (epi8_dst, 0, 64);
> > +
> > + for (int i = 0; i != 64; i++)
> > + {
> > + p[i] = i;
> > + q[i] = (i + 32) % 64;
> > + }
> > + __builtin_memcpy (pd_src, p, 64);
> > + __builtin_memcpy (ps_src, p, 64);
> > + __builtin_memcpy (epi64_src, p, 64);
> > + __builtin_memcpy (epi32_src, p, 64);
> > + __builtin_memcpy (epi16_src, p, 64);
> > + __builtin_memcpy (epi8_src, p, 64);
> > +
> > + __builtin_memcpy (pd_exp, q, 64);
> > + __builtin_memcpy (ps_exp, q, 64);
> > + __builtin_memcpy (epi64_exp, q, 64);
> > + __builtin_memcpy (epi32_exp, q, 64);
> > + __builtin_memcpy (epi16_exp, q, 64);
> > + __builtin_memcpy (epi8_exp, q, 64);
> > +
> > + foo_pd (pd_dst, pd_src);
> > + foo_ps (ps_dst, ps_src);
> > + foo_epi64 (epi64_dst, epi64_src);
> > + foo_epi32 (epi32_dst, epi32_src);
> > + foo_epi16 (epi16_dst, epi16_src);
> > + foo_epi8 (epi8_dst, epi8_src);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > new file mode 100644
> > index 00000000000..9ce4e6dd5c0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > + a[0] = b[8];
> > + a[1] = b[9];
> > + a[2] = b[10];
> > + a[3] = b[11];
> > + a[4] = b[12];
> > + a[5] = b[13];
> > + a[6] = b[14];
> > + a[7] = b[15];
> > + a[8] = b[0];
> > + a[9] = b[1];
> > + a[10] = b[2];
> > + a[11] = b[3];
> > + a[12] = b[4];
> > + a[13] = b[5];
> > + a[14] = b[6];
> > + a[15] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > + char* p = (char* ) malloc (64);
> > + char* q = (char* ) malloc (64);
> > +
> > + __builtin_memset (ph_dst, 0, 64);
> > +
> > + for (int i = 0; i != 64; i++)
> > + {
> > + p[i] = i;
> > + q[i] = (i + 32) % 64;
> > + }
> > + __builtin_memcpy (ph_src, p, 64);
> > +
> > + __builtin_memcpy (ph_exp, q, 64);
> > +
> > + foo_ph (ph_dst, ph_src);
> > +
> > + if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > new file mode 100644
> > index 00000000000..65a90d03684
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > @@ -0,0 +1,115 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + a[0] = b[3];
> > + a[1] = b[2];
> > + a[2] = b[1];
> > + a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > + a[0] = b[7];
> > + a[1] = b[6];
> > + a[2] = b[5];
> > + a[3] = b[4];
> > + a[4] = b[3];
> > + a[5] = b[2];
> > + a[6] = b[1];
> > + a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > + a[0] = b[3];
> > + a[1] = b[2];
> > + a[2] = b[1];
> > + a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > + a[0] = b[7];
> > + a[1] = b[6];
> > + a[2] = b[5];
> > + a[3] = b[4];
> > + a[4] = b[3];
> > + a[5] = b[2];
> > + a[6] = b[1];
> > + a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > + a[0] = b[15];
> > + a[1] = b[14];
> > + a[2] = b[13];
> > + a[3] = b[12];
> > + a[4] = b[11];
> > + a[5] = b[10];
> > + a[6] = b[9];
> > + a[7] = b[8];
> > + a[8] = b[7];
> > + a[9] = b[6];
> > + a[10] = b[5];
> > + a[11] = b[4];
> > + a[12] = b[3];
> > + a[13] = b[2];
> > + a[14] = b[1];
> > + a[15] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > + a[0] = b[31];
> > + a[1] = b[30];
> > + a[2] = b[29];
> > + a[3] = b[28];
> > + a[4] = b[27];
> > + a[5] = b[26];
> > + a[6] = b[25];
> > + a[7] = b[24];
> > + a[8] = b[23];
> > + a[9] = b[22];
> > + a[10] = b[21];
> > + a[11] = b[20];
> > + a[12] = b[19];
> > + a[13] = b[18];
> > + a[14] = b[17];
> > + a[15] = b[16];
> > + a[16] = b[15];
> > + a[17] = b[14];
> > + a[18] = b[13];
> > + a[19] = b[12];
> > + a[20] = b[11];
> > + a[21] = b[10];
> > + a[22] = b[9];
> > + a[23] = b[8];
> > + a[24] = b[7];
> > + a[25] = b[6];
> > + a[26] = b[5];
> > + a[27] = b[4];
> > + a[28] = b[3];
> > + a[29] = b[2];
> > + a[30] = b[1];
> > + a[31] = b[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > new file mode 100644
> > index 00000000000..1c5bb020939
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > @@ -0,0 +1,157 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-6a.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (64);
> > + _Complex double* pd_dst = (_Complex double*) malloc (64);
> > + _Complex double* pd_exp = (_Complex double*) malloc (64);
> > + _Complex float* ps_src = (_Complex float*) malloc (64);
> > + _Complex float* ps_dst = (_Complex float*) malloc (64);
> > + _Complex float* ps_exp = (_Complex float*) malloc (64);
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > + _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > + _Complex int* epi32_src = (_Complex int*) malloc (64);
> > + _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > + _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > + _Complex short* epi16_src = (_Complex short*) malloc (64);
> > + _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > + _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > + _Complex char* epi8_src = (_Complex char*) malloc (64);
> > + _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > + _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > + char* p = (char* ) malloc (64);
> > + char* q = (char* ) malloc (64);
> > +
> > + __builtin_memset (pd_dst, 0, 64);
> > + __builtin_memset (ps_dst, 0, 64);
> > + __builtin_memset (epi64_dst, 0, 64);
> > + __builtin_memset (epi32_dst, 0, 64);
> > + __builtin_memset (epi16_dst, 0, 64);
> > + __builtin_memset (epi8_dst, 0, 64);
> > +
> > + for (int i = 0; i != 64; i++)
> > + p[i] = i;
> > +
> > + __builtin_memcpy (pd_src, p, 64);
> > + __builtin_memcpy (ps_src, p, 64);
> > + __builtin_memcpy (epi64_src, p, 64);
> > + __builtin_memcpy (epi32_src, p, 64);
> > + __builtin_memcpy (epi16_src, p, 64);
> > + __builtin_memcpy (epi8_src, p, 64);
> > +
> > +
> > + for (int i = 0; i != 16; i++)
> > + {
> > + q[i] = i + 48;
> > + q[i + 16] = i + 32;
> > + q[i + 32] = i + 16;
> > + q[i + 48] = i;
> > + }
> > +
> > + __builtin_memcpy (pd_exp, q, 64);
> > + __builtin_memcpy (epi64_exp, q, 64);
> > +
> > + for (int i = 0; i != 8; i++)
> > + {
> > + q[i] = i + 56;
> > + q[i + 8] = i + 48;
> > + q[i + 16] = i + 40;
> > + q[i + 24] = i + 32;
> > + q[i + 32] = i + 24;
> > + q[i + 40] = i + 16;
> > + q[i + 48] = i + 8;
> > + q[i + 56] = i;
> > + }
> > +
> > + __builtin_memcpy (ps_exp, q, 64);
> > + __builtin_memcpy (epi32_exp, q, 64);
> > +
> > + for (int i = 0; i != 4; i++)
> > + {
> > + q[i] = i + 60;
> > + q[i + 4] = i + 56;
> > + q[i + 8] = i + 52;
> > + q[i + 12] = i + 48;
> > + q[i + 16] = i + 44;
> > + q[i + 20] = i + 40;
> > + q[i + 24] = i + 36;
> > + q[i + 28] = i + 32;
> > + q[i + 32] = i + 28;
> > + q[i + 36] = i + 24;
> > + q[i + 40] = i + 20;
> > + q[i + 44] = i + 16;
> > + q[i + 48] = i + 12;
> > + q[i + 52] = i + 8;
> > + q[i + 56] = i + 4;
> > + q[i + 60] = i;
> > + }
> > +
> > + __builtin_memcpy (epi16_exp, q, 64);
> > +
> > + for (int i = 0; i != 2; i++)
> > + {
> > + q[i] = i + 62;
> > + q[i + 2] = i + 60;
> > + q[i + 4] = i + 58;
> > + q[i + 6] = i + 56;
> > + q[i + 8] = i + 54;
> > + q[i + 10] = i + 52;
> > + q[i + 12] = i + 50;
> > + q[i + 14] = i + 48;
> > + q[i + 16] = i + 46;
> > + q[i + 18] = i + 44;
> > + q[i + 20] = i + 42;
> > + q[i + 22] = i + 40;
> > + q[i + 24] = i + 38;
> > + q[i + 26] = i + 36;
> > + q[i + 28] = i + 34;
> > + q[i + 30] = i + 32;
> > + q[i + 32] = i + 30;
> > + q[i + 34] = i + 28;
> > + q[i + 36] = i + 26;
> > + q[i + 38] = i + 24;
> > + q[i + 40] = i + 22;
> > + q[i + 42] = i + 20;
> > + q[i + 44] = i + 18;
> > + q[i + 46] = i + 16;
> > + q[i + 48] = i + 14;
> > + q[i + 50] = i + 12;
> > + q[i + 52] = i + 10;
> > + q[i + 54] = i + 8;
> > + q[i + 56] = i + 6;
> > + q[i + 58] = i + 4;
> > + q[i + 60] = i + 2;
> > + q[i + 62] = i;
> > + }
> > + __builtin_memcpy (epi8_exp, q, 64);
> > +
> > + foo_pd (pd_dst, pd_src);
> > + foo_ps (ps_dst, ps_src);
> > + foo_epi64 (epi64_dst, epi64_src);
> > + foo_epi32 (epi32_dst, epi32_src);
> > + foo_epi16 (epi16_dst, epi16_src);
> > + foo_epi8 (epi8_dst, epi8_src);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > new file mode 100644
> > index 00000000000..b859d884a7f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > + a[0] = b[15];
> > + a[1] = b[14];
> > + a[2] = b[13];
> > + a[3] = b[12];
> > + a[4] = b[11];
> > + a[5] = b[10];
> > + a[6] = b[9];
> > + a[7] = b[8];
> > + a[8] = b[7];
> > + a[9] = b[6];
> > + a[10] = b[5];
> > + a[11] = b[4];
> > + a[12] = b[3];
> > + a[13] = b[2];
> > + a[14] = b[1];
> > + a[15] = b[0];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > + _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > + char* p = (char* ) malloc (64);
> > + char* q = (char* ) malloc (64);
> > +
> > + __builtin_memset (ph_dst, 0, 64);
> > +
> > + for (int i = 0; i != 64; i++)
> > + p[i] = i;
> > +
> > + __builtin_memcpy (ph_src, p, 64);
> > +
> > + for (int i = 0; i != 4; i++)
> > + {
> > + q[i] = i + 60;
> > + q[i + 4] = i + 56;
> > + q[i + 8] = i + 52;
> > + q[i + 12] = i + 48;
> > + q[i + 16] = i + 44;
> > + q[i + 20] = i + 40;
> > + q[i + 24] = i + 36;
> > + q[i + 28] = i + 32;
> > + q[i + 32] = i + 28;
> > + q[i + 36] = i + 24;
> > + q[i + 40] = i + 20;
> > + q[i + 44] = i + 16;
> > + q[i + 48] = i + 12;
> > + q[i + 52] = i + 8;
> > + q[i + 56] = i + 4;
> > + q[i + 60] = i;
> > + }
> > +
> > + __builtin_memcpy (ph_exp, q, 64);
> > +
> > + foo_ph (ph_dst, ph_src);
> > +
> > + if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > new file mode 100644
> > index 00000000000..2ea01fac927
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > new file mode 100644
> > index 00000000000..26482cc10f5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-7a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > + _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > + _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > + _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > + _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > + _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > + char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +
> > + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > + for (int i = 0; i != 2 * N * sizeof (double); i++)
> > + p_init[i] = i % 2 + 3;
> > +
> > + memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > + memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > + memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > + memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > + memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > + memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > + foo_pd (pd_dst, pd_src[0]);
> > + foo_ps (ps_dst, ps_src[0]);
> > + foo_epi64 (epi64_dst, epi64_src[0]);
> > + foo_epi32 (epi32_dst, epi32_src[0]);
> > + foo_epi16 (epi16_dst, epi16_src[0]);
> > + foo_epi8 (epi8_dst, epi8_src[0]);
> > + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > + __builtin_abort ();
> > + if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > + __builtin_abort ();
> > +
> > + return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > new file mode 100644
> > index 00000000000..7f4056a5ecc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b;
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > + _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > + char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > + for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > + p_init[i] = i % 2 + 3;
> > +
> > + memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +
> > + foo_ph (ph_dst, ph_src[0]);
> > + if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > + __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > new file mode 100644
> > index 00000000000..11054b60d30
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1.0 + 2.0i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1.0f + 2.0fi;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1 + 2i;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > new file mode 100644
> > index 00000000000..6bb0073b691
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-8a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > + _Complex double pd_src = 1.0 + 2.0i;
> > + _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > + _Complex float ps_src = 1.0 + 2.0i;
> > + _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > + _Complex long long epi64_src = 1 + 2i;
> > + _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > + _Complex int epi32_src = 1 + 2i;
> > + _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > + _Complex short epi16_src = 1 + 2i;
> > + _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > + _Complex char epi8_src = 1 + 2i;
> > + _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +
> > + __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > + __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > + __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > + __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > + __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > + __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > + foo_pd (pd_dst);
> > + foo_ps (ps_dst);
> > + foo_epi64 (epi64_dst);
> > + foo_epi32 (epi32_dst);
> > + foo_epi16 (epi16_dst);
> > + foo_epi8 (epi8_dst);
> > + for (int i = 0 ; i != N; i++)
> > + {
> > + if (pd_dst[i] != pd_src)
> > + __builtin_abort ();
> > + if (ps_dst[i] != ps_src)
> > + __builtin_abort ();
> > + if (epi64_dst[i] != epi64_src)
> > + __builtin_abort ();
> > + if (epi32_dst[i] != epi32_src)
> > + __builtin_abort ();
> > + if (epi16_dst[i] != epi16_src)
> > + __builtin_abort ();
> > + if (epi8_dst[i] != epi8_src)
> > + __builtin_abort ();
> > + }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > new file mode 100644
> > index 00000000000..61ae131829d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = 1.0f16 + 2.0f16i;
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > + _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > + _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +
> > + __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > + foo_ph (ph_dst);
> > + for (int i = 0; i != N; i++)
> > + {
> > + if (ph_dst[i] != ph_src)
> > + __builtin_abort ();
> > + }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> > new file mode 100644
> > index 00000000000..e922f7b5400
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> > @@ -0,0 +1,89 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +
> > +typedef struct { _Complex double c; double a1; double a2;}
> > + cdf;
> > +typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
> > + cdf2;
> > +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
> > + cdf3;
> > +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
> > + cdf4;
> > +
> > +#define N 100
> > +/* VMAT_ELEMENTWISE. */
> > +void
> > +__attribute__((noipa))
> > +foo (cdf* a, cdf* __restrict b)
> > +{
> > + for (int i = 0; i < N; ++i)
> > + {
> > + a[i].c = b[i].c;
> > + a[i].a1 = b[i].a1;
> > + a[i].a2 = b[i].a2;
> > + }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_PERMUTE. */
> > +void
> > +__attribute__((noipa))
> > +foo1 (cdf2* a, cdf2* __restrict b)
> > +{
> > + for (int i = 0; i < N; ++i)
> > + {
> > + a[i].c = b[i].c;
> > + a[i].a1 = b[i].a1;
> > + a[i].a2 = b[i].a2;
> > + a[i].a3 = b[i].a3;
> > + a[i].a4 = b[i].a4;
> > + }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS. */
> > +void
> > +__attribute__((noipa))
> > +foo2 (cdf3* a, cdf3* __restrict b)
> > +{
> > + for (int i = 0; i < N; ++i)
> > + {
> > + a[i].c1 = b[i].c1;
> > + a[i].c2 = b[i].c2;
> > + a[i].a1 = b[i].a1;
> > + a[i].a2 = b[i].a2;
> > + a[i].a3 = b[i].a3;
> > + a[i].a4 = b[i].a4;
> > + }
> > +}
> > +
> > +/* VMAT_STRIDED_SLP. */
> > +void
> > +__attribute__((noipa))
> > +foo3 (cdf4* a, cdf4* __restrict b)
> > +{
> > + for (int i = 0; i < N; ++i)
> > + {
> > + a[i].c1 = b[i].c1;
> > + a[i].c2 = b[i].c2;
> > + a[i].a1 = b[i].a1;
> > + a[i].a2 = b[i].a2;
> > + }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_REVERSE. */
> > +void
> > +__attribute__((noipa))
> > +foo4 (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[i] = b[N-i-1];
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_DOWN. */
> > +void
> > +__attribute__((noipa))
> > +foo5 (_Complex double* a, _Complex double* __restrict b)
> > +{
> > + for (int i = 0; i != N; i++)
> > + a[N-i-1] = b[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> > new file mode 100644
> > index 00000000000..e220445e6e3
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> > @@ -0,0 +1,90 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
> > +/* { dg-require-effective-target sse2 } */
> > +
> > +#include <string.h>
> > +#include "sse2-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +sse2_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > + char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > + for (int i = 0; i != N * sizeof (cdf3); i++)
> > + p_init[i] = i;
> > +
> > + memcpy (cdf_src, p_init, N * sizeof (cdf));
> > + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > + for (int i = 0; i != 2 * N * sizeof (double); i++)
> > + p_init[i] = i % 16;
> > + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > + foo (cdf_dst, cdf_src);
> > + foo1 (cdf2_dst, cdf2_src);
> > + foo2 (cdf3_dst, cdf3_src);
> > + foo3 (cdf4_dst, cdf4_src);
> > + foo4 (pd_dst, pd_src);
> > + foo5 (pd_dst2, pd_src2);
> > + for (int i = 0; i != N; i++)
> > + {
> > + p_init[(N - i - 1) * 16] = i * 16;
> > + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > + }
> > + memcpy (pd_src, p_init, N * 16);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > + __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> > new file mode 100644
> > index 00000000000..ff51f6195b7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> > @@ -0,0 +1,90 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include <string.h>
> > +#include "avx2-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +avx2_test (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > + char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > + for (int i = 0; i != N * sizeof (cdf3); i++)
> > + p_init[i] = i;
> > +
> > + memcpy (cdf_src, p_init, N * sizeof (cdf));
> > + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > + for (int i = 0; i != 2 * N * sizeof (double); i++)
> > + p_init[i] = i % 16;
> > + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > + foo (cdf_dst, cdf_src);
> > + foo1 (cdf2_dst, cdf2_src);
> > + foo2 (cdf3_dst, cdf3_src);
> > + foo3 (cdf4_dst, cdf4_src);
> > + foo4 (pd_dst, pd_src);
> > + foo5 (pd_dst2, pd_src2);
> > + for (int i = 0; i != N; i++)
> > + {
> > + p_init[(N - i - 1) * 16] = i * 16;
> > + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > + }
> > + memcpy (pd_src, p_init, N * 16);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > + __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> > new file mode 100644
> > index 00000000000..d4d8f1dd722
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> > @@ -0,0 +1,92 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include <string.h>
> > +#include <stdlib.h>
> > +#define AVX512F
> > +#include "avx512-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +test_512 (void)
> > +{
> > + _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > + cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > + cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > + cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > + cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > + cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > + cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > + char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > + __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > + __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > + __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > + __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > + __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > + __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > + for (int i = 0; i != N * sizeof (cdf3); i++)
> > + p_init[i] = i;
> > +
> > + memcpy (cdf_src, p_init, N * sizeof (cdf));
> > + memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > + memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > + memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > + memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > + for (int i = 0; i != 2 * N * sizeof (double); i++)
> > + p_init[i] = i % 16;
> > + memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > + foo (cdf_dst, cdf_src);
> > + foo1 (cdf2_dst, cdf2_src);
> > + foo2 (cdf3_dst, cdf3_src);
> > + foo3 (cdf4_dst, cdf4_src);
> > + foo4 (pd_dst, pd_src);
> > + foo5 (pd_dst2, pd_src2);
> > + for (int i = 0; i != N; i++)
> > + {
> > + p_init[(N - i - 1) * 16] = i * 16;
> > + p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > + p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > + p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > + p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > + p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > + p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > + p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > + p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > + p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > + p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > + p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > + p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > + p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > + p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > + p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > + }
> > + memcpy (pd_src, p_init, N * 16);
> > +
> > + if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > + __builtin_abort ();
> > +
> > + if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > + __builtin_abort ();
> > +}
> > diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> > index 61950a0f099..ea9df6114a1 100644
> > --- a/gcc/tree-complex.cc
> > +++ b/gcc/tree-complex.cc
> > @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
> > break;
> >
> > default:
> > + /* When expand_complex_move would trigger make sure we
> > + perform lowering even when there is no actual complex
> > + operation. This helps consistency and vectorization. */
> > + if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> > + saw_a_complex_op = true;
> > break;
> > }
> >
> > @@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
> > update_complex_assignment (gsi, r, i);
> > }
> > }
> > - else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
> > + else if (rhs
> > + && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
> > + && !TREE_SIDE_EFFECTS (lhs))
> > {
> > tree x;
> > gimple *t;
> > --
> > 2.18.1
> >
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* b)
+{ /* Copy N _Complex double values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(4) double accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* b)
+{ /* Copy N _Complex float values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(8) float accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* b)
+{ /* Copy N _Complex long long values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(4) long long int accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* b)
+{ /* Copy N _Complex int values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(8) int accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* b)
+{ /* Copy N _Complex short values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(16) short int accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* b)
+{ /* Copy N _Complex char values from b to a, one whole complex element per iteration; the dg-final scans above expect vector(32) char accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
new file mode 100644
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-1a.c"
+
+void
+avx_test (void)
+{ /* Run every loop-copy kernel from pr106010-1a.c on N-element buffers and abort unless each destination equals its source byte for byte.  */
+ _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+ _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+ _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+ _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+ _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+ _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+ _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+ _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+ _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+ _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+ _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+ char* p_init = (char*) malloc (2 * N * sizeof (double));
+ /* Clear every destination before the kernels run.  */
+ __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+ __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+ __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+ __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+ __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+ __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+ /* Pattern buffer, sized for the largest type: each byte is its own index converted to char.  */
+ for (int i = 0; i != 2 * N * sizeof (double); i++)
+ p_init[i] = i;
+ /* Seed each source with as many pattern bytes as its buffer holds.  */
+ memcpy (pd_src, p_init, 2 * N * sizeof (double));
+ memcpy (ps_src, p_init, 2 * N * sizeof (float));
+ memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+ memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+ memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+ memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+ /* Run the kernels, then require dst == src for every element type.  */
+ foo_pd (pd_dst, pd_src);
+ foo_ps (ps_dst, ps_src);
+ foo_epi64 (epi64_dst, epi64_src);
+ foo_epi32 (epi32_dst, epi32_src);
+ foo_epi16 (epi16_dst, epi16_src);
+ foo_epi8 (epi8_dst, epi8_src);
+ if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* b)
+{ /* Copy N _Complex _Float16 values from b to a, one whole complex element per iteration; the dg-final scan above expects vector(16) _Float16 accesses.  */
+ for (int i = 0; i != N; i++)
+ a[i] = b[i];
+}
+
+static void
+do_test (void)
+{ /* Run foo_ph on N-element buffers and abort unless the destination equals the source byte for byte.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+ char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+ /* Clear the destination before the kernel runs.  */
+ __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+ /* Pattern buffer: each byte is its own index converted to char.  */
+ for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+ p_init[i] = i;
+ /* Seed the source with the pattern.  */
+ memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+ /* Run the kernel and compare.  */
+ foo_ph (ph_dst, ph_src);
+ if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+ __builtin_abort ();
+}
new file mode 100644
@@ -0,0 +1,82 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Straight-line copy of two _Complex double elements; the dg-final scans above expect SLP to emit vector(4) double accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Straight-line copy of four _Complex float elements; the dg-final scans above expect SLP to emit vector(8) float accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Straight-line copy of two _Complex long long elements; the dg-final scans above expect SLP to emit vector(4) long long int accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Straight-line copy of four _Complex int elements; the dg-final scans above expect SLP to emit vector(8) int accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Straight-line copy of eight _Complex short elements; the dg-final scans above expect SLP to emit vector(16) short int accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Straight-line copy of sixteen _Complex char elements; the dg-final scans above expect SLP to emit vector(32) char accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+ a[8] = b[8];
+ a[9] = b[9];
+ a[10] = b[10];
+ a[11] = b[11];
+ a[12] = b[12];
+ a[13] = b[13];
+ a[14] = b[14];
+ a[15] = b[15];
+}
new file mode 100644
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-2a.c"
+
+void
+avx_test (void)
+{ /* Run every straight-line copy kernel from pr106010-2a.c on 32-byte buffers and abort unless each destination equals its source byte for byte.  */
+ _Complex double* pd_src = (_Complex double*) malloc (32);
+ _Complex double* pd_dst = (_Complex double*) malloc (32);
+ _Complex float* ps_src = (_Complex float*) malloc (32);
+ _Complex float* ps_dst = (_Complex float*) malloc (32);
+ _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+ _Complex int* epi32_src = (_Complex int*) malloc (32);
+ _Complex int* epi32_dst = (_Complex int*) malloc (32);
+ _Complex short* epi16_src = (_Complex short*) malloc (32);
+ _Complex short* epi16_dst = (_Complex short*) malloc (32);
+ _Complex char* epi8_src = (_Complex char*) malloc (32);
+ _Complex char* epi8_dst = (_Complex char*) malloc (32);
+ char* p = (char* ) malloc (32);
+ /* Clear every destination before the kernels run.  */
+ __builtin_memset (pd_dst, 0, 32);
+ __builtin_memset (ps_dst, 0, 32);
+ __builtin_memset (epi64_dst, 0, 32);
+ __builtin_memset (epi32_dst, 0, 32);
+ __builtin_memset (epi16_dst, 0, 32);
+ __builtin_memset (epi8_dst, 0, 32);
+ /* Give every source the byte pattern 0, 1, ..., 31.  */
+ for (int i = 0; i != 32; i++)
+ p[i] = i;
+ __builtin_memcpy (pd_src, p, 32);
+ __builtin_memcpy (ps_src, p, 32);
+ __builtin_memcpy (epi64_src, p, 32);
+ __builtin_memcpy (epi32_src, p, 32);
+ __builtin_memcpy (epi16_src, p, 32);
+ __builtin_memcpy (epi8_src, p, 32);
+ /* Run the kernels, then require dst == src for every element type.  */
+ foo_pd (pd_dst, pd_src);
+ foo_ps (ps_dst, ps_src);
+ foo_epi64 (epi64_dst, epi64_src);
+ foo_epi32 (epi32_dst, epi32_src);
+ foo_epi16 (epi16_dst, epi16_src);
+ foo_epi8 (epi8_dst, epi8_src);
+ if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Straight-line copy of eight _Complex _Float16 elements; the dg-final scans above expect SLP to emit vector(16) _Float16 accesses.  */
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on 32-byte buffers and abort unless the destination equals the source byte for byte.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+ char* p = (char* ) malloc (32);
+ /* Clear the destination before the kernel runs.  */
+ __builtin_memset (ph_dst, 0, 32);
+ /* Give the source the byte pattern 0, 1, ..., 31.  */
+ for (int i = 0; i != 32; i++)
+ p[i] = i;
+ __builtin_memcpy (ph_src, p, 32);
+ /* Run the kernel and compare.  */
+ foo_ph (ph_dst, ph_src);
+ if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,80 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy the two elements of b into a in swapped order; the dg-final scans above look for the matching VEC_PERM_EXPR.  */
+ a[0] = b[1];
+ a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy four elements with each adjacent pair swapped: a[0..3] = b[1], b[0], b[3], b[2].  */
+ a[0] = b[1];
+ a[1] = b[0];
+ a[2] = b[3];
+ a[3] = b[2];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy the two elements of b into a in swapped order; the dg-final scans above look for the matching VEC_PERM_EXPR.  */
+ a[0] = b[1];
+ a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy four elements of b into a in fully reversed order.  */
+ a[0] = b[3];
+ a[1] = b[2];
+ a[2] = b[1];
+ a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy eight elements of b into a in fully reversed order.  */
+ a[0] = b[7];
+ a[1] = b[6];
+ a[2] = b[5];
+ a[3] = b[4];
+ a[4] = b[3];
+ a[5] = b[2];
+ a[6] = b[1];
+ a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy sixteen elements, reversing each half separately: a[0..7] = b[7..0] and a[8..15] = b[15..8].  */
+ a[0] = b[7];
+ a[1] = b[6];
+ a[2] = b[5];
+ a[3] = b[4];
+ a[4] = b[3];
+ a[5] = b[2];
+ a[6] = b[1];
+ a[7] = b[0];
+ a[8] = b[15];
+ a[9] = b[14];
+ a[10] = b[13];
+ a[11] = b[12];
+ a[12] = b[11];
+ a[13] = b[10];
+ a[14] = b[9];
+ a[15] = b[8];
+}
new file mode 100644
@@ -0,0 +1,126 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-3a.c"
+
+void
+avx2_test (void)
+{ /* Run every permuting kernel from pr106010-3a.c on 32-byte buffers and abort unless each destination equals a hand-built expected byte image.  */
+ _Complex double* pd_src = (_Complex double*) malloc (32);
+ _Complex double* pd_dst = (_Complex double*) malloc (32);
+ _Complex double* pd_exp = (_Complex double*) malloc (32);
+ _Complex float* ps_src = (_Complex float*) malloc (32);
+ _Complex float* ps_dst = (_Complex float*) malloc (32);
+ _Complex float* ps_exp = (_Complex float*) malloc (32);
+ _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+ _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
+ _Complex int* epi32_src = (_Complex int*) malloc (32);
+ _Complex int* epi32_dst = (_Complex int*) malloc (32);
+ _Complex int* epi32_exp = (_Complex int*) malloc (32);
+ _Complex short* epi16_src = (_Complex short*) malloc (32);
+ _Complex short* epi16_dst = (_Complex short*) malloc (32);
+ _Complex short* epi16_exp = (_Complex short*) malloc (32);
+ _Complex char* epi8_src = (_Complex char*) malloc (32);
+ _Complex char* epi8_dst = (_Complex char*) malloc (32);
+ _Complex char* epi8_exp = (_Complex char*) malloc (32);
+ char* p = (char* ) malloc (32);
+ char* q = (char* ) malloc (32);
+ /* Clear every destination before the kernels run.  */
+ __builtin_memset (pd_dst, 0, 32);
+ __builtin_memset (ps_dst, 0, 32);
+ __builtin_memset (epi64_dst, 0, 32);
+ __builtin_memset (epi32_dst, 0, 32);
+ __builtin_memset (epi16_dst, 0, 32);
+ __builtin_memset (epi8_dst, 0, 32);
+ /* Give every source the byte pattern 0, 1, ..., 31.  */
+ for (int i = 0; i != 32; i++)
+ p[i] = i;
+ __builtin_memcpy (pd_src, p, 32);
+ __builtin_memcpy (ps_src, p, 32);
+ __builtin_memcpy (epi64_src, p, 32);
+ __builtin_memcpy (epi32_src, p, 32);
+ __builtin_memcpy (epi16_src, p, 32);
+ __builtin_memcpy (epi8_src, p, 32);
+ /* Expected image for pd and epi64: the two 16-byte halves of the pattern swapped.  */
+ for (int i = 0; i != 16; i++)
+ {
+ p[i] = i + 16;
+ p[i + 16] = i;
+ }
+ __builtin_memcpy (pd_exp, p, 32);
+ __builtin_memcpy (epi64_exp, p, 32);
+ /* 8-byte groups: p swaps adjacent groups (expected for ps), q reverses all four groups (expected for epi32).  */
+ for (int i = 0; i != 8; i++)
+ {
+ p[i] = i + 8;
+ p[i + 8] = i;
+ p[i + 16] = i + 24;
+ p[i + 24] = i + 16;
+ q[i] = i + 24;
+ q[i + 8] = i + 16;
+ q[i + 16] = i + 8;
+ q[i + 24] = i;
+ }
+ __builtin_memcpy (ps_exp, p, 32);
+ __builtin_memcpy (epi32_exp, q, 32);
+
+ /* Expected image for epi16: the eight 4-byte groups in reversed order.  */
+ for (int i = 0; i != 4; i++)
+ {
+ q[i] = i + 28;
+ q[i + 4] = i + 24;
+ q[i + 8] = i + 20;
+ q[i + 12] = i + 16;
+ q[i + 16] = i + 12;
+ q[i + 20] = i + 8;
+ q[i + 24] = i + 4;
+ q[i + 28] = i;
+ }
+ __builtin_memcpy (epi16_exp, q, 32);
+ /* Expected image for epi8: 2-byte groups, with each run of eight groups reversed separately.  */
+ for (int i = 0; i != 2; i++)
+ {
+ q[i] = i + 14;
+ q[i + 2] = i + 12;
+ q[i + 4] = i + 10;
+ q[i + 6] = i + 8;
+ q[i + 8] = i + 6;
+ q[i + 10] = i + 4;
+ q[i + 12] = i + 2;
+ q[i + 14] = i;
+ q[i + 16] = i + 30;
+ q[i + 18] = i + 28;
+ q[i + 20] = i + 26;
+ q[i + 22] = i + 24;
+ q[i + 24] = i + 22;
+ q[i + 26] = i + 20;
+ q[i + 28] = i + 18;
+ q[i + 30] = i + 16;
+ }
+ __builtin_memcpy (epi8_exp, q, 32);
+ /* Run the kernels and compare each destination with its expected image.  */
+ foo_pd (pd_dst, pd_src);
+ foo_ps (ps_dst, ps_src);
+ foo_epi64 (epi64_dst, epi64_src);
+ foo_epi32 (epi32_dst, epi32_src);
+ foo_epi16 (epi16_dst, epi16_src);
+ foo_epi8 (epi8_dst, epi8_src);
+ if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,69 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy eight elements in the order b[1], b[0], b[4], b[3], b[7], b[6], b[2], b[5]; the dg-final scan above expects the matching 16-lane VEC_PERM_EXPR.  */
+ a[0] = b[1];
+ a[1] = b[0];
+ a[2] = b[4];
+ a[3] = b[3];
+ a[4] = b[7];
+ a[5] = b[6];
+ a[6] = b[2];
+ a[7] = b[5];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on 32-byte buffers and abort unless the destination equals a hand-built expected byte image.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+ _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
+ char* p = (char* ) malloc (32);
+ char* q = (char* ) malloc (32);
+ /* Clear the destination before the kernel runs.  */
+ __builtin_memset (ph_dst, 0, 32);
+ /* Give the source the byte pattern 0, 1, ..., 31.  */
+ for (int i = 0; i != 32; i++)
+ p[i] = i;
+ __builtin_memcpy (ph_src, p, 32);
+ /* p: expected bytes, 4-byte groups arranged in the order foo_ph copies them.  NOTE(review): q is filled below but never read afterwards.  */
+ for (int i = 0; i != 4; i++)
+ {
+ p[i] = i + 4;
+ p[i + 4] = i;
+ p[i + 8] = i + 16;
+ p[i + 12] = i + 12;
+ p[i + 16] = i + 28;
+ p[i + 20] = i + 24;
+ p[i + 24] = i + 8;
+ p[i + 28] = i + 20;
+ q[i] = i + 28;
+ q[i + 4] = i + 24;
+ q[i + 8] = i + 20;
+ q[i + 12] = i + 16;
+ q[i + 16] = i + 12;
+ q[i + 20] = i + 8;
+ q[i + 24] = i + 4;
+ q[i + 28] = i;
+ }
+ __builtin_memcpy (ph_exp, p, 32);
+ /* Run the kernel and compare with the expected image.  */
+ foo_ph (ph_dst, ph_src);
+ if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,101 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a,
+ _Complex double b1,
+ _Complex double b2)
+{ /* Store the two by-value complex arguments to a[0] and a[1].  */
+ a[0] = b1;
+ a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a,
+ _Complex float b1, _Complex float b2,
+ _Complex float b3, _Complex float b4)
+{ /* Store the four by-value complex arguments to a[0..3] in order.  */
+ a[0] = b1;
+ a[1] = b2;
+ a[2] = b3;
+ a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a,
+ _Complex long long b1,
+ _Complex long long b2)
+{ /* Store the two by-value complex arguments to a[0] and a[1].  */
+ a[0] = b1;
+ a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a,
+ _Complex int b1, _Complex int b2,
+ _Complex int b3, _Complex int b4)
+{ /* Store the four by-value complex arguments to a[0..3] in order.  */
+ a[0] = b1;
+ a[1] = b2;
+ a[2] = b3;
+ a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a,
+ _Complex short b1, _Complex short b2,
+ _Complex short b3, _Complex short b4,
+ _Complex short b5, _Complex short b6,
+ _Complex short b7,_Complex short b8)
+{ /* Store the eight by-value complex arguments to a[0..7] in order.  */
+ a[0] = b1;
+ a[1] = b2;
+ a[2] = b3;
+ a[3] = b4;
+ a[4] = b5;
+ a[5] = b6;
+ a[6] = b7;
+ a[7] = b8;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a,
+ _Complex char b1, _Complex char b2,
+ _Complex char b3, _Complex char b4,
+ _Complex char b5, _Complex char b6,
+ _Complex char b7,_Complex char b8,
+ _Complex char b9, _Complex char b10,
+ _Complex char b11, _Complex char b12,
+ _Complex char b13, _Complex char b14,
+ _Complex char b15,_Complex char b16)
+{ /* Store the sixteen by-value complex arguments to a[0..15] in order.  */
+ a[0] = b1;
+ a[1] = b2;
+ a[2] = b3;
+ a[3] = b4;
+ a[4] = b5;
+ a[5] = b6;
+ a[6] = b7;
+ a[7] = b8;
+ a[8] = b9;
+ a[9] = b10;
+ a[10] = b11;
+ a[11] = b12;
+ a[12] = b13;
+ a[13] = b14;
+ a[14] = b15;
+ a[15] = b16;
+}
new file mode 100644
@@ -0,0 +1,67 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-4a.c"
+
+void
+avx_test (void)
+{ /* Run each foo_* kernel from pr106010-4a.c on a 32-byte pattern and abort if any destination differs from its source.  */
+ _Complex double* pd_src = (_Complex double*) malloc (32);
+ _Complex double* pd_dst = (_Complex double*) malloc (32);
+ _Complex float* ps_src = (_Complex float*) malloc (32);
+ _Complex float* ps_dst = (_Complex float*) malloc (32);
+ _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+ _Complex int* epi32_src = (_Complex int*) malloc (32);
+ _Complex int* epi32_dst = (_Complex int*) malloc (32);
+ _Complex short* epi16_src = (_Complex short*) malloc (32);
+ _Complex short* epi16_dst = (_Complex short*) malloc (32);
+ _Complex char* epi8_src = (_Complex char*) malloc (32);
+ _Complex char* epi8_dst = (_Complex char*) malloc (32);
+ char* p = (char* ) malloc (32); /* Byte pattern shared by all source buffers.  */
+
+ __builtin_memset (pd_dst, 0, 32); /* Destinations start zeroed so a missing store is detected.  */
+ __builtin_memset (ps_dst, 0, 32);
+ __builtin_memset (epi64_dst, 0, 32);
+ __builtin_memset (epi32_dst, 0, 32);
+ __builtin_memset (epi16_dst, 0, 32);
+ __builtin_memset (epi8_dst, 0, 32);
+
+ for (int i = 0; i != 32; i++)
+ p[i] = i; /* Byte k holds the value k.  */
+ __builtin_memcpy (pd_src, p, 32);
+ __builtin_memcpy (ps_src, p, 32);
+ __builtin_memcpy (epi64_src, p, 32);
+ __builtin_memcpy (epi32_src, p, 32);
+ __builtin_memcpy (epi16_src, p, 32);
+ __builtin_memcpy (epi8_src, p, 32);
+
+ foo_pd (pd_dst, pd_src[0], pd_src[1]); /* Each kernel receives the source elements by value, in order.  */
+ foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
+ foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
+ foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
+ foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
+ epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
+ foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
+ epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
+ epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
+ epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
+
+ if (__builtin_memcmp (pd_dst, pd_src, 32) != 0) /* In-order stores must reproduce the source bytes exactly.  */
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,54 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a,
+ _Complex _Float16 b1, _Complex _Float16 b2,
+ _Complex _Float16 b3, _Complex _Float16 b4,
+ _Complex _Float16 b5, _Complex _Float16 b6,
+ _Complex _Float16 b7,_Complex _Float16 b8)
+{ /* Store the eight by-value complex arguments to a[0..7] in order.  */
+ a[0] = b1;
+ a[1] = b2;
+ a[2] = b3;
+ a[3] = b4;
+ a[4] = b5;
+ a[5] = b6;
+ a[6] = b7;
+ a[7] = b8;
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a 32-byte pattern and abort unless the destination equals the source.  */
+
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+
+ char* p = (char* ) malloc (32); /* Byte pattern used to initialize the source.  */
+
+ __builtin_memset (ph_dst, 0, 32); /* Start zeroed so a missing store is detected.  */
+
+ for (int i = 0; i != 32; i++)
+ p[i] = i; /* Byte k holds the value k.  */
+
+ __builtin_memcpy (ph_src, p, 32);
+
+ foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
+ ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
+
+ if (__builtin_memcmp (ph_dst, ph_src, 32) != 0) /* In-order stores must reproduce the source bytes.  */
+ __builtin_abort ();
+ return;
+}
new file mode 100644
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..1] = b[2..3], a[2..3] = b[0..1].  */
+ a[0] = b[2];
+ a[1] = b[3];
+ a[2] = b[0];
+ a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
+ a[0] = b[4];
+ a[1] = b[5];
+ a[2] = b[6];
+ a[3] = b[7];
+ a[4] = b[0];
+ a[5] = b[1];
+ a[6] = b[2];
+ a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..1] = b[2..3], a[2..3] = b[0..1].  */
+ a[0] = b[2];
+ a[1] = b[3];
+ a[2] = b[0];
+ a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
+ a[0] = b[4];
+ a[1] = b[5];
+ a[2] = b[6];
+ a[3] = b[7];
+ a[4] = b[0];
+ a[5] = b[1];
+ a[6] = b[2];
+ a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..7] = b[8..15], a[8..15] = b[0..7].  */
+ a[0] = b[8];
+ a[1] = b[9];
+ a[2] = b[10];
+ a[3] = b[11];
+ a[4] = b[12];
+ a[5] = b[13];
+ a[6] = b[14];
+ a[7] = b[15];
+ a[8] = b[0];
+ a[9] = b[1];
+ a[10] = b[2];
+ a[11] = b[3];
+ a[12] = b[4];
+ a[13] = b[5];
+ a[14] = b[6];
+ a[15] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..15] = b[16..31], a[16..31] = b[0..15].  */
+ a[0] = b[16];
+ a[1] = b[17];
+ a[2] = b[18];
+ a[3] = b[19];
+ a[4] = b[20];
+ a[5] = b[21];
+ a[6] = b[22];
+ a[7] = b[23];
+ a[8] = b[24];
+ a[9] = b[25];
+ a[10] = b[26];
+ a[11] = b[27];
+ a[12] = b[28];
+ a[13] = b[29];
+ a[14] = b[30];
+ a[15] = b[31];
+ a[16] = b[0];
+ a[17] = b[1];
+ a[18] = b[2];
+ a[19] = b[3];
+ a[20] = b[4];
+ a[21] = b[5];
+ a[22] = b[6];
+ a[23] = b[7];
+ a[24] = b[8];
+ a[25] = b[9];
+ a[26] = b[10];
+ a[27] = b[11];
+ a[28] = b[12];
+ a[29] = b[13];
+ a[30] = b[14];
+ a[31] = b[15];
+}
new file mode 100644
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-5a.c"
+
+void
+avx_test (void)
+{ /* Run each half-swapping kernel from pr106010-5a.c on a 64-byte pattern and abort on any mismatch with the expected image.  */
+ _Complex double* pd_src = (_Complex double*) malloc (64);
+ _Complex double* pd_dst = (_Complex double*) malloc (64);
+ _Complex double* pd_exp = (_Complex double*) malloc (64);
+ _Complex float* ps_src = (_Complex float*) malloc (64);
+ _Complex float* ps_dst = (_Complex float*) malloc (64);
+ _Complex float* ps_exp = (_Complex float*) malloc (64);
+ _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+ _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+ _Complex int* epi32_src = (_Complex int*) malloc (64);
+ _Complex int* epi32_dst = (_Complex int*) malloc (64);
+ _Complex int* epi32_exp = (_Complex int*) malloc (64);
+ _Complex short* epi16_src = (_Complex short*) malloc (64);
+ _Complex short* epi16_dst = (_Complex short*) malloc (64);
+ _Complex short* epi16_exp = (_Complex short*) malloc (64);
+ _Complex char* epi8_src = (_Complex char*) malloc (64);
+ _Complex char* epi8_dst = (_Complex char*) malloc (64);
+ _Complex char* epi8_exp = (_Complex char*) malloc (64);
+ char* p = (char* ) malloc (64); /* Source byte pattern.  */
+ char* q = (char* ) malloc (64); /* Expected byte pattern.  */
+
+ __builtin_memset (pd_dst, 0, 64); /* Destinations start zeroed so a missing store is detected.  */
+ __builtin_memset (ps_dst, 0, 64);
+ __builtin_memset (epi64_dst, 0, 64);
+ __builtin_memset (epi32_dst, 0, 64);
+ __builtin_memset (epi16_dst, 0, 64);
+ __builtin_memset (epi8_dst, 0, 64);
+
+ for (int i = 0; i != 64; i++)
+ {
+ p[i] = i; /* Byte k holds the value k.  */
+ q[i] = (i + 32) % 64; /* Same bytes with the two 32-byte halves exchanged.  */
+ }
+ __builtin_memcpy (pd_src, p, 64);
+ __builtin_memcpy (ps_src, p, 64);
+ __builtin_memcpy (epi64_src, p, 64);
+ __builtin_memcpy (epi32_src, p, 64);
+ __builtin_memcpy (epi16_src, p, 64);
+ __builtin_memcpy (epi8_src, p, 64);
+
+ __builtin_memcpy (pd_exp, q, 64); /* One expected image serves every element type.  */
+ __builtin_memcpy (ps_exp, q, 64);
+ __builtin_memcpy (epi64_exp, q, 64);
+ __builtin_memcpy (epi32_exp, q, 64);
+ __builtin_memcpy (epi16_exp, q, 64);
+ __builtin_memcpy (epi8_exp, q, 64);
+
+ foo_pd (pd_dst, pd_src);
+ foo_ps (ps_dst, ps_src);
+ foo_epi64 (epi64_dst, epi64_src);
+ foo_epi32 (epi32_dst, epi32_src);
+ foo_epi16 (epi16_dst, epi16_src);
+ foo_epi8 (epi8_dst, epi8_src);
+
+ if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* Byte-wise comparison against the expected image.  */
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy b to a with the two halves swapped: a[0..7] = b[8..15], a[8..15] = b[0..7].  */
+ a[0] = b[8];
+ a[1] = b[9];
+ a[2] = b[10];
+ a[3] = b[11];
+ a[4] = b[12];
+ a[5] = b[13];
+ a[6] = b[14];
+ a[7] = b[15];
+ a[8] = b[0];
+ a[9] = b[1];
+ a[10] = b[2];
+ a[11] = b[3];
+ a[12] = b[4];
+ a[13] = b[5];
+ a[14] = b[6];
+ a[15] = b[7];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a 64-byte pattern and abort unless the result is the half-swapped image.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+ _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+ char* p = (char* ) malloc (64); /* Source byte pattern.  */
+ char* q = (char* ) malloc (64); /* Expected byte pattern.  */
+
+ __builtin_memset (ph_dst, 0, 64);
+
+ for (int i = 0; i != 64; i++)
+ {
+ p[i] = i; /* Byte k holds the value k.  */
+ q[i] = (i + 32) % 64; /* Same bytes with the two 32-byte halves exchanged.  */
+ }
+ __builtin_memcpy (ph_src, p, 64);
+
+ __builtin_memcpy (ph_exp, q, 64);
+
+ foo_ph (ph_dst, ph_src);
+
+ if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0) /* Byte-wise comparison against the expected image.  */
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,115 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy the four elements of b into a in reverse order.  */
+ a[0] = b[3];
+ a[1] = b[2];
+ a[2] = b[1];
+ a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy the eight elements of b into a in reverse order.  */
+ a[0] = b[7];
+ a[1] = b[6];
+ a[2] = b[5];
+ a[3] = b[4];
+ a[4] = b[3];
+ a[5] = b[2];
+ a[6] = b[1];
+ a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy the four elements of b into a in reverse order.  */
+ a[0] = b[3];
+ a[1] = b[2];
+ a[2] = b[1];
+ a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy the eight elements of b into a in reverse order.  */
+ a[0] = b[7];
+ a[1] = b[6];
+ a[2] = b[5];
+ a[3] = b[4];
+ a[4] = b[3];
+ a[5] = b[2];
+ a[6] = b[1];
+ a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy the sixteen elements of b into a in reverse order.  */
+ a[0] = b[15];
+ a[1] = b[14];
+ a[2] = b[13];
+ a[3] = b[12];
+ a[4] = b[11];
+ a[5] = b[10];
+ a[6] = b[9];
+ a[7] = b[8];
+ a[8] = b[7];
+ a[9] = b[6];
+ a[10] = b[5];
+ a[11] = b[4];
+ a[12] = b[3];
+ a[13] = b[2];
+ a[14] = b[1];
+ a[15] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy the thirty-two elements of b into a in reverse order.  */
+ a[0] = b[31];
+ a[1] = b[30];
+ a[2] = b[29];
+ a[3] = b[28];
+ a[4] = b[27];
+ a[5] = b[26];
+ a[6] = b[25];
+ a[7] = b[24];
+ a[8] = b[23];
+ a[9] = b[22];
+ a[10] = b[21];
+ a[11] = b[20];
+ a[12] = b[19];
+ a[13] = b[18];
+ a[14] = b[17];
+ a[15] = b[16];
+ a[16] = b[15];
+ a[17] = b[14];
+ a[18] = b[13];
+ a[19] = b[12];
+ a[20] = b[11];
+ a[21] = b[10];
+ a[22] = b[9];
+ a[23] = b[8];
+ a[24] = b[7];
+ a[25] = b[6];
+ a[26] = b[5];
+ a[27] = b[4];
+ a[28] = b[3];
+ a[29] = b[2];
+ a[30] = b[1];
+ a[31] = b[0];
+}
new file mode 100644
@@ -0,0 +1,157 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-6a.c"
+
+void
+avx2_test (void)
+{ /* Run each reversing kernel from pr106010-6a.c on a 64-byte pattern and abort on any mismatch with its expected image.  */
+ _Complex double* pd_src = (_Complex double*) malloc (64);
+ _Complex double* pd_dst = (_Complex double*) malloc (64);
+ _Complex double* pd_exp = (_Complex double*) malloc (64);
+ _Complex float* ps_src = (_Complex float*) malloc (64);
+ _Complex float* ps_dst = (_Complex float*) malloc (64);
+ _Complex float* ps_exp = (_Complex float*) malloc (64);
+ _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+ _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+ _Complex int* epi32_src = (_Complex int*) malloc (64);
+ _Complex int* epi32_dst = (_Complex int*) malloc (64);
+ _Complex int* epi32_exp = (_Complex int*) malloc (64);
+ _Complex short* epi16_src = (_Complex short*) malloc (64);
+ _Complex short* epi16_dst = (_Complex short*) malloc (64);
+ _Complex short* epi16_exp = (_Complex short*) malloc (64);
+ _Complex char* epi8_src = (_Complex char*) malloc (64);
+ _Complex char* epi8_dst = (_Complex char*) malloc (64);
+ _Complex char* epi8_exp = (_Complex char*) malloc (64);
+ char* p = (char* ) malloc (64); /* Source byte pattern.  */
+ char* q = (char* ) malloc (64); /* Expected byte pattern, rebuilt once per chunk size below.  */
+
+ __builtin_memset (pd_dst, 0, 64); /* Destinations start zeroed so a missing store is detected.  */
+ __builtin_memset (ps_dst, 0, 64);
+ __builtin_memset (epi64_dst, 0, 64);
+ __builtin_memset (epi32_dst, 0, 64);
+ __builtin_memset (epi16_dst, 0, 64);
+ __builtin_memset (epi8_dst, 0, 64);
+
+ for (int i = 0; i != 64; i++)
+ p[i] = i; /* Byte k holds the value k.  */
+
+ __builtin_memcpy (pd_src, p, 64);
+ __builtin_memcpy (ps_src, p, 64);
+ __builtin_memcpy (epi64_src, p, 64);
+ __builtin_memcpy (epi32_src, p, 64);
+ __builtin_memcpy (epi16_src, p, 64);
+ __builtin_memcpy (epi8_src, p, 64);
+
+
+ for (int i = 0; i != 16; i++) /* Four 16-byte chunks in reversed order.  */
+ {
+ q[i] = i + 48;
+ q[i + 16] = i + 32;
+ q[i + 32] = i + 16;
+ q[i + 48] = i;
+ }
+
+ __builtin_memcpy (pd_exp, q, 64); /* Used for the double and long long variants.  */
+ __builtin_memcpy (epi64_exp, q, 64);
+
+ for (int i = 0; i != 8; i++) /* Eight 8-byte chunks in reversed order.  */
+ {
+ q[i] = i + 56;
+ q[i + 8] = i + 48;
+ q[i + 16] = i + 40;
+ q[i + 24] = i + 32;
+ q[i + 32] = i + 24;
+ q[i + 40] = i + 16;
+ q[i + 48] = i + 8;
+ q[i + 56] = i;
+ }
+
+ __builtin_memcpy (ps_exp, q, 64); /* Used for the float and int variants.  */
+ __builtin_memcpy (epi32_exp, q, 64);
+
+ for (int i = 0; i != 4; i++) /* Sixteen 4-byte chunks in reversed order.  */
+ {
+ q[i] = i + 60;
+ q[i + 4] = i + 56;
+ q[i + 8] = i + 52;
+ q[i + 12] = i + 48;
+ q[i + 16] = i + 44;
+ q[i + 20] = i + 40;
+ q[i + 24] = i + 36;
+ q[i + 28] = i + 32;
+ q[i + 32] = i + 28;
+ q[i + 36] = i + 24;
+ q[i + 40] = i + 20;
+ q[i + 44] = i + 16;
+ q[i + 48] = i + 12;
+ q[i + 52] = i + 8;
+ q[i + 56] = i + 4;
+ q[i + 60] = i;
+ }
+
+ __builtin_memcpy (epi16_exp, q, 64); /* Used for the short variant.  */
+
+ for (int i = 0; i != 2; i++) /* Thirty-two 2-byte chunks in reversed order.  */
+ {
+ q[i] = i + 62;
+ q[i + 2] = i + 60;
+ q[i + 4] = i + 58;
+ q[i + 6] = i + 56;
+ q[i + 8] = i + 54;
+ q[i + 10] = i + 52;
+ q[i + 12] = i + 50;
+ q[i + 14] = i + 48;
+ q[i + 16] = i + 46;
+ q[i + 18] = i + 44;
+ q[i + 20] = i + 42;
+ q[i + 22] = i + 40;
+ q[i + 24] = i + 38;
+ q[i + 26] = i + 36;
+ q[i + 28] = i + 34;
+ q[i + 30] = i + 32;
+ q[i + 32] = i + 30;
+ q[i + 34] = i + 28;
+ q[i + 36] = i + 26;
+ q[i + 38] = i + 24;
+ q[i + 40] = i + 22;
+ q[i + 42] = i + 20;
+ q[i + 44] = i + 18;
+ q[i + 46] = i + 16;
+ q[i + 48] = i + 14;
+ q[i + 50] = i + 12;
+ q[i + 52] = i + 10;
+ q[i + 54] = i + 8;
+ q[i + 56] = i + 6;
+ q[i + 58] = i + 4;
+ q[i + 60] = i + 2;
+ q[i + 62] = i;
+ }
+ __builtin_memcpy (epi8_exp, q, 64); /* Used for the char variant.  */
+
+ foo_pd (pd_dst, pd_src);
+ foo_ps (ps_dst, ps_src);
+ foo_epi64 (epi64_dst, epi64_src);
+ foo_epi32 (epi32_dst, epi32_src);
+ foo_epi16 (epi16_dst, epi16_src);
+ foo_epi8 (epi8_dst, epi8_src);
+
+ if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* Byte-wise comparison against the expected image.  */
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy the sixteen elements of b into a in reverse order.  */
+ a[0] = b[15];
+ a[1] = b[14];
+ a[2] = b[13];
+ a[3] = b[12];
+ a[4] = b[11];
+ a[5] = b[10];
+ a[6] = b[9];
+ a[7] = b[8];
+ a[8] = b[7];
+ a[9] = b[6];
+ a[10] = b[5];
+ a[11] = b[4];
+ a[12] = b[3];
+ a[13] = b[2];
+ a[14] = b[1];
+ a[15] = b[0];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a 64-byte pattern and abort unless the result is the chunk-reversed image.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+ _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+ char* p = (char* ) malloc (64); /* Source byte pattern.  */
+ char* q = (char* ) malloc (64); /* Expected byte pattern.  */
+
+ __builtin_memset (ph_dst, 0, 64);
+
+ for (int i = 0; i != 64; i++)
+ p[i] = i; /* Byte k holds the value k.  */
+
+ __builtin_memcpy (ph_src, p, 64);
+
+ for (int i = 0; i != 4; i++) /* Sixteen 4-byte chunks in reversed order.  */
+ {
+ q[i] = i + 60;
+ q[i + 4] = i + 56;
+ q[i + 8] = i + 52;
+ q[i + 12] = i + 48;
+ q[i + 16] = i + 44;
+ q[i + 20] = i + 40;
+ q[i + 24] = i + 36;
+ q[i + 28] = i + 32;
+ q[i + 32] = i + 28;
+ q[i + 36] = i + 24;
+ q[i + 40] = i + 20;
+ q[i + 44] = i + 16;
+ q[i + 48] = i + 12;
+ q[i + 52] = i + 8;
+ q[i + 56] = i + 4;
+ q[i + 60] = i;
+ }
+
+ __builtin_memcpy (ph_exp, q, 64);
+
+ foo_ph (ph_dst, ph_src);
+
+ if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0) /* Byte-wise comparison against the expected image.  */
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
new file mode 100644
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-7a.c"
+
+void
+avx_test (void)
+{ /* Run each splat-store kernel from pr106010-7a.c with src[0] and abort unless the filled destination equals the uniform source.  */
+ _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+ _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+ _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+ _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+ _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+ _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+ _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+ _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+ _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+ _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+ _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+ char* p_init = (char*) malloc (2 * N * sizeof (double)); /* Sized for the largest element type; smaller types copy a prefix.  */
+
+ __builtin_memset (pd_dst, 0, 2 * N * sizeof (double)); /* Destinations start zeroed so a missing store is detected.  */
+ __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+ __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+ __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+ __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+ __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+ for (int i = 0; i != 2 * N * sizeof (double); i++)
+ p_init[i] = i % 2 + 3; /* Pattern repeats every 2 bytes, so all elements of a source array are byte-identical.  */
+
+ memcpy (pd_src, p_init, 2 * N * sizeof (double));
+ memcpy (ps_src, p_init, 2 * N * sizeof (float)); /* Initialize the sources (not the destinations): src[0] and the memcmp below read them.  */
+ memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+ memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+ memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+ memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+ foo_pd (pd_dst, pd_src[0]);
+ foo_ps (ps_dst, ps_src[0]);
+ foo_epi64 (epi64_dst, epi64_src[0]);
+ foo_epi32 (epi32_dst, epi32_src[0]);
+ foo_epi16 (epi16_dst, epi16_src[0]);
+ foo_epi8 (epi8_dst, epi8_src[0]);
+ if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* A splat of src[0] must equal the uniform source.  */
+ __builtin_abort ();
+ if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+ __builtin_abort ();
+ if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+ __builtin_abort ();
+
+ return;
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16 b)
+{ /* Store the by-value complex b into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = b;
+}
+
+static void
+do_test (void)
+{ /* Run foo_ph with ph_src[0] and abort unless the filled destination equals the uniform source.  */
+ _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+ char* p_init = (char*) malloc (2 * N * sizeof (_Float16)); /* Byte pattern used to initialize the source.  */
+
+ __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16)); /* Start zeroed so a missing store is detected.  */
+
+ for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+ p_init[i] = i % 2 + 3; /* Pattern repeats every 2 bytes.  */
+
+ memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+ foo_ph (ph_dst, ph_src[0]);
+ if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0) /* A splat of ph_src[0] must equal the uniform source.  */
+ __builtin_abort ();
+}
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a)
+{ /* Store the complex constant 1.0 + 2.0i into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1.0 + 2.0i;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a)
+{ /* Store the complex constant 1.0f + 2.0fi into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1.0f + 2.0fi;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a)
+{ /* Store the complex integer constant 1 + 2i into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a)
+{ /* Store the complex integer constant 1 + 2i into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a)
+{ /* Store the complex integer constant 1 + 2i into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a)
+{ /* Store the complex integer constant 1 + 2i into each of a[0..N-1].  */
+ for (int i = 0; i != N; i++)
+ a[i] = 1 + 2i;
+}
new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-8a.c"
+
+void
+avx_test (void)
+{ /* Run each constant-store kernel from pr106010-8a.c and abort unless every element equals 1 + 2i.  */
+ _Complex double pd_src = 1.0 + 2.0i;
+ _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+ _Complex float ps_src = 1.0 + 2.0i;
+ _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+ _Complex long long epi64_src = 1 + 2i;
+ _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+ _Complex int epi32_src = 1 + 2i;
+ _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+ _Complex short epi16_src = 1 + 2i;
+ _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+ _Complex char epi8_src = 1 + 2i;
+ _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+
+ __builtin_memset (pd_dst, 0, 2 * N * sizeof (double)); /* Destinations start zeroed so a missing store is detected.  */
+ __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+ __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+ __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+ __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+ __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+ foo_pd (pd_dst);
+ foo_ps (ps_dst);
+ foo_epi64 (epi64_dst);
+ foo_epi32 (epi32_dst);
+ foo_epi16 (epi16_dst);
+ foo_epi8 (epi8_dst);
+ for (int i = 0 ; i != N; i++) /* Element-wise complex comparison against the reference constants.  */
+ {
+ if (pd_dst[i] != pd_src)
+ __builtin_abort ();
+ if (ps_dst[i] != ps_src)
+ __builtin_abort ();
+ if (epi64_dst[i] != epi64_src)
+ __builtin_abort ();
+ if (epi32_dst[i] != epi32_src)
+ __builtin_abort ();
+ if (epi16_dst[i] != epi16_src)
+ __builtin_abort ();
+ if (epi8_dst[i] != epi8_src)
+ __builtin_abort ();
+ }
+}
new file mode 100644
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a) /* Fill a[0] .. a[N-1] with the complex constant 1.0 + 2.0i in _Float16 precision.  */
+{
+ for (int i = 0; i != N; i++)
+ a[i] = 1.0f16 + 2.0f16i; /* The loop body is a single complex-constant store.  */
+}
+
+static void
+do_test (void) /* Run foo_ph on a zeroed buffer; abort unless every element reads back as 1.0 + 2.0i.  */
+{
+ _Complex _Float16 ph_src = 1.0f16 + 2.0f16i; /* Expected value of every element.  */
+ _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16)); /* 2 * N scalars, i.e. room for N complex elements.  */
+
+ __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16)); /* Zero the destination before running the kernel.  */
+
+ foo_ph (ph_dst);
+ for (int i = 0; i != N; i++) /* Check all N elements.  */
+ {
+ if (ph_dst[i] != ph_src)
+ __builtin_abort ();
+ }
+}
new file mode 100644
@@ -0,0 +1,89 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+
+typedef struct { _Complex double c; double a1; double a2;}
+ cdf; /* One complex double followed by two doubles.  */
+typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
+ cdf2; /* One complex double followed by four doubles.  */
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
+ cdf3; /* Two complex doubles followed by four doubles.  */
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
+ cdf4; /* Two complex doubles followed by two doubles.  */
+
+#define N 100 /* Trip count of every loop in this file.  */
+/* VMAT_ELEMENTWISE.  Copy members c, a1 and a2 of b[i] to a[i] for i in [0, N); the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo (cdf* a, cdf* __restrict b)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ a[i].c = b[i].c; /* Complex-typed member copy.  */
+ a[i].a1 = b[i].a1;
+ a[i].a2 = b[i].a2;
+ }
+}
+
+/* VMAT_CONTIGUOUS_PERMUTE.  Copy members c and a1 .. a4 of b[i] to a[i] for i in [0, N); the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo1 (cdf2* a, cdf2* __restrict b)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ a[i].c = b[i].c; /* Complex-typed member copy.  */
+ a[i].a1 = b[i].a1;
+ a[i].a2 = b[i].a2;
+ a[i].a3 = b[i].a3;
+ a[i].a4 = b[i].a4;
+ }
+}
+
+/* VMAT_CONTIGUOUS.  Copy members c1, c2 and a1 .. a4 of b[i] to a[i] for i in [0, N); the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo2 (cdf3* a, cdf3* __restrict b)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ a[i].c1 = b[i].c1; /* Complex-typed member copies.  */
+ a[i].c2 = b[i].c2;
+ a[i].a1 = b[i].a1;
+ a[i].a2 = b[i].a2;
+ a[i].a3 = b[i].a3;
+ a[i].a4 = b[i].a4;
+ }
+}
+
+/* VMAT_STRIDED_SLP.  Copy members c1, c2, a1 and a2 of b[i] to a[i] for i in [0, N); the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo3 (cdf4* a, cdf4* __restrict b)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ a[i].c1 = b[i].c1; /* Complex-typed member copies.  */
+ a[i].c2 = b[i].c2;
+ a[i].a1 = b[i].a1;
+ a[i].a2 = b[i].a2;
+ }
+}
+
+/* VMAT_CONTIGUOUS_REVERSE.  Store b reversed into a: a[i] = b[N-i-1] for i in [0, N); the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo4 (_Complex double* a, _Complex double* __restrict b)
+{
+ for (int i = 0; i != N; i++)
+ a[i] = b[N-i-1]; /* Ascending store, descending load.  */
+}
+
+/* VMAT_CONTIGUOUS_DOWN.  Store b[0] into every element of a, walking a from a[N-1] down to a[0]; the label names the vectorizer access kind this loop is intended to exercise.  */
+void
+__attribute__((noipa))
+foo5 (_Complex double* a, _Complex double* __restrict b)
+{
+ for (int i = 0; i != N; i++)
+ a[N-i-1] = b[0]; /* Descending store of a loop-invariant load.  */
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <string.h>
+#include "sse2-check.h"
+#include "pr106010-9a.c"
+
+static void
+sse2_test (void) /* Run foo .. foo5 on byte-patterned inputs; abort if any result differs from its expected byte image.  */
+{
+ _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 input; later overwritten with foo4's expected output.  */
+ _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 output.  */
+ _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 input.  */
+ _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 output.  */
+ cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+ cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+ cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+ cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+
+ char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte image, sized for the cdf3 array (the struct with the most members).  */
+
+ __builtin_memset (cdf_dst, 0, N * sizeof (cdf)); /* Zero every destination before running the kernels.  */
+ __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+ __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+ __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+ __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+ __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+ for (int i = 0; i != N * sizeof (cdf3); i++)
+ p_init[i] = i; /* First fill: byte i holds i converted to char.  */
+
+ memcpy (cdf_src, p_init, N * sizeof (cdf)); /* Each source array starts as a prefix of that byte image.  */
+ memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+ memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+ memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+ memcpy (pd_src, p_init, N * sizeof (_Complex double));
+ for (int i = 0; i != 2 * N * sizeof (double); i++)
+ p_init[i] = i % 16; /* Second fill: the pattern repeats every 16 bytes.  */
+ memcpy (pd_src2, p_init, N * sizeof (_Complex double)); /* All elements are identical if sizeof (_Complex double) is 16, as the N * 16 copy below assumes.  */
+
+ foo (cdf_dst, cdf_src);
+ foo1 (cdf2_dst, cdf2_src);
+ foo2 (cdf3_dst, cdf3_src);
+ foo3 (cdf4_dst, cdf4_src);
+ foo4 (pd_dst, pd_src);
+ foo5 (pd_dst2, pd_src2);
+ for (int i = 0; i != N; i++) /* Build foo4's expected output: the first-fill byte image with its 16-byte chunks in reverse order.  */
+ {
+ p_init[(N - i - 1) * 16] = i * 16; /* Chunk N-i-1 receives the byte values the first fill gave chunk i.  */
+ p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+ p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+ p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+ p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+ p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+ p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+ p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+ p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+ p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+ p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+ p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+ p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+ p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+ p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+ p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+ }
+ memcpy (pd_src, p_init, N * 16); /* pd_src now holds the expected foo4 result.  */
+
+ if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* foo4: a[i] = b[N-i-1].  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0) /* foo5 stores b[0] into every element.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0) /* The struct kernels copy each member, so dst is compared with src bytewise.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+ __builtin_abort ();
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target avx2 } */
+
+#include <string.h>
+#include "avx2-check.h"
+#include "pr106010-9a.c"
+
+static void
+avx2_test (void) /* Run foo .. foo5 on byte-patterned inputs; abort if any result differs from its expected byte image.  */
+{
+ _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 input; later overwritten with foo4's expected output.  */
+ _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 output.  */
+ _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 input.  */
+ _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 output.  */
+ cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+ cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+ cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+ cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+
+ char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte image, sized for the cdf3 array (the struct with the most members).  */
+
+ __builtin_memset (cdf_dst, 0, N * sizeof (cdf)); /* Zero every destination before running the kernels.  */
+ __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+ __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+ __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+ __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+ __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+ for (int i = 0; i != N * sizeof (cdf3); i++)
+ p_init[i] = i; /* First fill: byte i holds i converted to char.  */
+
+ memcpy (cdf_src, p_init, N * sizeof (cdf)); /* Each source array starts as a prefix of that byte image.  */
+ memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+ memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+ memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+ memcpy (pd_src, p_init, N * sizeof (_Complex double));
+ for (int i = 0; i != 2 * N * sizeof (double); i++)
+ p_init[i] = i % 16; /* Second fill: the pattern repeats every 16 bytes.  */
+ memcpy (pd_src2, p_init, N * sizeof (_Complex double)); /* All elements are identical if sizeof (_Complex double) is 16, as the N * 16 copy below assumes.  */
+
+ foo (cdf_dst, cdf_src);
+ foo1 (cdf2_dst, cdf2_src);
+ foo2 (cdf3_dst, cdf3_src);
+ foo3 (cdf4_dst, cdf4_src);
+ foo4 (pd_dst, pd_src);
+ foo5 (pd_dst2, pd_src2);
+ for (int i = 0; i != N; i++) /* Build foo4's expected output: the first-fill byte image with its 16-byte chunks in reverse order.  */
+ {
+ p_init[(N - i - 1) * 16] = i * 16; /* Chunk N-i-1 receives the byte values the first fill gave chunk i.  */
+ p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+ p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+ p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+ p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+ p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+ p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+ p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+ p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+ p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+ p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+ p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+ p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+ p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+ p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+ p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+ }
+ memcpy (pd_src, p_init, N * 16); /* pd_src now holds the expected foo4 result.  */
+
+ if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* foo4: a[i] = b[N-i-1].  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0) /* foo5 stores b[0] into every element.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0) /* The struct kernels copy each member, so dst is compared with src bytewise.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+ __builtin_abort ();
+}
new file mode 100644
@@ -0,0 +1,92 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
+/* { dg-require-effective-target avx512f } */
+
+#include <string.h>
+#include <stdlib.h>
+#define AVX512F
+#include "avx512-check.h"
+#include "pr106010-9a.c"
+
+static void
+test_512 (void) /* Run foo .. foo5 on byte-patterned inputs; abort if any result differs from its expected byte image.  */
+{
+ _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 input; later overwritten with foo4's expected output.  */
+ _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo4 output.  */
+ _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 input.  */
+ _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double)); /* foo5 output.  */
+ cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+ cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+ cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+ cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+ cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+ cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+
+ char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte image, sized for the cdf3 array (the struct with the most members).  */
+
+ __builtin_memset (cdf_dst, 0, N * sizeof (cdf)); /* Zero every destination before running the kernels.  */
+ __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+ __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+ __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+ __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+ __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+ for (int i = 0; i != N * sizeof (cdf3); i++)
+ p_init[i] = i; /* First fill: byte i holds i converted to char.  */
+
+ memcpy (cdf_src, p_init, N * sizeof (cdf)); /* Each source array starts as a prefix of that byte image.  */
+ memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+ memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+ memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+ memcpy (pd_src, p_init, N * sizeof (_Complex double));
+ for (int i = 0; i != 2 * N * sizeof (double); i++)
+ p_init[i] = i % 16; /* Second fill: the pattern repeats every 16 bytes.  */
+ memcpy (pd_src2, p_init, N * sizeof (_Complex double)); /* All elements are identical if sizeof (_Complex double) is 16, as the N * 16 copy below assumes.  */
+
+ foo (cdf_dst, cdf_src);
+ foo1 (cdf2_dst, cdf2_src);
+ foo2 (cdf3_dst, cdf3_src);
+ foo3 (cdf4_dst, cdf4_src);
+ foo4 (pd_dst, pd_src);
+ foo5 (pd_dst2, pd_src2);
+ for (int i = 0; i != N; i++) /* Build foo4's expected output: the first-fill byte image with its 16-byte chunks in reverse order.  */
+ {
+ p_init[(N - i - 1) * 16] = i * 16; /* Chunk N-i-1 receives the byte values the first fill gave chunk i.  */
+ p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+ p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+ p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+ p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+ p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+ p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+ p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+ p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+ p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+ p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+ p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+ p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+ p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+ p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+ p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+ }
+ memcpy (pd_src, p_init, N * 16); /* pd_src now holds the expected foo4 result.  */
+
+ if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* foo4: a[i] = b[N-i-1].  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0) /* foo5 stores b[0] into every element.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0) /* The struct kernels copy each member, so dst is compared with src bytewise.  */
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+ __builtin_abort ();
+
+ if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+ __builtin_abort ();
+}
@@ -297,6 +297,11 @@ init_dont_simulate_again (void)
break;
default:
+ /* When expand_complex_move would trigger make sure we
+ perform lowering even when there is no actual complex
+ operation. This helps consistency and vectorization. */
+ if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
+ saw_a_complex_op = true;
break;
}
@@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
update_complex_assignment (gsi, r, i);
}
}
- else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
+ else if (rhs
+ && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
+ && !TREE_SIDE_EFFECTS (lhs))
{
tree x;
gimple *t;