Lower complex type move to enable vectorization for complex type load&store.

Message ID 20220720024559.40337-1-hongtao.liu@intel.com
State New, archived
Headers
Series Lower complex type move to enable vectorization for complex type load&store. |

Commit Message

Li, Pan2 via Gcc-patches July 20, 2022, 2:45 a.m. UTC
> My original comments still stand (it feels like this should be more generic).
> Can we go the way of lowering complex loads/stores first?  A large part
> of the testcases added by the patch should pass after that.

This is the patch as suggested. One additional change is handling COMPLEX_CST
for the rhs, which enables vectorization for pr106010-8a.c.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

2022-07-20  Richard Biener  <richard.guenther@gmail.com>
	    Hongtao Liu  <hongtao.liu@intel.com>

gcc/ChangeLog:

	PR tree-optimization/106010
	* tree-complex.cc (init_dont_simulate_again): Lower complex
	type move.
	(expand_complex_move): Also expand COMPLEX_CST for rhs.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106010-1a.c: New test.
	* gcc.target/i386/pr106010-1b.c: New test.
	* gcc.target/i386/pr106010-1c.c: New test.
	* gcc.target/i386/pr106010-2a.c: New test.
	* gcc.target/i386/pr106010-2b.c: New test.
	* gcc.target/i386/pr106010-2c.c: New test.
	* gcc.target/i386/pr106010-3a.c: New test.
	* gcc.target/i386/pr106010-3b.c: New test.
	* gcc.target/i386/pr106010-3c.c: New test.
	* gcc.target/i386/pr106010-4a.c: New test.
	* gcc.target/i386/pr106010-4b.c: New test.
	* gcc.target/i386/pr106010-4c.c: New test.
	* gcc.target/i386/pr106010-5a.c: New test.
	* gcc.target/i386/pr106010-5b.c: New test.
	* gcc.target/i386/pr106010-5c.c: New test.
	* gcc.target/i386/pr106010-6a.c: New test.
	* gcc.target/i386/pr106010-6b.c: New test.
	* gcc.target/i386/pr106010-6c.c: New test.
	* gcc.target/i386/pr106010-7a.c: New test.
	* gcc.target/i386/pr106010-7b.c: New test.
	* gcc.target/i386/pr106010-7c.c: New test.
	* gcc.target/i386/pr106010-8a.c: New test.
	* gcc.target/i386/pr106010-8b.c: New test.
	* gcc.target/i386/pr106010-8c.c: New test.
	* gcc.target/i386/pr106010-9a.c: New test.
	* gcc.target/i386/pr106010-9b.c: New test.
	* gcc.target/i386/pr106010-9c.c: New test.
	* gcc.target/i386/pr106010-9d.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
 gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
 gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 +++++++++
 gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
 gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 ++++++++
 gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 +++++++
 gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
 gcc/testsuite/gcc.target/i386/pr106010-9a.c |  89 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9b.c |  90 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9c.c |  90 +++++++++++
 gcc/testsuite/gcc.target/i386/pr106010-9d.c |  92 ++++++++++++
 gcc/tree-complex.cc                         |   9 +-
 29 files changed, 2141 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
  

Comments

Li, Pan2 via Gcc-patches July 20, 2022, 8 a.m. UTC | #1
On Wed, Jul 20, 2022 at 4:46 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> > My original comments still stand (it feels like this should be more generic).
> > Can we go the way lowering complex loads/stores first?  A large part
> > of the testcases
> > added by the patch should pass after that.
>
> This is the patch as suggested, one additional change is handling COMPLEX_CST
> for rhs. And it will enable vectorization for pr106010-8a.c.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

OK.

Are there any cases left that your vectorizer patch handles but this one does not?

Thanks,
Richard.

> 2022-07-20  Richard Biener  <richard.guenther@gmail.com>
>             Hongtao Liu  <hongtao.liu@intel.com>
>
> gcc/ChangeLog:
>
>         PR tree-optimization/106010
>         * tree-complex.cc (init_dont_simulate_again): Lower complex
>         type move.
>         (expand_complex_move): Also expand COMPLEX_CST for rhs.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106010-1a.c: New test.
>         * gcc.target/i386/pr106010-1b.c: New test.
>         * gcc.target/i386/pr106010-1c.c: New test.
>         * gcc.target/i386/pr106010-2a.c: New test.
>         * gcc.target/i386/pr106010-2b.c: New test.
>         * gcc.target/i386/pr106010-2c.c: New test.
>         * gcc.target/i386/pr106010-3a.c: New test.
>         * gcc.target/i386/pr106010-3b.c: New test.
>         * gcc.target/i386/pr106010-3c.c: New test.
>         * gcc.target/i386/pr106010-4a.c: New test.
>         * gcc.target/i386/pr106010-4b.c: New test.
>         * gcc.target/i386/pr106010-4c.c: New test.
>         * gcc.target/i386/pr106010-5a.c: New test.
>         * gcc.target/i386/pr106010-5b.c: New test.
>         * gcc.target/i386/pr106010-5c.c: New test.
>         * gcc.target/i386/pr106010-6a.c: New test.
>         * gcc.target/i386/pr106010-6b.c: New test.
>         * gcc.target/i386/pr106010-6c.c: New test.
>         * gcc.target/i386/pr106010-7a.c: New test.
>         * gcc.target/i386/pr106010-7b.c: New test.
>         * gcc.target/i386/pr106010-7c.c: New test.
>         * gcc.target/i386/pr106010-8a.c: New test.
>         * gcc.target/i386/pr106010-8b.c: New test.
>         * gcc.target/i386/pr106010-8c.c: New test.
>         * gcc.target/i386/pr106010-9a.c: New test.
>         * gcc.target/i386/pr106010-9b.c: New test.
>         * gcc.target/i386/pr106010-9c.c: New test.
>         * gcc.target/i386/pr106010-9d.c: New test.
> ---
>  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
>  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 +++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
>  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 ++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 +++++++
>  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
>  gcc/testsuite/gcc.target/i386/pr106010-9a.c |  89 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-9b.c |  90 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-9c.c |  90 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr106010-9d.c |  92 ++++++++++++
>  gcc/tree-complex.cc                         |   9 +-
>  29 files changed, 2141 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> new file mode 100644
> index 00000000000..b608f484934
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> new file mode 100644
> index 00000000000..0f377c3a548
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-1a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i;
> +
> +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> new file mode 100644
> index 00000000000..f07e9fb2d3d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[i];
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> +    p_init[i] = i;
> +
> +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> new file mode 100644
> index 00000000000..d2e2f8d4f43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> @@ -0,0 +1,82 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +  a[8] = b[8];
> +  a[9] = b[9];
> +  a[10] = b[10];
> +  a[11] = b[11];
> +  a[12] = b[12];
> +  a[13] = b[13];
> +  a[14] = b[14];
> +  a[15] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> new file mode 100644
> index 00000000000..ac360752693
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-2a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +
> +  __builtin_memset (pd_dst, 0, 32);
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> new file mode 100644
> index 00000000000..a002f209ec9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[0];
> +  a[1] = b[1];
> +  a[2] = b[2];
> +  a[3] = b[3];
> +  a[4] = b[4];
> +  a[5] = b[5];
> +  a[6] = b[6];
> +  a[7] = b[7];
> +}
> +
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +
> +   __builtin_memset (ph_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (ph_src, p, 32);
> +
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> new file mode 100644
> index 00000000000..c1b64b56b1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> @@ -0,0 +1,80 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> +
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +  a[2] = b[3];
> +  a[3] = b[2];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +  a[8] = b[15];
> +  a[9] = b[14];
> +  a[10] = b[13];
> +  a[11] = b[12];
> +  a[12] = b[11];
> +  a[13] = b[10];
> +  a[14] = b[9];
> +  a[15] = b[8];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> new file mode 100644
> index 00000000000..e4fa3f3a541
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> @@ -0,0 +1,126 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-3a.c"
> +
> +void
> +avx2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +  char* q = (char* ) malloc (32);
> +
> +  __builtin_memset (pd_dst, 0, 32);
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +
> +  for (int i = 0; i != 16; i++)
> +    {
> +      p[i] = i + 16;
> +      p[i + 16] = i;
> +    }
> +  __builtin_memcpy (pd_exp, p, 32);
> +  __builtin_memcpy (epi64_exp, p, 32);
> +
> +  for (int i = 0; i != 8; i++)
> +    {
> +      p[i] = i + 8;
> +      p[i + 8] = i;
> +      p[i + 16] = i + 24;
> +      p[i + 24] = i + 16;
> +      q[i] = i + 24;
> +      q[i + 8] = i + 16;
> +      q[i + 16] = i + 8;
> +      q[i + 24] = i;
> +    }
> +  __builtin_memcpy (ps_exp, p, 32);
> +  __builtin_memcpy (epi32_exp, q, 32);
> +
> +
> +  for (int i = 0; i != 4; i++)
> +    {
> +      q[i] = i + 28;
> +      q[i + 4] = i + 24;
> +      q[i + 8] = i + 20;
> +      q[i + 12] = i + 16;
> +      q[i + 16] = i + 12;
> +      q[i + 20] = i + 8;
> +      q[i + 24] = i + 4;
> +      q[i + 28] = i;
> +    }
> +  __builtin_memcpy (epi16_exp, q, 32);
> +
> +  for (int i = 0; i != 2; i++)
> +    {
> +      q[i] = i + 14;
> +      q[i + 2] = i + 12;
> +      q[i + 4] = i + 10;
> +      q[i + 6] = i + 8;
> +      q[i + 8] = i + 6;
> +      q[i + 10] = i + 4;
> +      q[i + 12] = i + 2;
> +      q[i + 14] = i;
> +      q[i + 16] = i + 30;
> +      q[i + 18] = i + 28;
> +      q[i + 20] = i + 26;
> +      q[i + 22] = i + 24;
> +      q[i + 24] = i + 22;
> +      q[i + 26] = i + 20;
> +      q[i + 28] = i + 18;
> +      q[i + 30] = i + 16;
> +    }
> +  __builtin_memcpy (epi8_exp, q, 32);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> new file mode 100644
> index 00000000000..5a5a3d4b992
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> @@ -0,0 +1,65 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +/* Store B's eight complex elements into A in the order 1, 0, 4, 3, 7, 6,
> +   2, 5; the dg-final VEC_PERM_EXPR selector is this order at scalar level.  */
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[1];
> +  a[1] = b[0];
> +  a[2] = b[4];
> +  a[3] = b[3];
> +  a[4] = b[7];
> +  a[5] = b[6];
> +  a[6] = b[2];
> +  a[7] = b[5];
> +}
> +
> +/* Runtime check: fill the source with bytes 0..31, build the expected
> +   image by moving 4-byte chunks the same way foo_ph does, then compare.  */
> +void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> +  char* p = (char* ) malloc (32);
> +
> +  __builtin_memset (ph_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i;
> +  __builtin_memcpy (ph_src, p, 32);
> +
> +  /* Byte i of source element k has value 4 * k + i; lay out elements 1, 0, 4, 3, 7, 6, 2, 5.  */
> +  for (int i = 0; i != 4; i++)
> +    {
> +      p[i] = i + 4;		/* a[0] <- b[1] */
> +      p[i + 4] = i;		/* a[1] <- b[0] */
> +      p[i + 8] = i + 16;	/* a[2] <- b[4] */
> +      p[i + 12] = i + 12;	/* a[3] <- b[3] */
> +      p[i + 16] = i + 28;	/* a[4] <- b[7] */
> +      p[i + 20] = i + 24;	/* a[5] <- b[6] */
> +      p[i + 24] = i + 8;	/* a[6] <- b[2] */
> +      p[i + 28] = i + 20;	/* a[7] <- b[5] */
> +    }
> +  __builtin_memcpy (ph_exp, p, 32);
> +
> +  foo_ph (ph_dst, ph_src);
> +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> new file mode 100644
> index 00000000000..b7b0b532bb1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> +
> +void /* Store the two by-value complex doubles to a[0] and a[1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_pd (_Complex double* a,
> +       _Complex double b1,
> +       _Complex double b2)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +}
> +
> +void /* Store the four by-value complex floats to a[0..3] in argument order.  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ps (_Complex float* a,
> +       _Complex float b1, _Complex float b2,
> +       _Complex float b3, _Complex float b4)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +}
> +
> +void /* Store the two by-value complex long longs to a[0] and a[1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi64 (_Complex long long* a,
> +          _Complex long long b1,
> +          _Complex long long b2)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +}
> +
> +void /* Store the four by-value complex ints to a[0..3] in argument order.  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi32 (_Complex int* a,
> +          _Complex int b1, _Complex int b2,
> +          _Complex int b3, _Complex int b4)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +}
> +
> +void /* Store the eight by-value complex shorts to a[0..7] in argument order.  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi16 (_Complex short* a,
> +          _Complex short b1, _Complex short b2,
> +          _Complex short b3, _Complex short b4,
> +          _Complex short b5, _Complex short b6,
> +          _Complex short b7,_Complex short b8)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +}
> +
> +void /* Store the sixteen by-value complex chars to a[0..15] in argument order.  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi8 (_Complex char* a,
> +         _Complex char b1, _Complex char b2,
> +         _Complex char b3, _Complex char b4,
> +         _Complex char b5, _Complex char b6,
> +         _Complex char b7,_Complex char b8,
> +         _Complex char b9, _Complex char b10,
> +         _Complex char b11, _Complex char b12,
> +         _Complex char b13, _Complex char b14,
> +         _Complex char b15,_Complex char b16)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +  a[8] = b9;
> +  a[9] = b10;
> +  a[10] = b11;
> +  a[11] = b12;
> +  a[12] = b13;
> +  a[13] = b14;
> +  a[14] = b15;
> +  a[15] = b16;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> new file mode 100644
> index 00000000000..e2e79508c4b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> @@ -0,0 +1,67 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-4a.c"
> +
> +void /* Run each pr106010-4a.c kernel on a 32-byte buffer and check dst == src.  */
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (32);
> +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> +  _Complex float* ps_src = (_Complex float*) malloc (32);
> +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> +  char* p = (char* ) malloc (32); /* byte pattern copied into every source buffer */
> +
> +  __builtin_memset (pd_dst, 0, 32); /* clear destinations so stale data cannot match */
> +  __builtin_memset (ps_dst, 0, 32);
> +  __builtin_memset (epi64_dst, 0, 32);
> +  __builtin_memset (epi32_dst, 0, 32);
> +  __builtin_memset (epi16_dst, 0, 32);
> +  __builtin_memset (epi8_dst, 0, 32);
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i; /* distinct value per byte: 0..31 */
> +  __builtin_memcpy (pd_src, p, 32);
> +  __builtin_memcpy (ps_src, p, 32);
> +  __builtin_memcpy (epi64_src, p, 32);
> +  __builtin_memcpy (epi32_src, p, 32);
> +  __builtin_memcpy (epi16_src, p, 32);
> +  __builtin_memcpy (epi8_src, p, 32);
> +
> +  foo_pd (pd_dst, pd_src[0], pd_src[1]); /* elements are passed by value, in order */
> +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> +
> +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0) /* in-order stores: dst must equal src */
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> new file mode 100644
> index 00000000000..8e02aefe3b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void /* Store the eight by-value complex _Float16 values to a[0..7] in argument order.  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ph (_Complex _Float16* a,
> +       _Complex _Float16 b1, _Complex _Float16 b2,
> +       _Complex _Float16 b3, _Complex _Float16 b4,
> +       _Complex _Float16 b5, _Complex _Float16 b6,
> +       _Complex _Float16 b7,_Complex _Float16 b8)
> +{
> +  a[0] = b1;
> +  a[1] = b2;
> +  a[2] = b3;
> +  a[3] = b4;
> +  a[4] = b5;
> +  a[5] = b6;
> +  a[6] = b7;
> +  a[7] = b8;
> +}
> +
> +void /* Run foo_ph on a 32-byte buffer of bytes 0..31 and check dst == src.  */
> +do_test (void)
> +{
> +
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> +
> +  char* p = (char* ) malloc (32); /* byte pattern copied into the source buffer */
> +
> +  __builtin_memset (ph_dst, 0, 32); /* clear destination so stale data cannot match */
> +
> +  for (int i = 0; i != 32; i++)
> +    p[i] = i; /* distinct value per byte: 0..31 */
> +
> +  __builtin_memcpy (ph_src, p, 32);
> +
> +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> +
> +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0) /* in-order stores: dst must equal src */
> +    __builtin_abort ();
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> new file mode 100644
> index 00000000000..9d4a6f9846b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> @@ -0,0 +1,117 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> +
> +void /* Copy b[0..3] to a with the two halves swapped: a[0..1] = b[2..3], a[2..3] = b[0..1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[2];
> +  a[1] = b[3];
> +  a[2] = b[0];
> +  a[3] = b[1];
> +}
> +
> +void /* Copy b[0..7] to a with the two halves swapped: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[4];
> +  a[1] = b[5];
> +  a[2] = b[6];
> +  a[3] = b[7];
> +  a[4] = b[0];
> +  a[5] = b[1];
> +  a[6] = b[2];
> +  a[7] = b[3];
> +}
> +
> +void /* Copy b[0..3] to a with the two halves swapped: a[0..1] = b[2..3], a[2..3] = b[0..1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[2];
> +  a[1] = b[3];
> +  a[2] = b[0];
> +  a[3] = b[1];
> +}
> +
> +void /* Copy b[0..7] to a with the two halves swapped: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[4];
> +  a[1] = b[5];
> +  a[2] = b[6];
> +  a[3] = b[7];
> +  a[4] = b[0];
> +  a[5] = b[1];
> +  a[6] = b[2];
> +  a[7] = b[3];
> +}
> +
> +void /* Copy b[0..15] to a with the two halves swapped: a[0..7] = b[8..15], a[8..15] = b[0..7].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[8];
> +  a[1] = b[9];
> +  a[2] = b[10];
> +  a[3] = b[11];
> +  a[4] = b[12];
> +  a[5] = b[13];
> +  a[6] = b[14];
> +  a[7] = b[15];
> +  a[8] = b[0];
> +  a[9] = b[1];
> +  a[10] = b[2];
> +  a[11] = b[3];
> +  a[12] = b[4];
> +  a[13] = b[5];
> +  a[14] = b[6];
> +  a[15] = b[7];
> +}
> +
> +void /* Copy b[0..31] to a with the two halves swapped: a[0..15] = b[16..31], a[16..31] = b[0..15].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[16];
> +  a[1] = b[17];
> +  a[2] = b[18];
> +  a[3] = b[19];
> +  a[4] = b[20];
> +  a[5] = b[21];
> +  a[6] = b[22];
> +  a[7] = b[23];
> +  a[8] = b[24];
> +  a[9] = b[25];
> +  a[10] = b[26];
> +  a[11] = b[27];
> +  a[12] = b[28];
> +  a[13] = b[29];
> +  a[14] = b[30];
> +  a[15] = b[31];
> +  a[16] = b[0];
> +  a[17] = b[1];
> +  a[18] = b[2];
> +  a[19] = b[3];
> +  a[20] = b[4];
> +  a[21] = b[5];
> +  a[22] = b[6];
> +  a[23] = b[7];
> +  a[24] = b[8];
> +  a[25] = b[9];
> +  a[26] = b[10];
> +  a[27] = b[11];
> +  a[28] = b[12];
> +  a[29] = b[13];
> +  a[30] = b[14];
> +  a[31] = b[15];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> new file mode 100644
> index 00000000000..d5c6ebeb5cf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-5a.c"
> +
> +void /* Run each pr106010-5a.c kernel on 64 bytes and check the two 32-byte halves were swapped.  */
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (64);
> +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> +  _Complex float* ps_src = (_Complex float*) malloc (64);
> +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> +  char* p = (char* ) malloc (64); /* source byte pattern */
> +  char* q = (char* ) malloc (64); /* expected byte pattern */
> +
> +  __builtin_memset (pd_dst, 0, 64); /* clear destinations so stale data cannot match */
> +  __builtin_memset (ps_dst, 0, 64);
> +  __builtin_memset (epi64_dst, 0, 64);
> +  __builtin_memset (epi32_dst, 0, 64);
> +  __builtin_memset (epi16_dst, 0, 64);
> +  __builtin_memset (epi8_dst, 0, 64);
> +
> +  for (int i = 0; i != 64; i++)
> +    {
> +      p[i] = i; /* distinct value per byte: 0..63 */
> +      q[i] = (i + 32) % 64; /* same bytes rotated by 32, i.e. halves swapped */
> +    }
> +  __builtin_memcpy (pd_src, p, 64);
> +  __builtin_memcpy (ps_src, p, 64);
> +  __builtin_memcpy (epi64_src, p, 64);
> +  __builtin_memcpy (epi32_src, p, 64);
> +  __builtin_memcpy (epi16_src, p, 64);
> +  __builtin_memcpy (epi8_src, p, 64);
> +
> +  __builtin_memcpy (pd_exp, q, 64); /* the same expected image is used for every type */
> +  __builtin_memcpy (ps_exp, q, 64);
> +  __builtin_memcpy (epi64_exp, q, 64);
> +  __builtin_memcpy (epi32_exp, q, 64);
> +  __builtin_memcpy (epi16_exp, q, 64);
> +  __builtin_memcpy (epi8_exp, q, 64);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +
> +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* any mismatch aborts the test */
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> new file mode 100644
> index 00000000000..9ce4e6dd5c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> @@ -0,0 +1,62 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void /* Copy b[0..15] to a with the two halves swapped: a[0..7] = b[8..15], a[8..15] = b[0..7].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[8];
> +  a[1] = b[9];
> +  a[2] = b[10];
> +  a[3] = b[11];
> +  a[4] = b[12];
> +  a[5] = b[13];
> +  a[6] = b[14];
> +  a[7] = b[15];
> +  a[8] = b[0];
> +  a[9] = b[1];
> +  a[10] = b[2];
> +  a[11] = b[3];
> +  a[12] = b[4];
> +  a[13] = b[5];
> +  a[14] = b[6];
> +  a[15] = b[7];
> +}
> +
> +void /* Run foo_ph on 64 bytes and check the two 32-byte halves were swapped.  */
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> +  char* p = (char* ) malloc (64); /* source byte pattern */
> +  char* q = (char* ) malloc (64); /* expected byte pattern */
> +
> +  __builtin_memset (ph_dst, 0, 64); /* clear destination so stale data cannot match */
> +
> +  for (int i = 0; i != 64; i++)
> +    {
> +      p[i] = i; /* distinct value per byte: 0..63 */
> +      q[i] = (i + 32) % 64; /* same bytes rotated by 32, i.e. halves swapped */
> +    }
> +  __builtin_memcpy (ph_src, p, 64);
> +
> +  __builtin_memcpy (ph_exp, q, 64);
> +
> +  foo_ph (ph_dst, ph_src);
> +
> +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0) /* any mismatch aborts the test */
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> new file mode 100644
> index 00000000000..65a90d03684
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> @@ -0,0 +1,115 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +
> +void /* Copy b[0..3] to a in reverse element order: a[k] = b[3 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_pd (_Complex double* a, _Complex double* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +
> +void /* Copy b[0..7] to a in reverse element order: a[k] = b[7 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ps (_Complex float* a, _Complex float* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +
> +void /* Copy b[0..3] to a in reverse element order: a[k] = b[3 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> +{
> +  a[0] = b[3];
> +  a[1] = b[2];
> +  a[2] = b[1];
> +  a[3] = b[0];
> +}
> +
> +void /* Copy b[0..7] to a in reverse element order: a[k] = b[7 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> +{
> +  a[0] = b[7];
> +  a[1] = b[6];
> +  a[2] = b[5];
> +  a[3] = b[4];
> +  a[4] = b[3];
> +  a[5] = b[2];
> +  a[6] = b[1];
> +  a[7] = b[0];
> +}
> +
> +void /* Copy b[0..15] to a in reverse element order: a[k] = b[15 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> +{
> +  a[0] = b[15];
> +  a[1] = b[14];
> +  a[2] = b[13];
> +  a[3] = b[12];
> +  a[4] = b[11];
> +  a[5] = b[10];
> +  a[6] = b[9];
> +  a[7] = b[8];
> +  a[8] = b[7];
> +  a[9] = b[6];
> +  a[10] = b[5];
> +  a[11] = b[4];
> +  a[12] = b[3];
> +  a[13] = b[2];
> +  a[14] = b[1];
> +  a[15] = b[0];
> +}
> +
> +void /* Copy b[0..31] to a in reverse element order: a[k] = b[31 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> +{
> +  a[0] = b[31];
> +  a[1] = b[30];
> +  a[2] = b[29];
> +  a[3] = b[28];
> +  a[4] = b[27];
> +  a[5] = b[26];
> +  a[6] = b[25];
> +  a[7] = b[24];
> +  a[8] = b[23];
> +  a[9] = b[22];
> +  a[10] = b[21];
> +  a[11] = b[20];
> +  a[12] = b[19];
> +  a[13] = b[18];
> +  a[14] = b[17];
> +  a[15] = b[16];
> +  a[16] = b[15];
> +  a[17] = b[14];
> +  a[18] = b[13];
> +  a[19] = b[12];
> +  a[20] = b[11];
> +  a[21] = b[10];
> +  a[22] = b[9];
> +  a[23] = b[8];
> +  a[24] = b[7];
> +  a[25] = b[6];
> +  a[26] = b[5];
> +  a[27] = b[4];
> +  a[28] = b[3];
> +  a[29] = b[2];
> +  a[30] = b[1];
> +  a[31] = b[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> new file mode 100644
> index 00000000000..1c5bb020939
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> @@ -0,0 +1,157 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include "avx2-check.h"
> +#include <string.h>
> +#include "pr106010-6a.c"
> +
> +void /* Run each pr106010-6a.c kernel on 64 bytes and check the elements came out in reverse order.  */
> +avx2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (64);
> +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> +  _Complex float* ps_src = (_Complex float*) malloc (64);
> +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> +  char* p = (char* ) malloc (64); /* source byte pattern */
> +  char* q = (char* ) malloc (64); /* expected byte pattern, rebuilt per element size */
> +
> +  __builtin_memset (pd_dst, 0, 64); /* clear destinations so stale data cannot match */
> +  __builtin_memset (ps_dst, 0, 64);
> +  __builtin_memset (epi64_dst, 0, 64);
> +  __builtin_memset (epi32_dst, 0, 64);
> +  __builtin_memset (epi16_dst, 0, 64);
> +  __builtin_memset (epi8_dst, 0, 64);
> +
> +  for (int i = 0; i != 64; i++)
> +    p[i] = i; /* distinct value per byte: 0..63 */
> +
> +  __builtin_memcpy (pd_src, p, 64);
> +  __builtin_memcpy (ps_src, p, 64);
> +  __builtin_memcpy (epi64_src, p, 64);
> +  __builtin_memcpy (epi32_src, p, 64);
> +  __builtin_memcpy (epi16_src, p, 64);
> +  __builtin_memcpy (epi8_src, p, 64);
> +
> +
> +  for (int i = 0; i != 16; i++) /* four 16-byte chunks in reverse order */
> +    {
> +      q[i] = i + 48;
> +      q[i + 16] = i + 32;
> +      q[i + 32] = i + 16;
> +      q[i + 48] = i;
> +    }
> +
> +  __builtin_memcpy (pd_exp, q, 64);
> +  __builtin_memcpy (epi64_exp, q, 64);
> +
> +   for (int i = 0; i != 8; i++) /* eight 8-byte chunks in reverse order */
> +    {
> +      q[i] = i + 56;
> +      q[i + 8] = i + 48;
> +      q[i + 16] = i + 40;
> +      q[i + 24] = i + 32;
> +      q[i + 32] = i + 24;
> +      q[i + 40] = i + 16;
> +      q[i + 48] = i + 8;
> +      q[i + 56] = i;
> +    }
> +
> +  __builtin_memcpy (ps_exp, q, 64);
> +  __builtin_memcpy (epi32_exp, q, 64);
> +
> +  for (int i = 0; i != 4; i++) /* sixteen 4-byte chunks in reverse order */
> +    {
> +      q[i] = i + 60;
> +      q[i + 4] = i + 56;
> +      q[i + 8] = i + 52;
> +      q[i + 12] = i + 48;
> +      q[i + 16] = i + 44;
> +      q[i + 20] = i + 40;
> +      q[i + 24] = i + 36;
> +      q[i + 28] = i + 32;
> +      q[i + 32] = i + 28;
> +      q[i + 36] = i + 24;
> +      q[i + 40] = i + 20;
> +      q[i + 44] = i + 16;
> +      q[i + 48] = i + 12;
> +      q[i + 52] = i + 8;
> +      q[i + 56] = i + 4;
> +      q[i + 60] = i;
> +    }
> +
> +  __builtin_memcpy (epi16_exp, q, 64);
> +
> +  for (int i = 0; i != 2; i++) /* thirty-two 2-byte chunks in reverse order */
> +    {
> +      q[i] = i + 62;
> +      q[i + 2] = i + 60;
> +      q[i + 4] = i + 58;
> +      q[i + 6] = i + 56;
> +      q[i + 8] = i + 54;
> +      q[i + 10] = i + 52;
> +      q[i + 12] = i + 50;
> +      q[i + 14] = i + 48;
> +      q[i + 16] = i + 46;
> +      q[i + 18] = i + 44;
> +      q[i + 20] = i + 42;
> +      q[i + 22] = i + 40;
> +      q[i + 24] = i + 38;
> +      q[i + 26] = i + 36;
> +      q[i + 28] = i + 34;
> +      q[i + 30] = i + 32;
> +      q[i + 32] = i + 30;
> +      q[i + 34] = i + 28;
> +      q[i + 36] = i + 26;
> +      q[i + 38] = i + 24;
> +      q[i + 40] = i + 22;
> +      q[i + 42] = i + 20;
> +      q[i + 44] = i + 18;
> +      q[i + 46] = i + 16;
> +      q[i + 48] = i + 14;
> +      q[i + 50] = i + 12;
> +      q[i + 52] = i + 10;
> +      q[i + 54] = i + 8;
> +      q[i + 56] = i + 6;
> +      q[i + 58] = i + 4;
> +      q[i + 60] = i + 2;
> +      q[i + 62] = i;
> +    }
> +  __builtin_memcpy (epi8_exp, q, 64);
> +
> +  foo_pd (pd_dst, pd_src);
> +  foo_ps (ps_dst, ps_src);
> +  foo_epi64 (epi64_dst, epi64_src);
> +  foo_epi32 (epi32_dst, epi32_src);
> +  foo_epi16 (epi16_dst, epi16_src);
> +  foo_epi8 (epi8_dst, epi8_src);
> +
> +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* any mismatch aborts the test */
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> new file mode 100644
> index 00000000000..b859d884a7f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> @@ -0,0 +1,80 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +void /* Copy b[0..15] to a in reverse element order: a[k] = b[15 - k].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> +{
> +  a[0] = b[15];
> +  a[1] = b[14];
> +  a[2] = b[13];
> +  a[3] = b[12];
> +  a[4] = b[11];
> +  a[5] = b[10];
> +  a[6] = b[9];
> +  a[7] = b[8];
> +  a[8] = b[7];
> +  a[9] = b[6];
> +  a[10] = b[5];
> +  a[11] = b[4];
> +  a[12] = b[3];
> +  a[13] = b[2];
> +  a[14] = b[1];
> +  a[15] = b[0];
> +}
> +
> +void /* Run foo_ph on 64 bytes and check the 4-byte chunks came out in reverse order.  */
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> +  char* p = (char* ) malloc (64); /* source byte pattern */
> +  char* q = (char* ) malloc (64); /* expected byte pattern */
> +
> +  __builtin_memset (ph_dst, 0, 64); /* clear destination so stale data cannot match */
> +
> +  for (int i = 0; i != 64; i++)
> +    p[i] = i; /* distinct value per byte: 0..63 */
> +
> +  __builtin_memcpy (ph_src, p, 64);
> +
> +  for (int i = 0; i != 4; i++) /* sixteen 4-byte chunks in reverse order */
> +    {
> +      q[i] = i + 60;
> +      q[i + 4] = i + 56;
> +      q[i + 8] = i + 52;
> +      q[i + 12] = i + 48;
> +      q[i + 16] = i + 44;
> +      q[i + 20] = i + 40;
> +      q[i + 24] = i + 36;
> +      q[i + 28] = i + 32;
> +      q[i + 32] = i + 28;
> +      q[i + 36] = i + 24;
> +      q[i + 40] = i + 20;
> +      q[i + 44] = i + 16;
> +      q[i + 48] = i + 12;
> +      q[i + 52] = i + 8;
> +      q[i + 56] = i + 4;
> +      q[i + 60] = i;
> +    }
> +
> +  __builtin_memcpy (ph_exp, q, 64);
> +
> +  foo_ph (ph_dst, ph_src);
> +
> +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0) /* any mismatch aborts the test */
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> new file mode 100644
> index 00000000000..2ea01fac927
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void /* Store the complex double b into every element a[0..N-1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_pd (_Complex double* a, _Complex double b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void /* Store the complex float b into every element a[0..N-1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_ps (_Complex float* a, _Complex float b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void /* Store the complex long long b into every element a[0..N-1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi64 (_Complex long long* a, _Complex long long b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void /* Store the complex int b into every element a[0..N-1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi32 (_Complex int* a, _Complex int b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void /* Store the complex short b into every element a[0..N-1].  */
> +__attribute__((noipa)) /* noipa: no interprocedural optimization with callers */
> +foo_epi16 (_Complex short* a, _Complex short b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a, _Complex char b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> new file mode 100644
> index 00000000000..26482cc10f5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-7a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i % 2 + 3;
> +
> +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> +  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
> +  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
> +  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
> +  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
> +  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst, pd_src[0]);
> +  foo_ps (ps_dst, ps_src[0]);
> +  foo_epi64 (epi64_dst, epi64_src[0]);
> +  foo_epi32 (epi32_dst, epi32_src[0]);
> +  foo_epi16 (epi16_dst, epi16_src[0]);
> +  foo_epi8 (epi8_dst, epi8_src[0]);
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> +    __builtin_abort ();
> +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> +    __builtin_abort ();
> +
> +  return;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> new file mode 100644
> index 00000000000..7f4056a5ecc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> @@ -0,0 +1,41 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b;
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> +    p_init[i] = i % 2 + 3;
> +
> +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst, ph_src[0]);
> +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> new file mode 100644
> index 00000000000..11054b60d30
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> +
> +#define N 10000
> +void
> +__attribute__((noipa))
> +foo_pd (_Complex double* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0 + 2.0i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_ps (_Complex float* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0f + 2.0fi;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi64 (_Complex long long* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi32 (_Complex int* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi16 (_Complex short* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> +
> +void
> +__attribute__((noipa))
> +foo_epi8 (_Complex char* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1 + 2i;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> new file mode 100644
> index 00000000000..6bb0073b691
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> @@ -0,0 +1,53 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +#include <string.h>
> +#include "pr106010-8a.c"
> +
> +void
> +avx_test (void)
> +{
> +  _Complex double pd_src = 1.0 + 2.0i;
> +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> +  _Complex float ps_src = 1.0 + 2.0i;
> +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> +  _Complex long long epi64_src = 1 + 2i;;
> +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> +  _Complex int epi32_src = 1 + 2i;
> +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> +  _Complex short epi16_src = 1 + 2i;
> +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> +  _Complex char epi8_src = 1 + 2i;
> +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> +
> +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> +
> +  foo_pd (pd_dst);
> +  foo_ps (ps_dst);
> +  foo_epi64 (epi64_dst);
> +  foo_epi32 (epi32_dst);
> +  foo_epi16 (epi16_dst);
> +  foo_epi8 (epi8_dst);
> +  for (int i = 0 ; i != N; i++)
> +    {
> +      if (pd_dst[i] != pd_src)
> +       __builtin_abort ();
> +      if (ps_dst[i] != ps_src)
> +       __builtin_abort ();
> +      if (epi64_dst[i] != epi64_src)
> +       __builtin_abort ();
> +      if (epi32_dst[i] != epi32_src)
> +       __builtin_abort ();
> +      if (epi16_dst[i] != epi16_src)
> +       __builtin_abort ();
> +      if (epi8_dst[i] != epi8_src)
> +       __builtin_abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> new file mode 100644
> index 00000000000..61ae131829d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> @@ -0,0 +1,38 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#include <string.h>
> +
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512FP16
> +#include "avx512-check.h"
> +
> +#define N 10000
> +
> +void
> +__attribute__((noipa))
> +foo_ph (_Complex _Float16* a)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = 1.0f16 + 2.0f16i;
> +}
> +
> +static void
> +do_test (void)
> +{
> +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> +
> +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> +
> +  foo_ph (ph_dst);
> +  for (int i = 0; i != N; i++)
> +    {
> +      if (ph_dst[i] != ph_src)
> +       __builtin_abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> new file mode 100644
> index 00000000000..e922f7b5400
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> @@ -0,0 +1,89 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> +
> +typedef struct { _Complex double c; double a1; double a2;}
> +  cdf;
> +typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
> +  cdf2;
> +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
> +  cdf3;
> +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
> +  cdf4;
> +
> +#define N 100
> +/* VMAT_ELEMENTWISE.  */
> +void
> +__attribute__((noipa))
> +foo (cdf* a, cdf* __restrict b)
> +{
> +   for (int i = 0; i < N; ++i)
> +    {
> +      a[i].c = b[i].c;
> +      a[i].a1 = b[i].a1;
> +      a[i].a2 = b[i].a2;
> +    }
> +}
> +
> +/* VMAT_CONTIGUOUS_PERMUTE.  */
> +void
> +__attribute__((noipa))
> +foo1 (cdf2* a, cdf2* __restrict b)
> +{
> +   for (int i = 0; i < N; ++i)
> +    {
> +      a[i].c = b[i].c;
> +      a[i].a1 = b[i].a1;
> +      a[i].a2 = b[i].a2;
> +      a[i].a3 = b[i].a3;
> +      a[i].a4 = b[i].a4;
> +    }
> +}
> +
> +/* VMAT_CONTIGUOUS.  */
> +void
> +__attribute__((noipa))
> +foo2 (cdf3* a, cdf3* __restrict b)
> +{
> +   for (int i = 0; i < N; ++i)
> +    {
> +      a[i].c1 = b[i].c1;
> +      a[i].c2 = b[i].c2;
> +      a[i].a1 = b[i].a1;
> +      a[i].a2 = b[i].a2;
> +      a[i].a3 = b[i].a3;
> +      a[i].a4 = b[i].a4;
> +    }
> +}
> +
> +/* VMAT_STRIDED_SLP.  */
> +void
> +__attribute__((noipa))
> +foo3 (cdf4* a, cdf4* __restrict b)
> +{
> +   for (int i = 0; i < N; ++i)
> +    {
> +      a[i].c1 = b[i].c1;
> +      a[i].c2 = b[i].c2;
> +      a[i].a1 = b[i].a1;
> +      a[i].a2 = b[i].a2;
> +    }
> +}
> +
> +/* VMAT_CONTIGUOUS_REVERSE.  */
> +void
> +__attribute__((noipa))
> +foo4 (_Complex double* a, _Complex double* __restrict b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[i] = b[N-i-1];
> +}
> +
> +/* VMAT_CONTIGUOUS_DOWN.  */
> +void
> +__attribute__((noipa))
> +foo5 (_Complex double* a, _Complex double* __restrict b)
> +{
> +  for (int i = 0; i != N; i++)
> +    a[N-i-1] = b[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> new file mode 100644
> index 00000000000..e220445e6e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> @@ -0,0 +1,90 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include <string.h>
> +#include "sse2-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +sse2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> +  for (int i = 0; i != N * sizeof (cdf3); i++)
> +    p_init[i] = i;
> +
> +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i % 16;
> +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> +  foo (cdf_dst, cdf_src);
> +  foo1 (cdf2_dst, cdf2_src);
> +  foo2 (cdf3_dst, cdf3_src);
> +  foo3 (cdf4_dst, cdf4_src);
> +  foo4 (pd_dst, pd_src);
> +  foo5 (pd_dst2, pd_src2);
> +  for (int i = 0; i != N; i++)
> +    {
> +      p_init[(N - i - 1) * 16] = i * 16;
> +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> +    }
> +  memcpy (pd_src, p_init, N * 16);
> +
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> new file mode 100644
> index 00000000000..ff51f6195b7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> @@ -0,0 +1,90 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
> +/* { dg-require-effective-target avx2 } */
> +
> +#include <string.h>
> +#include "avx2-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +avx2_test (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> +  for (int i = 0; i != N * sizeof (cdf3); i++)
> +    p_init[i] = i;
> +
> +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i % 16;
> +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> +  foo (cdf_dst, cdf_src);
> +  foo1 (cdf2_dst, cdf2_src);
> +  foo2 (cdf3_dst, cdf3_src);
> +  foo3 (cdf4_dst, cdf4_src);
> +  foo4 (pd_dst, pd_src);
> +  foo5 (pd_dst2, pd_src2);
> +  for (int i = 0; i != N; i++)
> +    {
> +      p_init[(N - i - 1) * 16] = i * 16;
> +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> +    }
> +  memcpy (pd_src, p_init, N * 16);
> +
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> new file mode 100644
> index 00000000000..d4d8f1dd722
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> @@ -0,0 +1,92 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#include <string.h>
> +#include <stdlib.h>
> +#define AVX512F
> +#include "avx512-check.h"
> +#include "pr106010-9a.c"
> +
> +static void
> +test_512 (void)
> +{
> +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> +
> +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> +
> +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> +
> +  for (int i = 0; i != N * sizeof (cdf3); i++)
> +    p_init[i] = i;
> +
> +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> +    p_init[i] = i % 16;
> +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> +
> +  foo (cdf_dst, cdf_src);
> +  foo1 (cdf2_dst, cdf2_src);
> +  foo2 (cdf3_dst, cdf3_src);
> +  foo3 (cdf4_dst, cdf4_src);
> +  foo4 (pd_dst, pd_src);
> +  foo5 (pd_dst2, pd_src2);
> +  for (int i = 0; i != N; i++)
> +    {
> +      p_init[(N - i - 1) * 16] = i * 16;
> +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> +    }
> +  memcpy (pd_src, p_init, N * 16);
> +
> +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> +    __builtin_abort ();
> +
> +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> +    __builtin_abort ();
> +}
> diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> index 61950a0f099..ea9df6114a1 100644
> --- a/gcc/tree-complex.cc
> +++ b/gcc/tree-complex.cc
> @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
>                 break;
>
>               default:
> +               /* When expand_complex_move would trigger, make sure we
> +                  perform lowering even when there is no actual complex
> +                  operation.  This helps consistency and vectorization.  */
> +               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> +                 saw_a_complex_op = true;
>                 break;
>               }
>
> @@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
>           update_complex_assignment (gsi, r, i);
>         }
>      }
> -  else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
> +  else if (rhs
> +          && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
> +          && !TREE_SIDE_EFFECTS (lhs))
>      {
>        tree x;
>        gimple *t;
> --
> 2.18.1
>
  
Li, Pan2 via Gcc-patches July 20, 2022, 8:02 a.m. UTC | #2
On Wed, Jul 20, 2022 at 4:00 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Wed, Jul 20, 2022 at 4:46 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > > My original comments still stand (it feels like this should be more generic).
> > > Can we go the way lowering complex loads/stores first?  A large part
> > > of the testcases
> > > added by the patch should pass after that.
> >
> > This is the patch as suggested, one additional change is handling COMPLEX_CST
> > for rhs. And it will enable vectorization for pr106010-8a.c.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> OK.
>
> Are there any cases left that your vectorizer patch handles but this one does not?
No.
>
> Thanks,
> Richard.
>
> > 2022-07-20  Richard Biener  <richard.guenther@gmail.com>
> >             Hongtao Liu  <hongtao.liu@intel.com>
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/106010
> >         * tree-complex.cc (init_dont_simulate_again): Lower complex
> >         type move.
> >         (expand_complex_move): Also expand COMPLEX_CST for rhs.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr106010-1a.c: New test.
> >         * gcc.target/i386/pr106010-1b.c: New test.
> >         * gcc.target/i386/pr106010-1c.c: New test.
> >         * gcc.target/i386/pr106010-2a.c: New test.
> >         * gcc.target/i386/pr106010-2b.c: New test.
> >         * gcc.target/i386/pr106010-2c.c: New test.
> >         * gcc.target/i386/pr106010-3a.c: New test.
> >         * gcc.target/i386/pr106010-3b.c: New test.
> >         * gcc.target/i386/pr106010-3c.c: New test.
> >         * gcc.target/i386/pr106010-4a.c: New test.
> >         * gcc.target/i386/pr106010-4b.c: New test.
> >         * gcc.target/i386/pr106010-4c.c: New test.
> >         * gcc.target/i386/pr106010-5a.c: New test.
> >         * gcc.target/i386/pr106010-5b.c: New test.
> >         * gcc.target/i386/pr106010-5c.c: New test.
> >         * gcc.target/i386/pr106010-6a.c: New test.
> >         * gcc.target/i386/pr106010-6b.c: New test.
> >         * gcc.target/i386/pr106010-6c.c: New test.
> >         * gcc.target/i386/pr106010-7a.c: New test.
> >         * gcc.target/i386/pr106010-7b.c: New test.
> >         * gcc.target/i386/pr106010-7c.c: New test.
> >         * gcc.target/i386/pr106010-8a.c: New test.
> >         * gcc.target/i386/pr106010-8b.c: New test.
> >         * gcc.target/i386/pr106010-8c.c: New test.
> >         * gcc.target/i386/pr106010-9a.c: New test.
> >         * gcc.target/i386/pr106010-9b.c: New test.
> >         * gcc.target/i386/pr106010-9c.c: New test.
> >         * gcc.target/i386/pr106010-9d.c: New test.
> > ---
> >  gcc/testsuite/gcc.target/i386/pr106010-1a.c |  58 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-1b.c |  63 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-1c.c |  41 +++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2a.c |  82 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2b.c |  62 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-2c.c |  47 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3a.c |  80 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3b.c | 126 ++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-3c.c |  69 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4a.c | 101 +++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4b.c |  67 +++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-4c.c |  54 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5a.c | 117 +++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5b.c |  80 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-5c.c |  62 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6a.c | 115 ++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6b.c | 157 ++++++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-6c.c |  80 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7a.c |  58 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7b.c |  63 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-7c.c |  41 +++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8a.c |  58 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8b.c |  53 +++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-8c.c |  38 +++++
> >  gcc/testsuite/gcc.target/i386/pr106010-9a.c |  89 +++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-9b.c |  90 +++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-9c.c |  90 +++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106010-9d.c |  92 ++++++++++++
> >  gcc/tree-complex.cc                         |   9 +-
> >  29 files changed, 2141 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-1c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-2c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-3c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-4c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-5c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-6c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-7c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-8c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9b.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9c.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106010-9d.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > new file mode 100644
> > index 00000000000..b608f484934
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > new file mode 100644
> > index 00000000000..0f377c3a548
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-1a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > new file mode 100644
> > index 00000000000..f07e9fb2d3d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[i];
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > new file mode 100644
> > index 00000000000..d2e2f8d4f43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
> > @@ -0,0 +1,82 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +  a[8] = b[8];
> > +  a[9] = b[9];
> > +  a[10] = b[10];
> > +  a[11] = b[11];
> > +  a[12] = b[12];
> > +  a[13] = b[13];
> > +  a[14] = b[14];
> > +  a[15] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > new file mode 100644
> > index 00000000000..ac360752693
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-2a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > new file mode 100644
> > index 00000000000..a002f209ec9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
> > @@ -0,0 +1,47 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[0];
> > +  a[1] = b[1];
> > +  a[2] = b[2];
> > +  a[3] = b[3];
> > +  a[4] = b[4];
> > +  a[5] = b[5];
> > +  a[6] = b[6];
> > +  a[7] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > new file mode 100644
> > index 00000000000..c1b64b56b1c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +  a[2] = b[3];
> > +  a[3] = b[2];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +  a[8] = b[15];
> > +  a[9] = b[14];
> > +  a[10] = b[13];
> > +  a[11] = b[12];
> > +  a[12] = b[11];
> > +  a[13] = b[10];
> > +  a[14] = b[9];
> > +  a[15] = b[8];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > new file mode 100644
> > index 00000000000..e4fa3f3a541
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
> > @@ -0,0 +1,126 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-3a.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +  char* q = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  for (int i = 0; i != 16; i++)
> > +    {
> > +      p[i] = i + 16;
> > +      p[i + 16] = i;
> > +    }
> > +  __builtin_memcpy (pd_exp, p, 32);
> > +  __builtin_memcpy (epi64_exp, p, 32);
> > +
> > +  for (int i = 0; i != 8; i++)
> > +    {
> > +      p[i] = i + 8;
> > +      p[i + 8] = i;
> > +      p[i + 16] = i + 24;
> > +      p[i + 24] = i + 16;
> > +      q[i] = i + 24;
> > +      q[i + 8] = i + 16;
> > +      q[i + 16] = i + 8;
> > +      q[i + 24] = i;
> > +    }
> > +  __builtin_memcpy (ps_exp, p, 32);
> > +  __builtin_memcpy (epi32_exp, q, 32);
> > +
> > +
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 28;
> > +      q[i + 4] = i + 24;
> > +      q[i + 8] = i + 20;
> > +      q[i + 12] = i + 16;
> > +      q[i + 16] = i + 12;
> > +      q[i + 20] = i + 8;
> > +      q[i + 24] = i + 4;
> > +      q[i + 28] = i;
> > +    }
> > +  __builtin_memcpy (epi16_exp, q, 32);
> > +
> > +  for (int i = 0; i != 2; i++)
> > +    {
> > +      q[i] = i + 14;
> > +      q[i + 2] = i + 12;
> > +      q[i + 4] = i + 10;
> > +      q[i + 6] = i + 8;
> > +      q[i + 8] = i + 6;
> > +      q[i + 10] = i + 4;
> > +      q[i + 12] = i + 2;
> > +      q[i + 14] = i;
> > +      q[i + 16] = i + 30;
> > +      q[i + 18] = i + 28;
> > +      q[i + 20] = i + 26;
> > +      q[i + 22] = i + 24;
> > +      q[i + 24] = i + 22;
> > +      q[i + 26] = i + 20;
> > +      q[i + 28] = i + 18;
> > +      q[i + 30] = i + 16;
> > +    }
> > +  __builtin_memcpy (epi8_exp, q, 32);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > new file mode 100644
> > index 00000000000..5a5a3d4b992
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
> > @@ -0,0 +1,69 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[1];
> > +  a[1] = b[0];
> > +  a[2] = b[4];
> > +  a[3] = b[3];
> > +  a[4] = b[7];
> > +  a[5] = b[6];
> > +  a[6] = b[2];
> > +  a[7] = b[5];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +  char* q = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      p[i] = i + 4;
> > +      p[i + 4] = i;
> > +      p[i + 8] = i + 16;
> > +      p[i + 12] = i + 12;
> > +      p[i + 16] = i + 28;
> > +      p[i + 20] = i + 24;
> > +      p[i + 24] = i + 8;
> > +      p[i + 28] = i + 20;
> > +      q[i] = i + 28;
> > +      q[i + 4] = i + 24;
> > +      q[i + 8] = i + 20;
> > +      q[i + 12] = i + 16;
> > +      q[i + 16] = i + 12;
> > +      q[i + 20] = i + 8;
> > +      q[i + 24] = i + 4;
> > +      q[i + 28] = i;
> > +    }
> > +  __builtin_memcpy (ph_exp, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > new file mode 100644
> > index 00000000000..b7b0b532bb1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
> > @@ -0,0 +1,101 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a,
> > +       _Complex double b1,
> > +       _Complex double b2)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a,
> > +       _Complex float b1, _Complex float b2,
> > +       _Complex float b3, _Complex float b4)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a,
> > +          _Complex long long b1,
> > +          _Complex long long b2)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a,
> > +          _Complex int b1, _Complex int b2,
> > +          _Complex int b3, _Complex int b4)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a,
> > +          _Complex short b1, _Complex short b2,
> > +          _Complex short b3, _Complex short b4,
> > +          _Complex short b5, _Complex short b6,
> > +          _Complex short b7,_Complex short b8)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a,
> > +         _Complex char b1, _Complex char b2,
> > +         _Complex char b3, _Complex char b4,
> > +         _Complex char b5, _Complex char b6,
> > +         _Complex char b7,_Complex char b8,
> > +         _Complex char b9, _Complex char b10,
> > +         _Complex char b11, _Complex char b12,
> > +         _Complex char b13, _Complex char b14,
> > +         _Complex char b15,_Complex char b16)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +  a[8] = b9;
> > +  a[9] = b10;
> > +  a[10] = b11;
> > +  a[11] = b12;
> > +  a[12] = b13;
> > +  a[13] = b14;
> > +  a[14] = b15;
> > +  a[15] = b16;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > new file mode 100644
> > index 00000000000..e2e79508c4b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
> > @@ -0,0 +1,67 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-4a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (32);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (32);
> > +  _Complex float* ps_src = (_Complex float*) malloc (32);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (32);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (32);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (32);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (32);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (32);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (32);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (32);
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (pd_dst, 0, 32);
> > +  __builtin_memset (ps_dst, 0, 32);
> > +  __builtin_memset (epi64_dst, 0, 32);
> > +  __builtin_memset (epi32_dst, 0, 32);
> > +  __builtin_memset (epi16_dst, 0, 32);
> > +  __builtin_memset (epi8_dst, 0, 32);
> > +
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +  __builtin_memcpy (pd_src, p, 32);
> > +  __builtin_memcpy (ps_src, p, 32);
> > +  __builtin_memcpy (epi64_src, p, 32);
> > +  __builtin_memcpy (epi32_src, p, 32);
> > +  __builtin_memcpy (epi16_src, p, 32);
> > +  __builtin_memcpy (epi8_src, p, 32);
> > +
> > +  foo_pd (pd_dst, pd_src[0], pd_src[1]);
> > +  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
> > +  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
> > +  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
> > +  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
> > +            epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
> > +  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
> > +           epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
> > +           epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
> > +           epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > new file mode 100644
> > index 00000000000..8e02aefe3b5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
> > @@ -0,0 +1,54 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a,
> > +       _Complex _Float16 b1, _Complex _Float16 b2,
> > +       _Complex _Float16 b3, _Complex _Float16 b4,
> > +       _Complex _Float16 b5, _Complex _Float16 b6,
> > +       _Complex _Float16 b7,_Complex _Float16 b8)
> > +{
> > +  a[0] = b1;
> > +  a[1] = b2;
> > +  a[2] = b3;
> > +  a[3] = b4;
> > +  a[4] = b5;
> > +  a[5] = b6;
> > +  a[6] = b7;
> > +  a[7] = b8;
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  /* Runtime check: foo_ph must store its eight by-value arguments to a[0]..a[7].  */
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
> > +
> > +  char* p = (char* ) malloc (32);
> > +
> > +  __builtin_memset (ph_dst, 0, 32);
> > +  /* Source image: the distinct byte values 0..31.  */
> > +  for (int i = 0; i != 32; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (ph_src, p, 32);
> > +
> > +  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
> > +         ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
> > +  /* The destination must now be byte-identical to the source.  */
> > +  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
> > +    __builtin_abort ();
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > new file mode 100644
> > index 00000000000..9d4a6f9846b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
> > @@ -0,0 +1,117 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[2];
> > +  a[1] = b[3];
> > +  a[2] = b[0];
> > +  a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[4];
> > +  a[1] = b[5];
> > +  a[2] = b[6];
> > +  a[3] = b[7];
> > +  a[4] = b[0];
> > +  a[5] = b[1];
> > +  a[6] = b[2];
> > +  a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[2];
> > +  a[1] = b[3];
> > +  a[2] = b[0];
> > +  a[3] = b[1];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[4];
> > +  a[1] = b[5];
> > +  a[2] = b[6];
> > +  a[3] = b[7];
> > +  a[4] = b[0];
> > +  a[5] = b[1];
> > +  a[6] = b[2];
> > +  a[7] = b[3];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[8];
> > +  a[1] = b[9];
> > +  a[2] = b[10];
> > +  a[3] = b[11];
> > +  a[4] = b[12];
> > +  a[5] = b[13];
> > +  a[6] = b[14];
> > +  a[7] = b[15];
> > +  a[8] = b[0];
> > +  a[9] = b[1];
> > +  a[10] = b[2];
> > +  a[11] = b[3];
> > +  a[12] = b[4];
> > +  a[13] = b[5];
> > +  a[14] = b[6];
> > +  a[15] = b[7];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[16];
> > +  a[1] = b[17];
> > +  a[2] = b[18];
> > +  a[3] = b[19];
> > +  a[4] = b[20];
> > +  a[5] = b[21];
> > +  a[6] = b[22];
> > +  a[7] = b[23];
> > +  a[8] = b[24];
> > +  a[9] = b[25];
> > +  a[10] = b[26];
> > +  a[11] = b[27];
> > +  a[12] = b[28];
> > +  a[13] = b[29];
> > +  a[14] = b[30];
> > +  a[15] = b[31];
> > +  a[16] = b[0];
> > +  a[17] = b[1];
> > +  a[18] = b[2];
> > +  a[19] = b[3];
> > +  a[20] = b[4];
> > +  a[21] = b[5];
> > +  a[22] = b[6];
> > +  a[23] = b[7];
> > +  a[24] = b[8];
> > +  a[25] = b[9];
> > +  a[26] = b[10];
> > +  a[27] = b[11];
> > +  a[28] = b[12];
> > +  a[29] = b[13];
> > +  a[30] = b[14];
> > +  a[31] = b[15];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > new file mode 100644
> > index 00000000000..d5c6ebeb5cf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-5a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +  /* Runtime companion of pr106010-5a.c: run each foo_* kernel and compare with a byte-level expected image.  */
> > +  __builtin_memset (pd_dst, 0, 64);
> > +  __builtin_memset (ps_dst, 0, 64);
> > +  __builtin_memset (epi64_dst, 0, 64);
> > +  __builtin_memset (epi32_dst, 0, 64);
> > +  __builtin_memset (epi16_dst, 0, 64);
> > +  __builtin_memset (epi8_dst, 0, 64);
> > +  /* p is bytes 0..63; q is p with its two 32-byte halves exchanged.  */
> > +  for (int i = 0; i != 64; i++)
> > +    {
> > +      p[i] = i;
> > +      q[i] = (i + 32) % 64;
> > +    }
> > +  __builtin_memcpy (pd_src, p, 64);
> > +  __builtin_memcpy (ps_src, p, 64);
> > +  __builtin_memcpy (epi64_src, p, 64);
> > +  __builtin_memcpy (epi32_src, p, 64);
> > +  __builtin_memcpy (epi16_src, p, 64);
> > +  __builtin_memcpy (epi8_src, p, 64);
> > +  /* Every kernel is checked against the same expected image q.  */
> > +  __builtin_memcpy (pd_exp, q, 64);
> > +  __builtin_memcpy (ps_exp, q, 64);
> > +  __builtin_memcpy (epi64_exp, q, 64);
> > +  __builtin_memcpy (epi32_exp, q, 64);
> > +  __builtin_memcpy (epi16_exp, q, 64);
> > +  __builtin_memcpy (epi8_exp, q, 64);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > new file mode 100644
> > index 00000000000..9ce4e6dd5c0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
> > @@ -0,0 +1,62 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[8];
> > +  a[1] = b[9];
> > +  a[2] = b[10];
> > +  a[3] = b[11];
> > +  a[4] = b[12];
> > +  a[5] = b[13];
> > +  a[6] = b[14];
> > +  a[7] = b[15];
> > +  a[8] = b[0];
> > +  a[9] = b[1];
> > +  a[10] = b[2];
> > +  a[11] = b[3];
> > +  a[12] = b[4];
> > +  a[13] = b[5];
> > +  a[14] = b[6];
> > +  a[15] = b[7];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +  /* Run foo_ph on a known byte pattern and compare with a byte-level expected image.  */
> > +  __builtin_memset (ph_dst, 0, 64);
> > +  /* p is bytes 0..63; q is p with its two 32-byte halves exchanged.  */
> > +  for (int i = 0; i != 64; i++)
> > +    {
> > +      p[i] = i;
> > +      q[i] = (i + 32) % 64;
> > +    }
> > +  __builtin_memcpy (ph_src, p, 64);
> > +
> > +  __builtin_memcpy (ph_exp, q, 64);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > new file mode 100644
> > index 00000000000..65a90d03684
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
> > @@ -0,0 +1,115 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
> > +{
> > +  a[0] = b[3];
> > +  a[1] = b[2];
> > +  a[2] = b[1];
> > +  a[3] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int* __restrict b)
> > +{
> > +  a[0] = b[7];
> > +  a[1] = b[6];
> > +  a[2] = b[5];
> > +  a[3] = b[4];
> > +  a[4] = b[3];
> > +  a[5] = b[2];
> > +  a[6] = b[1];
> > +  a[7] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short* __restrict b)
> > +{
> > +  a[0] = b[15];
> > +  a[1] = b[14];
> > +  a[2] = b[13];
> > +  a[3] = b[12];
> > +  a[4] = b[11];
> > +  a[5] = b[10];
> > +  a[6] = b[9];
> > +  a[7] = b[8];
> > +  a[8] = b[7];
> > +  a[9] = b[6];
> > +  a[10] = b[5];
> > +  a[11] = b[4];
> > +  a[12] = b[3];
> > +  a[13] = b[2];
> > +  a[14] = b[1];
> > +  a[15] = b[0];
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char* __restrict b)
> > +{
> > +  a[0] = b[31];
> > +  a[1] = b[30];
> > +  a[2] = b[29];
> > +  a[3] = b[28];
> > +  a[4] = b[27];
> > +  a[5] = b[26];
> > +  a[6] = b[25];
> > +  a[7] = b[24];
> > +  a[8] = b[23];
> > +  a[9] = b[22];
> > +  a[10] = b[21];
> > +  a[11] = b[20];
> > +  a[12] = b[19];
> > +  a[13] = b[18];
> > +  a[14] = b[17];
> > +  a[15] = b[16];
> > +  a[16] = b[15];
> > +  a[17] = b[14];
> > +  a[18] = b[13];
> > +  a[19] = b[12];
> > +  a[20] = b[11];
> > +  a[21] = b[10];
> > +  a[22] = b[9];
> > +  a[23] = b[8];
> > +  a[24] = b[7];
> > +  a[25] = b[6];
> > +  a[26] = b[5];
> > +  a[27] = b[4];
> > +  a[28] = b[3];
> > +  a[29] = b[2];
> > +  a[30] = b[1];
> > +  a[31] = b[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > new file mode 100644
> > index 00000000000..1c5bb020939
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
> > @@ -0,0 +1,157 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include "avx2-check.h"
> > +#include <string.h>
> > +#include "pr106010-6a.c"
> > +
> > +void
> > +avx2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (64);
> > +  _Complex double* pd_dst = (_Complex double*) malloc (64);
> > +  _Complex double* pd_exp = (_Complex double*) malloc (64);
> > +  _Complex float* ps_src = (_Complex float*) malloc (64);
> > +  _Complex float* ps_dst = (_Complex float*) malloc (64);
> > +  _Complex float* ps_exp = (_Complex float*) malloc (64);
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
> > +  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
> > +  _Complex int* epi32_src = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (64);
> > +  _Complex int* epi32_exp = (_Complex int*) malloc (64);
> > +  _Complex short* epi16_src = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (64);
> > +  _Complex short* epi16_exp = (_Complex short*) malloc (64);
> > +  _Complex char* epi8_src = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (64);
> > +  _Complex char* epi8_exp = (_Complex char*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +  /* Runtime companion of pr106010-6a.c, whose foo_* kernels store the source elements in reverse order.  */
> > +  __builtin_memset (pd_dst, 0, 64);
> > +  __builtin_memset (ps_dst, 0, 64);
> > +  __builtin_memset (epi64_dst, 0, 64);
> > +  __builtin_memset (epi32_dst, 0, 64);
> > +  __builtin_memset (epi16_dst, 0, 64);
> > +  __builtin_memset (epi8_dst, 0, 64);
> > +  /* Source image shared by all six types: bytes 0..63 in ascending order.  */
> > +  for (int i = 0; i != 64; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (pd_src, p, 64);
> > +  __builtin_memcpy (ps_src, p, 64);
> > +  __builtin_memcpy (epi64_src, p, 64);
> > +  __builtin_memcpy (epi32_src, p, 64);
> > +  __builtin_memcpy (epi16_src, p, 64);
> > +  __builtin_memcpy (epi8_src, p, 64);
> > +
> > +  /* Expected image for pd and epi64: the four 16-byte chunks of p in reverse order (bytes within a chunk keep their order).  */
> > +  for (int i = 0; i != 16; i++)
> > +    {
> > +      q[i] = i + 48;
> > +      q[i + 16] = i + 32;
> > +      q[i + 32] = i + 16;
> > +      q[i + 48] = i;
> > +    }
> > +
> > +  __builtin_memcpy (pd_exp, q, 64);
> > +  __builtin_memcpy (epi64_exp, q, 64);
> > +  /* Expected image for ps and epi32: the eight 8-byte chunks of p in reverse order.  */
> > +   for (int i = 0; i != 8; i++)
> > +    {
> > +      q[i] = i + 56;
> > +      q[i + 8] = i + 48;
> > +      q[i + 16] = i + 40;
> > +      q[i + 24] = i + 32;
> > +      q[i + 32] = i + 24;
> > +      q[i + 40] = i + 16;
> > +      q[i + 48] = i + 8;
> > +      q[i + 56] = i;
> > +    }
> > +
> > +  __builtin_memcpy (ps_exp, q, 64);
> > +  __builtin_memcpy (epi32_exp, q, 64);
> > +  /* Expected image for epi16: the sixteen 4-byte chunks of p in reverse order.  */
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 60;
> > +      q[i + 4] = i + 56;
> > +      q[i + 8] = i + 52;
> > +      q[i + 12] = i + 48;
> > +      q[i + 16] = i + 44;
> > +      q[i + 20] = i + 40;
> > +      q[i + 24] = i + 36;
> > +      q[i + 28] = i + 32;
> > +      q[i + 32] = i + 28;
> > +      q[i + 36] = i + 24;
> > +      q[i + 40] = i + 20;
> > +      q[i + 44] = i + 16;
> > +      q[i + 48] = i + 12;
> > +      q[i + 52] = i + 8;
> > +      q[i + 56] = i + 4;
> > +      q[i + 60] = i;
> > +    }
> > +
> > +  __builtin_memcpy (epi16_exp, q, 64);
> > +  /* Expected image for epi8: the thirty-two 2-byte chunks of p in reverse order.  */
> > +  for (int i = 0; i != 2; i++)
> > +    {
> > +      q[i] = i + 62;
> > +      q[i + 2] = i + 60;
> > +      q[i + 4] = i + 58;
> > +      q[i + 6] = i + 56;
> > +      q[i + 8] = i + 54;
> > +      q[i + 10] = i + 52;
> > +      q[i + 12] = i + 50;
> > +      q[i + 14] = i + 48;
> > +      q[i + 16] = i + 46;
> > +      q[i + 18] = i + 44;
> > +      q[i + 20] = i + 42;
> > +      q[i + 22] = i + 40;
> > +      q[i + 24] = i + 38;
> > +      q[i + 26] = i + 36;
> > +      q[i + 28] = i + 34;
> > +      q[i + 30] = i + 32;
> > +      q[i + 32] = i + 30;
> > +      q[i + 34] = i + 28;
> > +      q[i + 36] = i + 26;
> > +      q[i + 38] = i + 24;
> > +      q[i + 40] = i + 22;
> > +      q[i + 42] = i + 20;
> > +      q[i + 44] = i + 18;
> > +      q[i + 46] = i + 16;
> > +      q[i + 48] = i + 14;
> > +      q[i + 50] = i + 12;
> > +      q[i + 52] = i + 10;
> > +      q[i + 54] = i + 8;
> > +      q[i + 56] = i + 6;
> > +      q[i + 58] = i + 4;
> > +      q[i + 60] = i + 2;
> > +      q[i + 62] = i;
> > +    }
> > +  __builtin_memcpy (epi8_exp, q, 64);
> > +
> > +  foo_pd (pd_dst, pd_src);
> > +  foo_ps (ps_dst, ps_src);
> > +  foo_epi64 (epi64_dst, epi64_src);
> > +  foo_epi32 (epi32_dst, epi32_src);
> > +  foo_epi16 (epi16_dst, epi16_src);
> > +  foo_epi8 (epi8_dst, epi8_src);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > new file mode 100644
> > index 00000000000..b859d884a7f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
> > @@ -0,0 +1,80 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
> > +/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
> > +{
> > +  a[0] = b[15];
> > +  a[1] = b[14];
> > +  a[2] = b[13];
> > +  a[3] = b[12];
> > +  a[4] = b[11];
> > +  a[5] = b[10];
> > +  a[6] = b[9];
> > +  a[7] = b[8];
> > +  a[8] = b[7];
> > +  a[9] = b[6];
> > +  a[10] = b[5];
> > +  a[11] = b[4];
> > +  a[12] = b[3];
> > +  a[13] = b[2];
> > +  a[14] = b[1];
> > +  a[15] = b[0];
> > +}
> > +
> > +void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
> > +  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
> > +  char* p = (char* ) malloc (64);
> > +  char* q = (char* ) malloc (64);
> > +  /* Run foo_ph, which stores b[15]..b[0] to a[0]..a[15], and compare with a byte-level expected image.  */
> > +  __builtin_memset (ph_dst, 0, 64);
> > +  /* Source image: bytes 0..63 in ascending order.  */
> > +  for (int i = 0; i != 64; i++)
> > +    p[i] = i;
> > +
> > +  __builtin_memcpy (ph_src, p, 64);
> > +  /* Expected image: the sixteen 4-byte chunks of p in reverse order (bytes within a chunk keep their order).  */
> > +  for (int i = 0; i != 4; i++)
> > +    {
> > +      q[i] = i + 60;
> > +      q[i + 4] = i + 56;
> > +      q[i + 8] = i + 52;
> > +      q[i + 12] = i + 48;
> > +      q[i + 16] = i + 44;
> > +      q[i + 20] = i + 40;
> > +      q[i + 24] = i + 36;
> > +      q[i + 28] = i + 32;
> > +      q[i + 32] = i + 28;
> > +      q[i + 36] = i + 24;
> > +      q[i + 40] = i + 20;
> > +      q[i + 44] = i + 16;
> > +      q[i + 48] = i + 12;
> > +      q[i + 52] = i + 8;
> > +      q[i + 56] = i + 4;
> > +      q[i + 60] = i;
> > +    }
> > +
> > +  __builtin_memcpy (ph_exp, q, 64);
> > +
> > +  foo_ph (ph_dst, ph_src);
> > +
> > +  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > new file mode 100644
> > index 00000000000..2ea01fac927
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a, _Complex double b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a, _Complex float b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a, _Complex long long b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a, _Complex int b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a, _Complex short b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a, _Complex char b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > new file mode 100644
> > index 00000000000..26482cc10f5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-7a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (double));
> > +  /* Runtime companion of pr106010-7a.c: each foo_* stores one complex value to all N elements of its destination.  */
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +  /* Bytes alternate 3, 4, so every complex element of a buffer has the same bit pattern as its element 0.  */
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i % 2 + 3;
> > +  /* Seed the *_src buffers (not *_dst): src[0] is the value broadcast below and src is the memcmp reference.  */
> > +  memcpy (pd_src, p_init, 2 * N * sizeof (double));
> > +  memcpy (ps_src, p_init, 2 * N * sizeof (float));
> > +  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
> > +  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
> > +  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
> > +  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
> > +
> > +  foo_pd (pd_dst, pd_src[0]);
> > +  foo_ps (ps_dst, ps_src[0]);
> > +  foo_epi64 (epi64_dst, epi64_src[0]);
> > +  foo_epi32 (epi32_dst, epi32_src[0]);
> > +  foo_epi16 (epi16_dst, epi16_src[0]);
> > +  foo_epi8 (epi8_dst, epi8_src[0]);
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
> > +    __builtin_abort ();
> > +  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
> > +    __builtin_abort ();
> > +
> > +  return;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > new file mode 100644
> > index 00000000000..7f4056a5ecc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
> > @@ -0,0 +1,41 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a, _Complex _Float16 b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b;
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +  /* Bytes alternate 3, 4, so every complex element of ph_src has the same bit pattern as ph_src[0].  */
> > +  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
> > +    p_init[i] = i % 2 + 3;
> > +
> > +  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
> > +  /* Storing ph_src[0] to all N elements must therefore reproduce the whole source buffer.  */
> > +  foo_ph (ph_dst, ph_src[0]);
> > +  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > new file mode 100644
> > index 00000000000..11054b60d30
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
> > @@ -0,0 +1,58 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
> > +
> > +#define N 10000
> > +void
> > +__attribute__((noipa))
> > +foo_pd (_Complex double* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0 + 2.0i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ps (_Complex float* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0f + 2.0fi;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi64 (_Complex long long* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi32 (_Complex int* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi16 (_Complex short* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_epi8 (_Complex char* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1 + 2i;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > new file mode 100644
> > index 00000000000..6bb0073b691
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +#include <string.h>
> > +#include "pr106010-8a.c"
> > +
> > +void
> > +avx_test (void)
> > +{
> > +  _Complex double pd_src = 1.0 + 2.0i;
> > +  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
> > +  _Complex float ps_src = 1.0 + 2.0i;
> > +  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
> > +  _Complex long long epi64_src = 1 + 2i;
> > +  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
> > +  _Complex int epi32_src = 1 + 2i;
> > +  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
> > +  _Complex short epi16_src = 1 + 2i;
> > +  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
> > +  _Complex char epi8_src = 1 + 2i;
> > +  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
> > +  /* Runtime companion of pr106010-8a.c: the *_src scalars hold the constant 1 + 2i that each foo_* stores.  */
> > +  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
> > +  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
> > +  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
> > +  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
> > +  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
> > +  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
> > +  /* Each kernel fills all N elements; every element must then compare equal to the matching scalar.  */
> > +  foo_pd (pd_dst);
> > +  foo_ps (ps_dst);
> > +  foo_epi64 (epi64_dst);
> > +  foo_epi32 (epi32_dst);
> > +  foo_epi16 (epi16_dst);
> > +  foo_epi8 (epi8_dst);
> > +  for (int i = 0 ; i != N; i++)
> > +    {
> > +      if (pd_dst[i] != pd_src)
> > +       __builtin_abort ();
> > +      if (ps_dst[i] != ps_src)
> > +       __builtin_abort ();
> > +      if (epi64_dst[i] != epi64_src)
> > +       __builtin_abort ();
> > +      if (epi32_dst[i] != epi32_src)
> > +       __builtin_abort ();
> > +      if (epi16_dst[i] != epi16_src)
> > +       __builtin_abort ();
> > +      if (epi8_dst[i] != epi8_src)
> > +       __builtin_abort ();
> > +    }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > new file mode 100644
> > index 00000000000..61ae131829d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
> > +/* { dg-require-effective-target avx512fp16 } */
> > +
> > +#include <string.h>
> > +
> > +static void do_test (void);
> > +
> > +#define DO_TEST do_test
> > +#define AVX512FP16
> > +#include "avx512-check.h"
> > +
> > +#define N 10000
> > +
> > +void
> > +__attribute__((noipa))
> > +foo_ph (_Complex _Float16* a)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = 1.0f16 + 2.0f16i;
> > +}
> > +
> > +static void
> > +do_test (void)
> > +{
> > +  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i;
> > +  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
> > +
> > +  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
> > +
> > +  foo_ph (ph_dst);
> > +  for (int i = 0; i != N; i++)
> > +    {
> > +      if (ph_dst[i] != ph_src)
> > +       __builtin_abort ();
> > +    }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> > new file mode 100644
> > index 00000000000..e922f7b5400
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
> > @@ -0,0 +1,89 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
> > +
> > +typedef struct { _Complex double c; double a1; double a2;}
> > +  cdf;
> > +typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
> > +  cdf2;
> > +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
> > +  cdf3;
> > +typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
> > +  cdf4;
> > +
> > +#define N 100
> > +/* VMAT_ELEMENTWISE.  */
> > +void
> > +__attribute__((noipa))
> > +foo (cdf* a, cdf* __restrict b)
> > +{
> > +   for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i].c = b[i].c;
> > +      a[i].a1 = b[i].a1;
> > +      a[i].a2 = b[i].a2;
> > +    }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_PERMUTE.  */
> > +void
> > +__attribute__((noipa))
> > +foo1 (cdf2* a, cdf2* __restrict b)
> > +{
> > +   for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i].c = b[i].c;
> > +      a[i].a1 = b[i].a1;
> > +      a[i].a2 = b[i].a2;
> > +      a[i].a3 = b[i].a3;
> > +      a[i].a4 = b[i].a4;
> > +    }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS.  */
> > +void
> > +__attribute__((noipa))
> > +foo2 (cdf3* a, cdf3* __restrict b)
> > +{
> > +   for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i].c1 = b[i].c1;
> > +      a[i].c2 = b[i].c2;
> > +      a[i].a1 = b[i].a1;
> > +      a[i].a2 = b[i].a2;
> > +      a[i].a3 = b[i].a3;
> > +      a[i].a4 = b[i].a4;
> > +    }
> > +}
> > +
> > +/* VMAT_STRIDED_SLP.  */
> > +void
> > +__attribute__((noipa))
> > +foo3 (cdf4* a, cdf4* __restrict b)
> > +{
> > +   for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i].c1 = b[i].c1;
> > +      a[i].c2 = b[i].c2;
> > +      a[i].a1 = b[i].a1;
> > +      a[i].a2 = b[i].a2;
> > +    }
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_REVERSE.  */
> > +void
> > +__attribute__((noipa))
> > +foo4 (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[i] = b[N-i-1];
> > +}
> > +
> > +/* VMAT_CONTIGUOUS_DOWN.  */
> > +void
> > +__attribute__((noipa))
> > +foo5 (_Complex double* a, _Complex double* __restrict b)
> > +{
> > +  for (int i = 0; i != N; i++)
> > +    a[N-i-1] = b[0];
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> > new file mode 100644
> > index 00000000000..e220445e6e3
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
> > @@ -0,0 +1,90 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
> > +/* { dg-require-effective-target sse2 } */
> > +
> > +#include <string.h>
> > +#include "sse2-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +sse2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > +  for (int i = 0; i != N * sizeof (cdf3); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> > +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i % 16;
> > +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > +  foo (cdf_dst, cdf_src);
> > +  foo1 (cdf2_dst, cdf2_src);
> > +  foo2 (cdf3_dst, cdf3_src);
> > +  foo3 (cdf4_dst, cdf4_src);
> > +  foo4 (pd_dst, pd_src);
> > +  foo5 (pd_dst2, pd_src2);
> > +  for (int i = 0; i != N; i++)
> > +    {
> > +      p_init[(N - i - 1) * 16] = i * 16;
> > +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > +    }
> > +  memcpy (pd_src, p_init, N * 16);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> > new file mode 100644
> > index 00000000000..ff51f6195b7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
> > @@ -0,0 +1,90 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
> > +/* { dg-require-effective-target avx2 } */
> > +
> > +#include <string.h>
> > +#include "avx2-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +avx2_test (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > +  for (int i = 0; i != N * sizeof (cdf3); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> > +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i % 16;
> > +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > +  foo (cdf_dst, cdf_src);
> > +  foo1 (cdf2_dst, cdf2_src);
> > +  foo2 (cdf3_dst, cdf3_src);
> > +  foo3 (cdf4_dst, cdf4_src);
> > +  foo4 (pd_dst, pd_src);
> > +  foo5 (pd_dst2, pd_src2);
> > +  for (int i = 0; i != N; i++)
> > +    {
> > +      p_init[(N - i - 1) * 16] = i * 16;
> > +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > +    }
> > +  memcpy (pd_src, p_init, N * 16);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> > new file mode 100644
> > index 00000000000..d4d8f1dd722
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
> > @@ -0,0 +1,92 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
> > +/* { dg-require-effective-target avx512f } */
> > +
> > +#include <string.h>
> > +#include <stdlib.h>
> > +#define AVX512F
> > +#include "avx512-check.h"
> > +#include "pr106010-9a.c"
> > +
> > +static void
> > +test_512 (void)
> > +{
> > +  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
> > +  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
> > +  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
> > +  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
> > +  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
> > +  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
> > +
> > +  char* p_init = (char*) malloc (N * sizeof (cdf3));
> > +
> > +  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
> > +  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
> > +  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
> > +  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
> > +  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
> > +  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
> > +
> > +  for (int i = 0; i != N * sizeof (cdf3); i++)
> > +    p_init[i] = i;
> > +
> > +  memcpy (cdf_src, p_init, N * sizeof (cdf));
> > +  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
> > +  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
> > +  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
> > +  memcpy (pd_src, p_init, N * sizeof (_Complex double));
> > +  for (int i = 0; i != 2 * N * sizeof (double); i++)
> > +    p_init[i] = i % 16;
> > +  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
> > +
> > +  foo (cdf_dst, cdf_src);
> > +  foo1 (cdf2_dst, cdf2_src);
> > +  foo2 (cdf3_dst, cdf3_src);
> > +  foo3 (cdf4_dst, cdf4_src);
> > +  foo4 (pd_dst, pd_src);
> > +  foo5 (pd_dst2, pd_src2);
> > +  for (int i = 0; i != N; i++)
> > +    {
> > +      p_init[(N - i - 1) * 16] = i * 16;
> > +      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
> > +      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
> > +      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
> > +      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
> > +      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
> > +      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
> > +      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
> > +      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
> > +      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
> > +      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
> > +      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
> > +      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
> > +      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
> > +      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
> > +      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
> > +    }
> > +  memcpy (pd_src, p_init, N * 16);
> > +
> > +  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
> > +    __builtin_abort ();
> > +
> > +  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
> > +    __builtin_abort ();
> > +}
> > diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
> > index 61950a0f099..ea9df6114a1 100644
> > --- a/gcc/tree-complex.cc
> > +++ b/gcc/tree-complex.cc
> > @@ -297,6 +297,11 @@ init_dont_simulate_again (void)
> >                 break;
> >
> >               default:
> > +               /* When expand_complex_move would trigger make sure we
> > +                  perform lowering even when there is no actual complex
> > +                  operation.  This helps consistency and vectorization.  */
> > +               if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
> > +                 saw_a_complex_op = true;
> >                 break;
> >               }
> >
> > @@ -869,7 +874,9 @@ expand_complex_move (gimple_stmt_iterator *gsi, tree type)
> >           update_complex_assignment (gsi, r, i);
> >         }
> >      }
> > -  else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
> > +  else if (rhs
> > +          && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
> > +          && !TREE_SIDE_EFFECTS (lhs))
> >      {
> >        tree x;
> >        gimple *t;
> > --
> > 2.18.1
> >
  

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1a.c b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
new file mode 100644
index 00000000000..b608f484934
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1b.c b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
new file mode 100644
index 00000000000..0f377c3a548
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1b.c
@@ -0,0 +1,63 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-1a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double));
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+  for (int i = 0; i != 2 * N * sizeof (double); i++)
+    p_init[i] = i;
+
+  memcpy (pd_src, p_init, 2 * N * sizeof (double));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-1c.c b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
new file mode 100644
index 00000000000..f07e9fb2d3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-1c.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* b)
+{
+  for (int i = 0; i != N; i++)
+    a[i] = b[i];
+}
+
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++)
+    p_init[i] = i;
+
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2a.c b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
new file mode 100644
index 00000000000..d2e2f8d4f43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2a.c
@@ -0,0 +1,82 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 2 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+  a[8] = b[8];
+  a[9] = b[9];
+  a[10] = b[10];
+  a[11] = b[11];
+  a[12] = b[12];
+  a[13] = b[13];
+  a[14] = b[14];
+  a[15] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2b.c b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
new file mode 100644
index 00000000000..ac360752693
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2b.c
@@ -0,0 +1,62 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-2a.c"
+
+void
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+  /* Zero every destination so a store that never happened shows up.  */
+  __builtin_memset (pd_dst, 0, 32);
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+  /* Give every source buffer the same byte pattern 0..31.  */
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+  /* Run each copy, then require all 32 bytes of each pair to match.  */
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-2c.c b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
new file mode 100644
index 00000000000..a002f209ec9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-2c.c
@@ -0,0 +1,47 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+static void
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32);
+  /* Give the source the byte pattern 0..31.  */
+  for (int i = 0; i != 32; i++)
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+  /* Copy through foo_ph and require all 32 bytes to match.  */
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3a.c b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
new file mode 100644
index 00000000000..c1b64b56b1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3a.c
@@ -0,0 +1,80 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 6, 7, 4, 5 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 1 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17 \}} 1 "slp2" } }  */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy two _Complex double elements from b to a in swapped order.  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy four _Complex float elements, swapping each adjacent pair.  */
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[3];
+  a[3] = b[2];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy two _Complex long long elements from b to a in swapped order.  */
+  a[0] = b[1];
+  a[1] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy four _Complex int elements from b to a in reverse order.  */
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy eight _Complex short elements from b to a in reverse order.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy sixteen _Complex char elements, reversing each group of eight separately.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+  a[8] = b[15]; /* Second group: a[8..15] = b[15..8].  */
+  a[9] = b[14];
+  a[10] = b[13];
+  a[11] = b[12];
+  a[12] = b[11];
+  a[13] = b[10];
+  a[14] = b[9];
+  a[15] = b[8];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3b.c b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
new file mode 100644
index 00000000000..e4fa3f3a541
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3b.c
@@ -0,0 +1,126 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-3a.c"
+
+void
+avx2_test (void)
+{ /* Check every foo_* from pr106010-3a.c against byte-level expected results; abort on any mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex double* pd_exp = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex float* ps_exp = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex int* epi32_exp = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex short* epi16_exp = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  _Complex char* epi8_exp = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32); /* Scratch byte patterns used to fill the src/exp buffers.  */
+  char* q = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32); /* Start from all-zero destinations.  */
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++) /* Source pattern: byte k holds the value k.  */
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  for (int i = 0; i != 16; i++) /* Expected pd/epi64: the two 16-byte chunks exchanged.  */
+    {
+      p[i] = i + 16;
+      p[i + 16] = i;
+    }
+  __builtin_memcpy (pd_exp, p, 32);
+  __builtin_memcpy (epi64_exp, p, 32);
+
+  for (int i = 0; i != 8; i++) /* p: 8-byte chunks swapped in pairs (ps); q: 8-byte chunks reversed (epi32).  */
+    {
+      p[i] = i + 8;
+      p[i + 8] = i;
+      p[i + 16] = i + 24;
+      p[i + 24] = i + 16;
+      q[i] = i + 24;
+      q[i + 8] = i + 16;
+      q[i + 16] = i + 8;
+      q[i + 24] = i;
+    }
+  __builtin_memcpy (ps_exp, p, 32);
+  __builtin_memcpy (epi32_exp, q, 32);
+
+
+  for (int i = 0; i != 4; i++) /* Expected epi16: the eight 4-byte chunks reversed.  */
+    {
+      q[i] = i + 28;
+      q[i + 4] = i + 24;
+      q[i + 8] = i + 20;
+      q[i + 12] = i + 16;
+      q[i + 16] = i + 12;
+      q[i + 20] = i + 8;
+      q[i + 24] = i + 4;
+      q[i + 28] = i;
+    }
+  __builtin_memcpy (epi16_exp, q, 32);
+
+  for (int i = 0; i != 2; i++) /* Expected epi8: 2-byte chunks reversed within each 16-byte half.  */
+    {
+      q[i] = i + 14;
+      q[i + 2] = i + 12;
+      q[i + 4] = i + 10;
+      q[i + 6] = i + 8;
+      q[i + 8] = i + 6;
+      q[i + 10] = i + 4;
+      q[i + 12] = i + 2;
+      q[i + 14] = i;
+      q[i + 16] = i + 30;
+      q[i + 18] = i + 28;
+      q[i + 20] = i + 26;
+      q[i + 22] = i + 24;
+      q[i + 24] = i + 22;
+      q[i + 26] = i + 20;
+      q[i + 28] = i + 18;
+      q[i + 30] = i + 16;
+    }
+  __builtin_memcpy (epi8_exp, q, 32);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+  if (__builtin_memcmp (pd_dst, pd_exp, 32) != 0) /* Compare each result byte-for-byte with its expected image.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-3c.c b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
new file mode 100644
index 00000000000..5a5a3d4b992
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-3c.c
@@ -0,0 +1,69 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1, 8, 9, 6, 7, 14, 15, 12, 13, 4, 5, 10, 11 \}} 1 "slp2" } }  */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy eight _Complex _Float16 elements using source order 1, 0, 4, 3, 7, 6, 2, 5.  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b[1];
+  a[1] = b[0];
+  a[2] = b[4];
+  a[3] = b[3];
+  a[4] = b[7];
+  a[5] = b[6];
+  a[6] = b[2];
+  a[7] = b[5];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a 32-byte pattern and abort unless the result matches the expected permuted bytes.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (32);
+  char* p = (char* ) malloc (32);
+  char* q = (char* ) malloc (32); /* NOTE(review): q is filled below but never read in this function.  */
+
+  __builtin_memset (ph_dst, 0, 32); /* Start from an all-zero destination.  */
+
+  for (int i = 0; i != 32; i++) /* Source pattern: byte k holds the value k.  */
+    p[i] = i;
+  __builtin_memcpy (ph_src, p, 32);
+
+  for (int i = 0; i != 4; i++) /* p: 4-byte chunks of the source in order 1, 0, 4, 3, 7, 6, 2, 5.  */
+    {
+      p[i] = i + 4;
+      p[i + 4] = i;
+      p[i + 8] = i + 16;
+      p[i + 12] = i + 12;
+      p[i + 16] = i + 28;
+      p[i + 20] = i + 24;
+      p[i + 24] = i + 8;
+      p[i + 28] = i + 20;
+      q[i] = i + 28; /* q: 4-byte chunks in reverse order (not used afterwards).  */
+      q[i + 4] = i + 24;
+      q[i + 8] = i + 20;
+      q[i + 12] = i + 16;
+      q[i + 16] = i + 12;
+      q[i + 20] = i + 8;
+      q[i + 24] = i + 4;
+      q[i + 28] = i;
+    }
+  __builtin_memcpy (ph_exp, p, 32);
+
+  foo_ph (ph_dst, ph_src);
+  if (__builtin_memcmp (ph_dst, ph_exp, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4a.c b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
new file mode 100644
index 00000000000..b7b0b532bb1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4a.c
@@ -0,0 +1,101 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a,
+	_Complex double b1,
+	_Complex double b2)
+{ /* Store the two by-value _Complex double arguments to a[0] and a[1].  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a,
+	_Complex float b1, _Complex float b2,
+	_Complex float b3, _Complex float b4)
+{ /* Store the four by-value _Complex float arguments to a[0..3] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a,
+	   _Complex long long b1,
+	   _Complex long long b2)
+{ /* Store the two by-value _Complex long long arguments to a[0] and a[1].  */
+  a[0] = b1;
+  a[1] = b2;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a,
+	   _Complex int b1, _Complex int b2,
+	   _Complex int b3, _Complex int b4)
+{ /* Store the four by-value _Complex int arguments to a[0..3] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a,
+	   _Complex short b1, _Complex short b2,
+	   _Complex short b3, _Complex short b4,
+	   _Complex short b5, _Complex short b6,
+	   _Complex short b7,_Complex short b8)
+{ /* Store the eight by-value _Complex short arguments to a[0..7] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a,
+	  _Complex char b1, _Complex char b2,
+	  _Complex char b3, _Complex char b4,
+	  _Complex char b5, _Complex char b6,
+	  _Complex char b7,_Complex char b8,
+	  _Complex char b9, _Complex char b10,
+	  _Complex char b11, _Complex char b12,
+	  _Complex char b13, _Complex char b14,
+	  _Complex char b15,_Complex char b16)
+{ /* Store the sixteen by-value _Complex char arguments to a[0..15] in order.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+  a[8] = b9;
+  a[9] = b10;
+  a[10] = b11;
+  a[11] = b12;
+  a[12] = b13;
+  a[13] = b14;
+  a[14] = b15;
+  a[15] = b16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4b.c b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
new file mode 100644
index 00000000000..e2e79508c4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4b.c
@@ -0,0 +1,67 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-4a.c"
+
+void
+avx_test (void)
+{ /* Pass source elements by value to each foo_* from pr106010-4a.c and abort unless dst equals src.  */
+  _Complex double* pd_src = (_Complex double*) malloc (32);
+  _Complex double* pd_dst = (_Complex double*) malloc (32);
+  _Complex float* ps_src = (_Complex float*) malloc (32);
+  _Complex float* ps_dst = (_Complex float*) malloc (32);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (32);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (32);
+  _Complex int* epi32_src = (_Complex int*) malloc (32);
+  _Complex int* epi32_dst = (_Complex int*) malloc (32);
+  _Complex short* epi16_src = (_Complex short*) malloc (32);
+  _Complex short* epi16_dst = (_Complex short*) malloc (32);
+  _Complex char* epi8_src = (_Complex char*) malloc (32);
+  _Complex char* epi8_dst = (_Complex char*) malloc (32);
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (pd_dst, 0, 32); /* Start from all-zero destinations.  */
+  __builtin_memset (ps_dst, 0, 32);
+  __builtin_memset (epi64_dst, 0, 32);
+  __builtin_memset (epi32_dst, 0, 32);
+  __builtin_memset (epi16_dst, 0, 32);
+  __builtin_memset (epi8_dst, 0, 32);
+
+  for (int i = 0; i != 32; i++) /* Source pattern: byte k holds the value k.  */
+    p[i] = i;
+  __builtin_memcpy (pd_src, p, 32);
+  __builtin_memcpy (ps_src, p, 32);
+  __builtin_memcpy (epi64_src, p, 32);
+  __builtin_memcpy (epi32_src, p, 32);
+  __builtin_memcpy (epi16_src, p, 32);
+  __builtin_memcpy (epi8_src, p, 32);
+
+  foo_pd (pd_dst, pd_src[0], pd_src[1]); /* Each call passes every source element as a separate argument.  */
+  foo_ps (ps_dst, ps_src[0], ps_src[1], ps_src[2], ps_src[3]);
+  foo_epi64 (epi64_dst, epi64_src[0], epi64_src[1]);
+  foo_epi32 (epi32_dst, epi32_src[0], epi32_src[1], epi32_src[2], epi32_src[3]);
+  foo_epi16 (epi16_dst, epi16_src[0], epi16_src[1], epi16_src[2], epi16_src[3],
+	     epi16_src[4], epi16_src[5], epi16_src[6], epi16_src[7]);
+  foo_epi8 (epi8_dst, epi8_src[0], epi8_src[1], epi8_src[2], epi8_src[3],
+	    epi8_src[4], epi8_src[5], epi8_src[6], epi8_src[7],
+	    epi8_src[8], epi8_src[9], epi8_src[10], epi8_src[11],
+	    epi8_src[12], epi8_src[13], epi8_src[14], epi8_src[15]);
+
+  if (__builtin_memcmp (pd_dst, pd_src, 32) != 0) /* In-order stores must reproduce the source bytes.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, 32) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, 32) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-4c.c b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
new file mode 100644
index 00000000000..8e02aefe3b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-4c.c
@@ -0,0 +1,54 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -fdump-tree-slp-details -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a,
+	_Complex _Float16 b1, _Complex _Float16 b2,
+	_Complex _Float16 b3, _Complex _Float16 b4,
+	_Complex _Float16 b5, _Complex _Float16 b6,
+	_Complex _Float16 b7,_Complex _Float16 b8)
+{ /* Store the eight by-value _Complex _Float16 arguments to a[0..7] in order.  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b1;
+  a[1] = b2;
+  a[2] = b3;
+  a[3] = b4;
+  a[4] = b5;
+  a[5] = b6;
+  a[6] = b7;
+  a[7] = b8;
+}
+
+void
+do_test (void)
+{ /* Pass the eight source elements by value to foo_ph and abort unless dst equals src.  */
+
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (32);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (32);
+
+  char* p = (char* ) malloc (32);
+
+  __builtin_memset (ph_dst, 0, 32); /* Start from an all-zero destination.  */
+
+  for (int i = 0; i != 32; i++) /* Source pattern: byte k holds the value k.  */
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 32);
+
+  foo_ph (ph_dst, ph_src[0], ph_src[1], ph_src[2], ph_src[3],
+	  ph_src[4], ph_src[5], ph_src[6], ph_src[7]);
+
+  if (__builtin_memcmp (ph_dst, ph_src, 32) != 0) /* In-order stores must reproduce the source bytes.  */
+    __builtin_abort ();
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5a.c b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
new file mode 100644
index 00000000000..9d4a6f9846b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5a.c
@@ -0,0 +1,117 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 4 "slp2" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 4 "slp2" } } */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy four _Complex double elements with the two halves exchanged: a[0..1] = b[2..3], a[2..3] = b[0..1].  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b[2];
+  a[1] = b[3];
+  a[2] = b[0];
+  a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy eight _Complex float elements with the two halves exchanged: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
+  a[0] = b[4];
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy four _Complex long long elements with the two halves exchanged: a[0..1] = b[2..3], a[2..3] = b[0..1].  */
+  a[0] = b[2];
+  a[1] = b[3];
+  a[2] = b[0];
+  a[3] = b[1];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy eight _Complex int elements with the two halves exchanged: a[0..3] = b[4..7], a[4..7] = b[0..3].  */
+  a[0] = b[4];
+  a[1] = b[5];
+  a[2] = b[6];
+  a[3] = b[7];
+  a[4] = b[0];
+  a[5] = b[1];
+  a[6] = b[2];
+  a[7] = b[3];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy sixteen _Complex short elements with the two halves exchanged: a[0..7] = b[8..15], a[8..15] = b[0..7].  */
+  a[0] = b[8];
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy thirty-two _Complex char elements with the two halves exchanged: a[0..15] = b[16..31], a[16..31] = b[0..15].  */
+  a[0] = b[16];
+  a[1] = b[17];
+  a[2] = b[18];
+  a[3] = b[19];
+  a[4] = b[20];
+  a[5] = b[21];
+  a[6] = b[22];
+  a[7] = b[23];
+  a[8] = b[24];
+  a[9] = b[25];
+  a[10] = b[26];
+  a[11] = b[27];
+  a[12] = b[28];
+  a[13] = b[29];
+  a[14] = b[30];
+  a[15] = b[31];
+  a[16] = b[0]; /* Second half of a takes the first half of b.  */
+  a[17] = b[1];
+  a[18] = b[2];
+  a[19] = b[3];
+  a[20] = b[4];
+  a[21] = b[5];
+  a[22] = b[6];
+  a[23] = b[7];
+  a[24] = b[8];
+  a[25] = b[9];
+  a[26] = b[10];
+  a[27] = b[11];
+  a[28] = b[12];
+  a[29] = b[13];
+  a[30] = b[14];
+  a[31] = b[15];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5b.c b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
new file mode 100644
index 00000000000..d5c6ebeb5cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5b.c
@@ -0,0 +1,80 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-5a.c"
+
+void
+avx_test (void)
+{ /* Check every foo_* from pr106010-5a.c against a 64-byte expected image; abort on any mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (64);
+  _Complex double* pd_dst = (_Complex double*) malloc (64);
+  _Complex double* pd_exp = (_Complex double*) malloc (64);
+  _Complex float* ps_src = (_Complex float*) malloc (64);
+  _Complex float* ps_dst = (_Complex float*) malloc (64);
+  _Complex float* ps_exp = (_Complex float*) malloc (64);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+  _Complex int* epi32_src = (_Complex int*) malloc (64);
+  _Complex int* epi32_dst = (_Complex int*) malloc (64);
+  _Complex int* epi32_exp = (_Complex int*) malloc (64);
+  _Complex short* epi16_src = (_Complex short*) malloc (64);
+  _Complex short* epi16_dst = (_Complex short*) malloc (64);
+  _Complex short* epi16_exp = (_Complex short*) malloc (64);
+  _Complex char* epi8_src = (_Complex char*) malloc (64);
+  _Complex char* epi8_dst = (_Complex char*) malloc (64);
+  _Complex char* epi8_exp = (_Complex char*) malloc (64);
+  char* p = (char* ) malloc (64); /* p: source byte pattern; q: expected byte pattern.  */
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (pd_dst, 0, 64); /* Start from all-zero destinations.  */
+  __builtin_memset (ps_dst, 0, 64);
+  __builtin_memset (epi64_dst, 0, 64);
+  __builtin_memset (epi32_dst, 0, 64);
+  __builtin_memset (epi16_dst, 0, 64);
+  __builtin_memset (epi8_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++) /* p: byte k holds k; q: p with its two 32-byte halves exchanged.  */
+    {
+      p[i] = i;
+      q[i] = (i + 32) % 64;
+    }
+  __builtin_memcpy (pd_src, p, 64);
+  __builtin_memcpy (ps_src, p, 64);
+  __builtin_memcpy (epi64_src, p, 64);
+  __builtin_memcpy (epi32_src, p, 64);
+  __builtin_memcpy (epi16_src, p, 64);
+  __builtin_memcpy (epi8_src, p, 64);
+
+  __builtin_memcpy (pd_exp, q, 64); /* All six element types share the same expected byte image.  */
+  __builtin_memcpy (ps_exp, q, 64);
+  __builtin_memcpy (epi64_exp, q, 64);
+  __builtin_memcpy (epi32_exp, q, 64);
+  __builtin_memcpy (epi16_exp, q, 64);
+  __builtin_memcpy (epi8_exp, q, 64);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+
+  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* Compare each result byte-for-byte with its expected image.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-5c.c b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
new file mode 100644
index 00000000000..9ce4e6dd5c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-5c.c
@@ -0,0 +1,62 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 4 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Copy sixteen _Complex _Float16 elements with the two halves exchanged: a[0..7] = b[8..15], a[8..15] = b[0..7].  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b[8];
+  a[1] = b[9];
+  a[2] = b[10];
+  a[3] = b[11];
+  a[4] = b[12];
+  a[5] = b[13];
+  a[6] = b[14];
+  a[7] = b[15];
+  a[8] = b[0];
+  a[9] = b[1];
+  a[10] = b[2];
+  a[11] = b[3];
+  a[12] = b[4];
+  a[13] = b[5];
+  a[14] = b[6];
+  a[15] = b[7];
+}
+
+void
+do_test (void)
+{ /* Run foo_ph on a 64-byte pattern and abort unless the result matches the expected half-exchanged bytes.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64);
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (ph_dst, 0, 64); /* Start from an all-zero destination.  */
+
+  for (int i = 0; i != 64; i++) /* p: byte k holds k; q: p with its two 32-byte halves exchanged.  */
+    {
+      p[i] = i;
+      q[i] = (i + 32) % 64;
+    }
+  __builtin_memcpy (ph_src, p, 64);
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6a.c b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
new file mode 100644
index 00000000000..65a90d03684
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6a.c
@@ -0,0 +1,115 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-slp-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 6 "slp2" } }*/
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 6, 7, 4, 5, 2, 3, 0, 1 \}} 4 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double* __restrict b)
+{ /* Copy four _Complex double elements from b to a in reverse order.  Written as separate assignments; this file's dg-final directives scan the slp2 dump for the vectorized result.  */
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float* __restrict b)
+{ /* Copy eight _Complex float elements from b to a in reverse order.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long* __restrict b)
+{ /* Copy four _Complex long long elements from b to a in reverse order.  */
+  a[0] = b[3];
+  a[1] = b[2];
+  a[2] = b[1];
+  a[3] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int* __restrict b)
+{ /* Copy eight _Complex int elements from b to a in reverse order.  */
+  a[0] = b[7];
+  a[1] = b[6];
+  a[2] = b[5];
+  a[3] = b[4];
+  a[4] = b[3];
+  a[5] = b[2];
+  a[6] = b[1];
+  a[7] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short* __restrict b)
+{ /* Copy sixteen _Complex short elements from b to a in reverse order.  */
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char* __restrict b)
+{ /* Copy thirty-two _Complex char elements from b to a in reverse order.  */
+  a[0] = b[31];
+  a[1] = b[30];
+  a[2] = b[29];
+  a[3] = b[28];
+  a[4] = b[27];
+  a[5] = b[26];
+  a[6] = b[25];
+  a[7] = b[24];
+  a[8] = b[23];
+  a[9] = b[22];
+  a[10] = b[21];
+  a[11] = b[20];
+  a[12] = b[19];
+  a[13] = b[18];
+  a[14] = b[17];
+  a[15] = b[16];
+  a[16] = b[15];
+  a[17] = b[14];
+  a[18] = b[13];
+  a[19] = b[12];
+  a[20] = b[11];
+  a[21] = b[10];
+  a[22] = b[9];
+  a[23] = b[8];
+  a[24] = b[7];
+  a[25] = b[6];
+  a[26] = b[5];
+  a[27] = b[4];
+  a[28] = b[3];
+  a[29] = b[2];
+  a[30] = b[1];
+  a[31] = b[0];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6b.c b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
new file mode 100644
index 00000000000..1c5bb020939
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6b.c
@@ -0,0 +1,157 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx2 -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx2 } */
+
+#include "avx2-check.h"
+#include <string.h>
+#include "pr106010-6a.c"
+
+void
+avx2_test (void)
+{ /* Check every foo_* from pr106010-6a.c against byte-level expected results; abort on any mismatch.  */
+  _Complex double* pd_src = (_Complex double*) malloc (64);
+  _Complex double* pd_dst = (_Complex double*) malloc (64);
+  _Complex double* pd_exp = (_Complex double*) malloc (64);
+  _Complex float* ps_src = (_Complex float*) malloc (64);
+  _Complex float* ps_dst = (_Complex float*) malloc (64);
+  _Complex float* ps_exp = (_Complex float*) malloc (64);
+  _Complex long long* epi64_src = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (64);
+  _Complex long long* epi64_exp = (_Complex long long*) malloc (64);
+  _Complex int* epi32_src = (_Complex int*) malloc (64);
+  _Complex int* epi32_dst = (_Complex int*) malloc (64);
+  _Complex int* epi32_exp = (_Complex int*) malloc (64);
+  _Complex short* epi16_src = (_Complex short*) malloc (64);
+  _Complex short* epi16_dst = (_Complex short*) malloc (64);
+  _Complex short* epi16_exp = (_Complex short*) malloc (64);
+  _Complex char* epi8_src = (_Complex char*) malloc (64);
+  _Complex char* epi8_dst = (_Complex char*) malloc (64);
+  _Complex char* epi8_exp = (_Complex char*) malloc (64);
+  char* p = (char* ) malloc (64); /* p: source byte pattern; q: expected byte pattern, rebuilt per chunk size.  */
+  char* q = (char* ) malloc (64);
+
+  __builtin_memset (pd_dst, 0, 64); /* Start from all-zero destinations.  */
+  __builtin_memset (ps_dst, 0, 64);
+  __builtin_memset (epi64_dst, 0, 64);
+  __builtin_memset (epi32_dst, 0, 64);
+  __builtin_memset (epi16_dst, 0, 64);
+  __builtin_memset (epi8_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++) /* Source pattern: byte k holds the value k.  */
+    p[i] = i;
+
+  __builtin_memcpy (pd_src, p, 64);
+  __builtin_memcpy (ps_src, p, 64);
+  __builtin_memcpy (epi64_src, p, 64);
+  __builtin_memcpy (epi32_src, p, 64);
+  __builtin_memcpy (epi16_src, p, 64);
+  __builtin_memcpy (epi8_src, p, 64);
+
+
+  for (int i = 0; i != 16; i++) /* Expected pd/epi64: the four 16-byte chunks reversed.  */
+    {
+      q[i] = i + 48;
+      q[i + 16] = i + 32;
+      q[i + 32] = i + 16;
+      q[i + 48] = i;
+    }
+ 
+  __builtin_memcpy (pd_exp, q, 64);
+  __builtin_memcpy (epi64_exp, q, 64);
+
+   for (int i = 0; i != 8; i++) /* Expected ps/epi32: the eight 8-byte chunks reversed.  */
+    {
+      q[i] = i + 56;
+      q[i + 8] = i + 48;
+      q[i + 16] = i + 40;
+      q[i + 24] = i + 32;
+      q[i + 32] = i + 24;
+      q[i + 40] = i + 16;
+      q[i + 48] = i + 8;
+      q[i + 56] = i;
+    }
+
+  __builtin_memcpy (ps_exp, q, 64);
+  __builtin_memcpy (epi32_exp, q, 64);
+
+  for (int i = 0; i != 4; i++) /* Expected epi16: the sixteen 4-byte chunks reversed.  */
+    {
+      q[i] = i + 60;
+      q[i + 4] = i + 56;
+      q[i + 8] = i + 52;
+      q[i + 12] = i + 48;
+      q[i + 16] = i + 44;
+      q[i + 20] = i + 40;
+      q[i + 24] = i + 36;
+      q[i + 28] = i + 32;
+      q[i + 32] = i + 28;
+      q[i + 36] = i + 24;
+      q[i + 40] = i + 20;
+      q[i + 44] = i + 16;
+      q[i + 48] = i + 12;
+      q[i + 52] = i + 8;
+      q[i + 56] = i + 4;
+      q[i + 60] = i;
+    }
+
+  __builtin_memcpy (epi16_exp, q, 64);
+
+  for (int i = 0; i != 2; i++) /* Expected epi8: the thirty-two 2-byte chunks reversed.  */
+    {
+      q[i] = i + 62;
+      q[i + 2] = i + 60;
+      q[i + 4] = i + 58;
+      q[i + 6] = i + 56;
+      q[i + 8] = i + 54;
+      q[i + 10] = i + 52;
+      q[i + 12] = i + 50;
+      q[i + 14] = i + 48;
+      q[i + 16] = i + 46;
+      q[i + 18] = i + 44;
+      q[i + 20] = i + 42;
+      q[i + 22] = i + 40;
+      q[i + 24] = i + 38;
+      q[i + 26] = i + 36;
+      q[i + 28] = i + 34;
+      q[i + 30] = i + 32;
+      q[i + 32] = i + 30;
+      q[i + 34] = i + 28;
+      q[i + 36] = i + 26;
+      q[i + 38] = i + 24;
+      q[i + 40] = i + 22;
+      q[i + 42] = i + 20;
+      q[i + 44] = i + 18;
+      q[i + 46] = i + 16;
+      q[i + 48] = i + 14;
+      q[i + 50] = i + 12;
+      q[i + 52] = i + 10;
+      q[i + 54] = i + 8;
+      q[i + 56] = i + 6;
+      q[i + 58] = i + 4;
+      q[i + 60] = i + 2;
+      q[i + 62] = i;
+    }
+  __builtin_memcpy (epi8_exp, q, 64);
+
+  foo_pd (pd_dst, pd_src);
+  foo_ps (ps_dst, ps_src);
+  foo_epi64 (epi64_dst, epi64_src);
+  foo_epi32 (epi32_dst, epi32_src);
+  foo_epi16 (epi16_dst, epi16_src);
+  foo_epi8 (epi8_dst, epi8_src);
+
+  if (__builtin_memcmp (pd_dst, pd_exp, 64) != 0) /* Compare each result byte-for-byte with its expected image.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_exp, 64) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_exp, 64) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-6c.c b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
new file mode 100644
index 00000000000..b859d884a7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-6c.c
@@ -0,0 +1,80 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-slp-details" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*VEC_PERM_EXPR.*\{ 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 \}} 2 "slp2" } }  */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized using (?:32|64) byte vectors" 1 "slp2" } } */
+
+#include <string.h>
+
+static void do_test (void);
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16* __restrict b)
+{ /* Reverse copy of 16 elements: a[k] = b[15 - k] for k = 0..15.  */
+  a[0] = b[15];
+  a[1] = b[14];
+  a[2] = b[13];
+  a[3] = b[12];
+  a[4] = b[11];
+  a[5] = b[10];
+  a[6] = b[9];
+  a[7] = b[8];
+  a[8] = b[7];
+  a[9] = b[6];
+  a[10] = b[5];
+  a[11] = b[4];
+  a[12] = b[3];
+  a[13] = b[2];
+  a[14] = b[1];
+  a[15] = b[0];
+}
+
+static void /* static, matching the forward declaration used for DO_TEST.  */
+do_test (void)
+{ /* Runs foo_ph on a known byte pattern and aborts if the result differs from the expected bytes.  */
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (64);
+  _Complex _Float16* ph_exp = (_Complex _Float16*) malloc (64);
+  char* p = (char* ) malloc (64); /* Source byte pattern.  */
+  char* q = (char* ) malloc (64); /* Expected byte pattern.  */
+
+  __builtin_memset (ph_dst, 0, 64);
+
+  for (int i = 0; i != 64; i++) /* Source bytes are 0, 1, ..., 63.  */
+    p[i] = i;
+
+  __builtin_memcpy (ph_src, p, 64);
+
+  for (int i = 0; i != 4; i++) /* Expected: the sixteen 4-byte groups of p in reverse group order.  */
+    {
+      q[i] = i + 60;
+      q[i + 4] = i + 56;
+      q[i + 8] = i + 52;
+      q[i + 12] = i + 48;
+      q[i + 16] = i + 44;
+      q[i + 20] = i + 40;
+      q[i + 24] = i + 36;
+      q[i + 28] = i + 32;
+      q[i + 32] = i + 28;
+      q[i + 36] = i + 24;
+      q[i + 40] = i + 20;
+      q[i + 44] = i + 16;
+      q[i + 48] = i + 12;
+      q[i + 52] = i + 8;
+      q[i + 56] = i + 4;
+      q[i + 60] = i;
+    }
+
+  __builtin_memcpy (ph_exp, q, 64);
+
+  foo_ph (ph_dst, ph_src);
+
+  if (__builtin_memcmp (ph_dst, ph_exp, 64) != 0) /* Byte-wise check of the reversed copy.  */
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7a.c b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
new file mode 100644
index 00000000000..2ea01fac927
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a, _Complex double b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a, _Complex float b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a, _Complex long long b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a, _Complex int b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a, _Complex short b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a, _Complex char b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
new file mode 100644
index 00000000000..26482cc10f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -0,0 +1,63 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-7a.c"
+
+void /* Driver: seeds every *_src, runs the six kernels, aborts unless each *_dst equals its *_src.  */
+avx_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float* ps_src = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long* epi64_src = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int* epi32_src = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short* epi16_src = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char* epi8_src = (_Complex char*) malloc (2 * N * sizeof (char));
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+  char* p_init = (char*) malloc (2 * N * sizeof (double)); /* Sized for the largest (double) array.  */
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double));
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+  for (int i = 0; i != 2 * N * sizeof (double); i++) /* Bytes 3,4,3,4,...: the pattern repeats every 2 bytes.  */
+    p_init[i] = i % 2 + 3;
+
+  memcpy (pd_src, p_init, 2 * N * sizeof (double)); /* Seed the sources, not the destinations, so the kernels read defined values.  */
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst, pd_src[0]); /* Each kernel fills its destination with the first source element.  */
+  foo_ps (ps_dst, ps_src[0]);
+  foo_epi64 (epi64_dst, epi64_src[0]);
+  foo_epi32 (epi32_dst, epi32_src[0]);
+  foo_epi16 (epi16_dst, epi16_src[0]);
+  foo_epi8 (epi8_dst, epi8_src[0]);
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0) /* Whole-array compare works because the seed pattern repeats every 2 bytes.  */
+    __builtin_abort ();
+  if (__builtin_memcmp (ps_dst, ps_src, N * 2 * sizeof (float)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi64_dst, epi64_src, N * 2 * sizeof (long long)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi32_dst, epi32_src, N * 2 * sizeof (int)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi16_dst, epi16_src, N * 2 * sizeof (short)) != 0)
+    __builtin_abort ();
+  if (__builtin_memcmp (epi8_dst, epi8_src, N * 2 * sizeof (char)) != 0)
+    __builtin_abort ();
+
+  return;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7c.c b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
new file mode 100644
index 00000000000..7f4056a5ecc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7c.c
@@ -0,0 +1,41 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a, _Complex _Float16 b)
+{ /* Store the by-value complex b into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = b;
+}
+
+static void /* Driver: seeds ph_src, runs foo_ph with its first element, aborts unless ph_dst equals ph_src.  */
+do_test (void)
+{
+  _Complex _Float16* ph_src = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+  char* p_init = (char*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16));
+
+  for (int i = 0; i != 2 * N * sizeof (_Float16); i++) /* Bytes 3,4,3,4,...: the pattern repeats every 2 bytes.  */
+    p_init[i] = i % 2 + 3;
+
+  memcpy (ph_src, p_init, 2 * N * sizeof (_Float16));
+
+  foo_ph (ph_dst, ph_src[0]);
+  if (__builtin_memcmp (ph_dst, ph_src, N * 2 * sizeof (_Float16)) != 0) /* Whole-array compare works because the seed pattern repeats every 2 bytes.  */
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8a.c b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
new file mode 100644
index 00000000000..11054b60d30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8a.c
@@ -0,0 +1,58 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -fdump-tree-vect-details -mprefer-vector-width=256" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) double>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) float>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(4\) long long int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(8\) int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) short int>} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(32\) char>} 1 "vect" } } */
+
+#define N 10000
+void
+__attribute__((noipa))
+foo_pd (_Complex double* a)
+{ /* Store the complex constant 1.0 + 2.0i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0 + 2.0i;
+}
+
+void
+__attribute__((noipa))
+foo_ps (_Complex float* a)
+{ /* Store the complex constant 1.0f + 2.0fi into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f + 2.0fi;
+}
+
+void
+__attribute__((noipa))
+foo_epi64 (_Complex long long* a)
+{ /* Store the complex constant 1 + 2i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi32 (_Complex int* a)
+{ /* Store the complex constant 1 + 2i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi16 (_Complex short* a)
+{ /* Store the complex constant 1 + 2i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
+
+void
+__attribute__((noipa))
+foo_epi8 (_Complex char* a)
+{ /* Store the complex constant 1 + 2i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1 + 2i;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8b.c b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
new file mode 100644
index 00000000000..6bb0073b691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8b.c
@@ -0,0 +1,53 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+#include <string.h>
+#include "pr106010-8a.c"
+
+void /* Driver: runs the six constant-store kernels and aborts unless every element equals 1 + 2i.  */
+avx_test (void)
+{
+  _Complex double pd_src = 1.0 + 2.0i; /* The *_src scalars hold the value each kernel is expected to store.  */
+  _Complex double* pd_dst = (_Complex double*) malloc (2 * N * sizeof (double));
+  _Complex float ps_src = 1.0 + 2.0i;
+  _Complex float* ps_dst = (_Complex float*) malloc (2 * N * sizeof (float));
+  _Complex long long epi64_src = 1 + 2i;
+  _Complex long long* epi64_dst = (_Complex long long*) malloc (2 * N * sizeof (long long));
+  _Complex int epi32_src = 1 + 2i;
+  _Complex int* epi32_dst = (_Complex int*) malloc (2 * N * sizeof (int));
+  _Complex short epi16_src = 1 + 2i;
+  _Complex short* epi16_dst = (_Complex short*) malloc (2 * N * sizeof (short));
+  _Complex char epi8_src = 1 + 2i;
+  _Complex char* epi8_dst = (_Complex char*) malloc (2 * N * sizeof (char));
+
+  __builtin_memset (pd_dst, 0, 2 * N * sizeof (double)); /* Zero the destinations so a skipped store is detected.  */
+  __builtin_memset (ps_dst, 0, 2 * N * sizeof (float));
+  __builtin_memset (epi64_dst, 0, 2 * N * sizeof (long long));
+  __builtin_memset (epi32_dst, 0, 2 * N * sizeof (int));
+  __builtin_memset (epi16_dst, 0, 2 * N * sizeof (short));
+  __builtin_memset (epi8_dst, 0, 2 * N * sizeof (char));
+
+  foo_pd (pd_dst);
+  foo_ps (ps_dst);
+  foo_epi64 (epi64_dst);
+  foo_epi32 (epi32_dst);
+  foo_epi16 (epi16_dst);
+  foo_epi8 (epi8_dst);
+  for (int i = 0 ; i != N; i++) /* Element-wise value comparison against the reference scalars.  */
+    {
+      if (pd_dst[i] != pd_src)
+	__builtin_abort ();
+      if (ps_dst[i] != ps_src)
+	__builtin_abort ();
+      if (epi64_dst[i] != epi64_src)
+	__builtin_abort ();
+      if (epi32_dst[i] != epi32_src)
+	__builtin_abort ();
+      if (epi16_dst[i] != epi16_src)
+	__builtin_abort ();
+      if (epi8_dst[i] != epi8_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-8c.c b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
new file mode 100644
index 00000000000..61ae131829d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-8c.c
@@ -0,0 +1,38 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -ftree-vectorize -fvect-cost-model=unlimited -mprefer-vector-width=256 -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)add new stmt:.*MEM <vector\(16\) _Float16>} 1 "vect" } } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#include <string.h>
+
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512FP16
+#include "avx512-check.h"
+
+#define N 10000
+
+void
+__attribute__((noipa))
+foo_ph (_Complex _Float16* a)
+{ /* Store the complex constant 1.0f16 + 2.0f16i into each of a[0] .. a[N-1].  */
+  for (int i = 0; i != N; i++)
+    a[i] = 1.0f16 + 2.0f16i;
+}
+
+static void /* Driver: runs foo_ph and aborts unless every element equals 1.0f16 + 2.0f16i.  */
+do_test (void)
+{
+  _Complex _Float16 ph_src = 1.0f16 + 2.0f16i; /* Reference value the kernel is expected to store.  */
+  _Complex _Float16* ph_dst = (_Complex _Float16*) malloc (2 * N * sizeof (_Float16));
+
+  __builtin_memset (ph_dst, 0, 2 * N * sizeof (_Float16)); /* Zero first so a skipped store is detected.  */
+
+  foo_ph (ph_dst);
+  for (int i = 0; i != N; i++)
+    {
+      if (ph_dst[i] != ph_src)
+	__builtin_abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9a.c b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
new file mode 100644
index 00000000000..e922f7b5400
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9a.c
@@ -0,0 +1,89 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" } } */
+
+typedef struct { _Complex double c; double a1; double a2;}
+  cdf; /* One complex double followed by two doubles.  */
+typedef struct { _Complex double c; double a1; double a2; double a3; double a4;}
+  cdf2; /* One complex double followed by four doubles.  */
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2; double a3; double a4;}
+  cdf3; /* Two complex doubles followed by four doubles.  */
+typedef struct { _Complex double c1; _Complex double c2; double a1; double a2;}
+  cdf4; /* Two complex doubles followed by two doubles.  */
+
+#define N 100
+/* VMAT_ELEMENTWISE.  */
+void
+__attribute__((noipa))
+foo (cdf* a, cdf* __restrict b)
+{ /* Field-by-field copy of N cdf records from b to a.  */
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c = b[i].c;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+    }
+}
+
+/* VMAT_CONTIGUOUS_PERMUTE.  */
+void
+__attribute__((noipa))
+foo1 (cdf2* a, cdf2* __restrict b)
+{ /* Field-by-field copy of N cdf2 records from b to a.  */
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c = b[i].c;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+      a[i].a3 = b[i].a3;
+      a[i].a4 = b[i].a4;
+    }
+}
+
+/* VMAT_CONTIGUOUS.  */
+void
+__attribute__((noipa))
+foo2 (cdf3* a, cdf3* __restrict b)
+{ /* Field-by-field copy of N cdf3 records from b to a.  */
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c1 = b[i].c1;
+      a[i].c2 = b[i].c2;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+      a[i].a3 = b[i].a3;
+      a[i].a4 = b[i].a4;
+    }
+}
+
+/* VMAT_STRIDED_SLP.  */
+void
+__attribute__((noipa))
+foo3 (cdf4* a, cdf4* __restrict b)
+{ /* Field-by-field copy of N cdf4 records from b to a.  */
+   for (int i = 0; i < N; ++i)
+    {
+      a[i].c1 = b[i].c1;
+      a[i].c2 = b[i].c2;
+      a[i].a1 = b[i].a1;
+      a[i].a2 = b[i].a2;
+    }
+}
+
+/* VMAT_CONTIGUOUS_REVERSE.  */
+void
+__attribute__((noipa))
+foo4 (_Complex double* a, _Complex double* __restrict b)
+{ /* Reverse copy: a[i] = b[N-1-i] for i in [0, N).  */
+  for (int i = 0; i != N; i++)
+    a[i] = b[N-i-1];
+}
+
+/* VMAT_CONTIGUOUS_DOWN.  */
+void
+__attribute__((noipa))
+foo5 (_Complex double* a, _Complex double* __restrict b)
+{ /* Store b[0] into every element of a, walking a from index N-1 down to 0.  */
+  for (int i = 0; i != N; i++)
+    a[N-i-1] = b[0];
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9b.c b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
new file mode 100644
index 00000000000..e220445e6e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9b.c
@@ -0,0 +1,90 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <string.h>
+#include "sse2-check.h"
+#include "pr106010-9a.c"
+
+static void /* Driver: seeds the sources, runs foo..foo5, aborts if any destination differs from its expected bytes.  */
+sse2_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  
+  char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte pattern, sized for N cdf3 records.  */
+
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+  for (int i = 0; i != N * sizeof (cdf3); i++) /* Byte k holds k converted to char.  */
+    p_init[i] = i;
+
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++) /* Pattern repeats every 16 bytes, so foo5's result should equal pd_src2.  */
+    p_init[i] = i % 16;
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Build foo4's expected output: the 16-byte chunks of the first pattern in reverse chunk order.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16); /* pd_src is overwritten and now holds the expected result for pd_dst.  */
+ 
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9c.c b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
new file mode 100644
index 00000000000..ff51f6195b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9c.c
@@ -0,0 +1,90 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx2 -fvect-cost-model=unlimited" } */
+/* { dg-require-effective-target avx2 } */
+
+#include <string.h>
+#include "avx2-check.h"
+#include "pr106010-9a.c"
+
+static void /* Driver: seeds the sources, runs foo..foo5, aborts if any destination differs from its expected bytes.  */
+avx2_test (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  
+  char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte pattern, sized for N cdf3 records.  */
+
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+  for (int i = 0; i != N * sizeof (cdf3); i++) /* Byte k holds k converted to char.  */
+    p_init[i] = i;
+
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++) /* Pattern repeats every 16 bytes, so foo5's result should equal pd_src2.  */
+    p_init[i] = i % 16;
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Build foo4's expected output: the 16-byte chunks of the first pattern in reverse chunk order.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16); /* pd_src is overwritten and now holds the expected result for pd_dst.  */
+ 
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106010-9d.c b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
new file mode 100644
index 00000000000..d4d8f1dd722
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106010-9d.c
@@ -0,0 +1,92 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx512f -mavx512vl -fvect-cost-model=unlimited -mprefer-vector-width=512" } */
+/* { dg-require-effective-target avx512f } */
+
+#include <string.h>
+#include <stdlib.h>
+#define AVX512F
+#include "avx512-check.h"
+#include "pr106010-9a.c"
+
+static void /* Driver: seeds the sources, runs foo..foo5, aborts if any destination differs from its expected bytes.  */
+test_512 (void)
+{
+  _Complex double* pd_src = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_src2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  _Complex double* pd_dst2 = (_Complex double*) malloc (N * sizeof (_Complex double));
+  cdf* cdf_src = (cdf*) malloc (N * sizeof (cdf));
+  cdf* cdf_dst = (cdf*) malloc (N * sizeof (cdf));
+  cdf2* cdf2_src = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf2* cdf2_dst = (cdf2*) malloc (N * sizeof (cdf2));
+  cdf3* cdf3_src = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf3* cdf3_dst = (cdf3*) malloc (N * sizeof (cdf3));
+  cdf4* cdf4_src = (cdf4*) malloc (N * sizeof (cdf4));
+  cdf4* cdf4_dst = (cdf4*) malloc (N * sizeof (cdf4));
+  
+  char* p_init = (char*) malloc (N * sizeof (cdf3)); /* Scratch byte pattern, sized for N cdf3 records.  */
+
+  __builtin_memset (cdf_dst, 0, N * sizeof (cdf));
+  __builtin_memset (cdf2_dst, 0, N * sizeof (cdf2));
+  __builtin_memset (cdf3_dst, 0, N * sizeof (cdf3));
+  __builtin_memset (cdf4_dst, 0, N * sizeof (cdf4));
+  __builtin_memset (pd_dst, 0, N * sizeof (_Complex double));
+  __builtin_memset (pd_dst2, 0, N * sizeof (_Complex double));
+
+  for (int i = 0; i != N * sizeof (cdf3); i++) /* Byte k holds k converted to char.  */
+    p_init[i] = i;
+
+  memcpy (cdf_src, p_init, N * sizeof (cdf));
+  memcpy (cdf2_src, p_init, N * sizeof (cdf2));
+  memcpy (cdf3_src, p_init, N * sizeof (cdf3));
+  memcpy (cdf4_src, p_init, N * sizeof (cdf4));
+  memcpy (pd_src, p_init, N * sizeof (_Complex double));
+  for (int i = 0; i != 2 * N * sizeof (double); i++) /* Pattern repeats every 16 bytes, so foo5's result should equal pd_src2.  */
+    p_init[i] = i % 16;
+  memcpy (pd_src2, p_init, N * sizeof (_Complex double));
+
+  foo (cdf_dst, cdf_src);
+  foo1 (cdf2_dst, cdf2_src);
+  foo2 (cdf3_dst, cdf3_src);
+  foo3 (cdf4_dst, cdf4_src);
+  foo4 (pd_dst, pd_src);
+  foo5 (pd_dst2, pd_src2);
+  for (int i = 0; i != N; i++) /* Build foo4's expected output: the 16-byte chunks of the first pattern in reverse chunk order.  */
+    {
+      p_init[(N - i - 1) * 16] = i * 16;
+      p_init[(N - i - 1) * 16 + 1] = i * 16 + 1;
+      p_init[(N - i - 1) * 16 + 2] = i * 16 + 2;
+      p_init[(N - i - 1) * 16 + 3] = i * 16 + 3;
+      p_init[(N - i - 1) * 16 + 4] = i * 16 + 4;
+      p_init[(N - i - 1) * 16 + 5] = i * 16 + 5;
+      p_init[(N - i - 1) * 16 + 6] = i * 16 + 6;
+      p_init[(N - i - 1) * 16 + 7] = i * 16 + 7;
+      p_init[(N - i - 1) * 16 + 8] = i * 16 + 8;
+      p_init[(N - i - 1) * 16 + 9] = i * 16 + 9;
+      p_init[(N - i - 1) * 16 + 10] = i * 16 + 10;
+      p_init[(N - i - 1) * 16 + 11] = i * 16 + 11;
+      p_init[(N - i - 1) * 16 + 12] = i * 16 + 12;
+      p_init[(N - i - 1) * 16 + 13] = i * 16 + 13;
+      p_init[(N - i - 1) * 16 + 14] = i * 16 + 14;
+      p_init[(N - i - 1) * 16 + 15] = i * 16 + 15;
+    }
+  memcpy (pd_src, p_init, N * 16); /* pd_src is overwritten and now holds the expected result for pd_dst.  */
+ 
+  if (__builtin_memcmp (pd_dst, pd_src, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (pd_dst2, pd_src2, N * 2 * sizeof (double)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf_dst, cdf_src, N * sizeof (cdf)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf2_dst, cdf2_src, N * sizeof (cdf2)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf3_dst, cdf3_src, N * sizeof (cdf3)) != 0)
+    __builtin_abort ();
+
+  if (__builtin_memcmp (cdf4_dst, cdf4_src, N * sizeof (cdf4)) != 0)
+    __builtin_abort ();
+}
diff --git a/gcc/tree-complex.cc b/gcc/tree-complex.cc
index 61950a0f099..ea9df6114a1 100644
--- a/gcc/tree-complex.cc
+++ b/gcc/tree-complex.cc
@@ -297,6 +297,11 @@  init_dont_simulate_again (void)
 		break;
 
 	      default:
+		/* When expand_complex_move would trigger make sure we
+		   perform lowering even when there is no actual complex
+		   operation.  This helps consistency and vectorization.  */
+		if (TREE_CODE (TREE_TYPE (gimple_op (stmt, 0))) == COMPLEX_TYPE)
+		  saw_a_complex_op = true;
 		break;
 	      }
 
@@ -869,7 +874,9 @@  expand_complex_move (gimple_stmt_iterator *gsi, tree type)
 	  update_complex_assignment (gsi, r, i);
 	}
     }
-  else if (rhs && TREE_CODE (rhs) == SSA_NAME && !TREE_SIDE_EFFECTS (lhs))
+  else if (rhs
+	   && (TREE_CODE (rhs) == SSA_NAME || TREE_CODE (rhs) == COMPLEX_CST)
+	   && !TREE_SIDE_EFFECTS (lhs))
     {
       tree x;
       gimple *t;