Add testcases for ffs/ctz vectorization.

Message ID 20230423030258.194509-1-hongtao.liu@intel.com
State Accepted
Headers
Series Add testcases for ffs/ctz vectorization. |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

liuhongt April 23, 2023, 3:02 a.m. UTC
  Ready to push to trunk.

gcc/testsuite/ChangeLog:

	PR tree-optimization/109011
	* gcc.target/i386/pr109011-b1.c: New test.
	* gcc.target/i386/pr109011-b2.c: New test.
	* gcc.target/i386/pr109011-d1.c: New test.
	* gcc.target/i386/pr109011-d2.c: New test.
	* gcc.target/i386/pr109011-dq1.c: New test.
	* gcc.target/i386/pr109011-dq2.c: New test.
	* gcc.target/i386/pr109011-q1.c: New test.
	* gcc.target/i386/pr109011-q2.c: New test.
	* gcc.target/i386/pr109011-w1.c: New test.
	* gcc.target/i386/pr109011-w2.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr109011-b1.c  |  53 +++++++++
 gcc/testsuite/gcc.target/i386/pr109011-b2.c  | 104 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr109011-d1.c  |  46 ++++++++
 gcc/testsuite/gcc.target/i386/pr109011-d2.c  | 118 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr109011-dq1.c |  46 ++++++++
 gcc/testsuite/gcc.target/i386/pr109011-dq2.c | 104 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr109011-q1.c  |  46 ++++++++
 gcc/testsuite/gcc.target/i386/pr109011-q2.c  | 118 +++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr109011-w1.c  |  47 ++++++++
 gcc/testsuite/gcc.target/i386/pr109011-w2.c  | 104 ++++++++++++++++
 10 files changed, 786 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w2.c
  

Patch

diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b1.c b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
new file mode 100644
index 00000000000..9833d3526f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \t\]+" 4 } } */
+/* 4 vplzcntd come from function clzb, the other 4 come from function clzb0.  */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 8 } } */
+
+void
+__attribute__((noipa))
+popcntb (unsigned char *p, unsigned char *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa))
+ffsb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa))
+ctzb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctz (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 8;
+}
+
+void
+__attribute__((noipa))
+ctzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 8;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b2.c b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
new file mode 100644
index 00000000000..7f2042645d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-b1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntb_scalar (unsigned char *p, unsigned char *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb0_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 8;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzb0_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 8;
+}
+
+void
+test_256 ()
+{
+  unsigned char src[2048];
+  unsigned char res[2048];
+  unsigned char exp[2048];
+  for (int i = 0; i != 2048; i++)
+    {
+      src[i] = i * i - 1;
+      res[i] = 0;
+      exp[i] = 1;
+    }
+
+  popcntb (&res[0], &src[0]);
+  popcntb_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048) != 0)
+    __builtin_abort ();
+
+  clzb (&res[0], &src[0]);
+  clzb_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048) != 0)
+    __builtin_abort ();
+
+  ffsb (&res[0], &src[0]);
+  ffsb_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048) != 0)
+    __builtin_abort ();
+
+  clzb0 (&res[0], &src[0]);
+  clzb0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048) != 0)
+    __builtin_abort ();
+
+  ctzb0 (&res[0], &src[0]);
+  ctzb0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048) != 0)
+    __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d1.c b/gcc/testsuite/gcc.target/i386/pr109011-d1.c
new file mode 100644
index 00000000000..23eb2d57e07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-d1.c
@@ -0,0 +1,46 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+"  5 } } */
+
+void
+popcntd (unsigned int *p, unsigned int *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+clzd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+ffsd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+ctzd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctz (q[i]);
+}
+
+void
+clzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 32;
+}
+
+void
+ctzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 32;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d2.c b/gcc/testsuite/gcc.target/i386/pr109011-d2.c
new file mode 100644
index 00000000000..f6fb78d1df0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-d2.c
@@ -0,0 +1,118 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-d1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 32;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 32;
+}
+
+void
+test_256 ()
+{
+  unsigned int src[2048];
+  unsigned int res[2048];
+  unsigned int exp[2048];
+  for (int i = 0; i != 2048; i++)
+    {
+      src[i] = i * i - 1;
+      res[i] = 0;
+      exp[i] = 1;
+    }
+
+  popcntd (&res[0], &src[0]);
+  popcntd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  clzd (&res[0], &src[0]);
+  clzd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+    __builtin_abort ();
+
+  ffsd (&res[0], &src[0]);
+  ffsd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  ctzd (&res[0], &src[0]);
+  ctzd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  clzd0 (&res[0], &src[0]);
+  clzd0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  ctzd0 (&res[0], &src[0]);
+  ctzd0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq1.c b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c
new file mode 100644
index 00000000000..876dce01946
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c
@@ -0,0 +1,46 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+"  5 } } */
+
+void
+popcntd (unsigned int *p, unsigned int *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+clzd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+void
+ffsd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+ctzd (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+clzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clzll (q[i]) : 32;
+}
+
+void
+ctzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctzll (q[i]) : 32;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq2.c b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c
new file mode 100644
index 00000000000..ceb6655a6d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-dq1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clzll (q[i]) : 32;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctzll (q[i]) : 32;
+}
+
+void
+test_256 ()
+{
+  unsigned int src[2048];
+  unsigned int res[2048];
+  unsigned int exp[2048];
+  for (int i = 0; i != 2048; i++)
+    {
+      src[i] = i * i - 1;
+      res[i] = 0;
+      exp[i] = 1;
+    }
+
+  popcntd (&res[0], &src[0]);
+  popcntd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  clzd (&res[0], &src[0]);
+  clzd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+    __builtin_abort ();
+
+  ffsd (&res[0], &src[0]);
+  ffsd_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  clzd0 (&res[0], &src[0]);
+  clzd0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+
+  ctzd0 (&res[0], &src[0]);
+  ctzd0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+    __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q1.c b/gcc/testsuite/gcc.target/i386/pr109011-q1.c
new file mode 100644
index 00000000000..237381c796a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-q1.c
@@ -0,0 +1,46 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntq\[ \t\]+"  5 } } */
+
+void
+popcntq (unsigned long long *p, unsigned long long *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+clzq (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+void
+ffsq (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+ctzq (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+clzq0 (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clzll (q[i]) : 64;
+}
+
+void
+ctzq0 (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctzll (q[i]) : 64;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q2.c b/gcc/testsuite/gcc.target/i386/pr109011-q2.c
new file mode 100644
index 00000000000..6f9654f0ef8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-q2.c
@@ -0,0 +1,118 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-q1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntq_scalar (unsigned long long *p, unsigned long long *q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq0_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clzll (q[i]) : 64;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq0_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+  for (unsigned long long i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctzll (q[i]) : 64;
+}
+
+void
+test_256 ()
+{
+  unsigned long long src[2048];
+  unsigned long long res[2048];
+  unsigned long long exp[2048];
+  for (unsigned long long i = 0; i != 2048ULL; i++)
+    {
+      src[i] = i * i - 1ULL;
+      res[i] = 0;
+      exp[i] = 1;
+    }
+
+  popcntq (&res[0], &src[0]);
+  popcntq_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+
+  clzq (&res[0], &src[0]);
+  clzq_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+
+  ffsq (&res[0], &src[0]);
+  ffsq_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+
+  ctzq (&res[0], &src[0]);
+  ctzq_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+
+  clzq0 (&res[0], &src[0]);
+  clzq0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+
+  ctzq0 (&res[0], &src[0]);
+  ctzq0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+    __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w1.c b/gcc/testsuite/gcc.target/i386/pr109011-w1.c
new file mode 100644
index 00000000000..f6045abe8ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-w1.c
@@ -0,0 +1,47 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \t\]+" 4 } } */
+/* 2 vplzcntd come from function clzw, the other 2 come from function clzw0.  */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 4 } } */
+
+void
+popcntw (unsigned short *p, unsigned short *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+clzw (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+ffsw (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+ctzw (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctz (q[i]);
+}
+
+void
+clzw0 (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 16;
+}
+
+void
+ctzw0 (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w2.c b/gcc/testsuite/gcc.target/i386/pr109011-w2.c
new file mode 100644
index 00000000000..15dd338eefa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-w2.c
@@ -0,0 +1,104 @@ 
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-w1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntw_scalar (unsigned short *p, unsigned short *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsw_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw0_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_clz (q[i]) : 16;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzw0_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+    p[i] = q[i] ? __builtin_ctz (q[i]) : 16;
+}
+
+void
+test_256 ()
+{
+  unsigned short src[2048];
+  unsigned short res[2048];
+  unsigned short exp[2048];
+  for (int i = 0; i != 2048; i++)
+    {
+      src[i] = i * i - 1;
+      res[i] = 0;
+      exp[i] = 1;
+    }
+
+  popcntw (&res[0], &src[0]);
+  popcntw_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+    __builtin_abort ();
+
+  clzw (&res[0], &src[0]);
+  clzw_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+    __builtin_abort ();
+
+  ffsw (&res[0], &src[0]);
+  ffsw_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+    __builtin_abort ();
+
+  clzw0 (&res[0], &src[0]);
+  clzw0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+    __builtin_abort ();
+
+  ctzw0 (&res[0], &src[0]);
+  ctzw0_scalar (&exp[0], &src[0]);
+
+  if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+    __builtin_abort ();
+}
+
+void
+test_128 ()
+{}