new file mode 100644
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \t\]+" 4 } } */
+/* 4 vplzcntd come from function clzb, the other 4 come from function clzb0. */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 8 } } */
+
+void /* popcntb: store the number of set bits of each q[i] in p[i].  */
+__attribute__((noipa))
+popcntb (unsigned char *p, unsigned char *q) /* p: output bytes; q: input bytes (not restrict-qualified).  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* The int result is narrowed to unsigned char by the store.  */
+}
+
+void /* clzb: store __builtin_clz of each q[i] in p[i].  */
+__attribute__((noipa))
+clzb (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: restrict-qualified input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsb: store __builtin_ffs of each q[i] in p[i].  */
+__attribute__((noipa))
+ffsb (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: restrict-qualified input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for a zero element.  */
+}
+
+void /* ctzb: store __builtin_ctz of each q[i] in p[i].  */
+__attribute__((noipa))
+ctzb (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: restrict-qualified input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ctz (q[i]); /* No zero guard; GCC documents __builtin_ctz (0) as undefined.  */
+}
+
+void /* clzb0: like a leading-zero count of each q[i], but with an explicit result for zero.  */
+__attribute__((noipa))
+clzb0 (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: restrict-qualified input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 8; /* Zero elements yield 8 and never reach the builtin.  */
+}
+
+void /* ctzb0: like a trailing-zero count of each q[i], but with an explicit result for zero.  */
+__attribute__((noipa))
+ctzb0 (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: restrict-qualified input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 8; /* Zero elements yield 8 and never reach the builtin.  */
+}
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-b1.c"
+
+void /* popcntb_scalar: reference popcount loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntb_scalar (unsigned char *p, unsigned char *q) /* p: output bytes; q: input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* Set-bit count of q[i], narrowed to unsigned char.  */
+}
+
+void /* clzb_scalar: reference __builtin_clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb_scalar (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsb_scalar: reference __builtin_ffs loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsb_scalar (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for zero.  */
+}
+
+void /* clzb0_scalar: reference zero-guarded clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb0_scalar (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 8; /* Zero elements yield 8 without calling the builtin.  */
+}
+
+void /* ctzb0_scalar: reference zero-guarded ctz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzb0_scalar (unsigned char *p, unsigned char* __restrict q) /* p: output bytes; q: input bytes.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 8; /* Zero elements yield 8 without calling the builtin.  */
+}
+
+void /* test_256: run each byte loop and its _scalar counterpart on one input and abort on any difference.  */
+test_256 () /* NOTE(review): presumably invoked by avx512f-helper.h -- confirm.  */
+{
+ unsigned char src[2048]; /* Shared input.  */
+ unsigned char res[2048]; /* Output of the function under test.  */
+ unsigned char exp[2048]; /* Output of the _scalar reference.  */
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1; /* Truncated to unsigned char; i == 1 produces a zero element.  */
+ res[i] = 0;
+ exp[i] = 1; /* res and exp start out different from each other.  */
+ }
+
+ popcntb (&res[0], &src[0]); /* Not defined in this file; presumably from the included pr109011-b1.c.  */
+ popcntb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0) /* Compare all 2048 bytes.  */
+ __builtin_abort ();
+
+ clzb (&res[0], &src[0]); /* NOTE(review): src holds a zero, for which __builtin_clz is documented as undefined -- confirm intent.  */
+ clzb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ ffsb (&res[0], &src[0]);
+ ffsb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ clzb0 (&res[0], &src[0]);
+ clzb0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ ctzb0 (&res[0], &src[0]);
+ ctzb0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+} /* The unguarded ctz variant is not exercised by this function.  */
+
+void /* test_128: empty body; performs no checks.  */
+test_128 () /* NOTE(review): presumably a hook expected by avx512f-helper.h -- confirm.  */
+{}
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */
+
+void /* popcntd: store the number of set bits of each q[i] in p[i].  */
+popcntd (unsigned int *p, unsigned int *q) /* p: output; q: input (not restrict-qualified).  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* Element type matches the builtin's unsigned int argument.  */
+}
+
+void /* clzd: store __builtin_clz of each q[i] in p[i].  */
+clzd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsd: store __builtin_ffs of each q[i] in p[i].  */
+ffsd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for a zero element.  */
+}
+
+void /* ctzd: store __builtin_ctz of each q[i] in p[i].  */
+ctzd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ctz (q[i]); /* No zero guard; GCC documents __builtin_ctz (0) as undefined.  */
+}
+
+void /* clzd0: leading-zero count of each q[i] with an explicit result for zero.  */
+clzd0 (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 32; /* Zero elements yield 32 and never reach the builtin.  */
+}
+
+void /* ctzd0: trailing-zero count of each q[i] with an explicit result for zero.  */
+ctzd0 (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 32; /* Zero elements yield 32 and never reach the builtin.  */
+}
new file mode 100644
@@ -0,0 +1,118 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-d1.c"
+
+void /* popcntd_scalar: reference popcount loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* Set-bit count of q[i].  */
+}
+
+void /* clzd_scalar: reference __builtin_clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsd_scalar: reference __builtin_ffs loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for zero.  */
+}
+
+void /* ctzd_scalar: reference __builtin_ctz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ctz (q[i]); /* No zero guard; GCC documents __builtin_ctz (0) as undefined.  */
+}
+
+void /* clzd0_scalar: reference zero-guarded clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 32; /* Zero elements yield 32 without calling the builtin.  */
+}
+
+void /* ctzd0_scalar: reference zero-guarded ctz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 32; /* Zero elements yield 32 without calling the builtin.  */
+}
+
+void /* test_256: run each unsigned int loop and its _scalar counterpart on one input and abort on any difference.  */
+test_256 () /* NOTE(review): presumably invoked by avx512f-helper.h -- confirm.  */
+{
+ unsigned int src[2048]; /* Shared input.  */
+ unsigned int res[2048]; /* Output of the function under test.  */
+ unsigned int exp[2048]; /* Output of the _scalar reference.  */
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1; /* i == 0 converts -1 to all-ones; i == 1 produces a zero element.  */
+ res[i] = 0;
+ exp[i] = 1; /* res and exp start out different from each other.  */
+ }
+
+ popcntd (&res[0], &src[0]); /* Not defined in this file; presumably from the included pr109011-d1.c.  */
+ popcntd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0) /* Byte count assumes a 4-byte unsigned int.  */
+ __builtin_abort ();
+
+ clzd (&res[0], &src[0]); /* NOTE(review): src holds a zero, for which __builtin_clz is documented as undefined -- confirm intent.  */
+ clzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ffsd (&res[0], &src[0]);
+ ffsd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd (&res[0], &src[0]); /* NOTE(review): same zero element reaches the unguarded __builtin_ctz -- confirm intent.  */
+ ctzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd0 (&res[0], &src[0]);
+ clzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd0 (&res[0], &src[0]);
+ ctzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+}
+
+void /* test_128: empty body; performs no checks.  */
+test_128 () /* NOTE(review): presumably a hook expected by avx512f-helper.h -- confirm.  */
+{}
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */
+
+void /* popcntd: store __builtin_popcountll of each unsigned int q[i] in p[i].  */
+popcntd (unsigned int *p, unsigned int *q) /* p: output; q: input (not restrict-qualified).  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_popcountll (q[i]); /* q[i] is converted to unsigned long long for the ll builtin.  */
+}
+
+void /* clzd: store __builtin_clzll of each unsigned int q[i] in p[i].  */
+clzd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_clzll (q[i]); /* Counted at unsigned long long width; no zero guard (undefined for 0 per GCC docs).  */
+}
+
+void /* ffsd: store __builtin_ffsll of each unsigned int q[i] in p[i].  */
+ffsd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ffsll (q[i]); /* One plus the index of the lowest set bit, or 0 for a zero element.  */
+}
+
+void /* ctzd: store __builtin_ctzll of each unsigned int q[i] in p[i].  */
+ctzd (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ctzll (q[i]); /* No zero guard; GCC documents __builtin_ctzll (0) as undefined.  */
+}
+
+void /* clzd0: __builtin_clzll of each unsigned int q[i], with an explicit result for zero.  */
+clzd0 (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 32; /* Zero elements yield 32 and never reach the builtin.  */
+}
+
+void /* ctzd0: __builtin_ctzll of each unsigned int q[i], with an explicit result for zero.  */
+ctzd0 (unsigned int *p, unsigned int* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 32; /* Zero elements yield 32 and never reach the builtin.  */
+}
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-dq1.c"
+
+void /* popcntd_scalar: reference popcountll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_popcountll (q[i]); /* q[i] is converted to unsigned long long for the ll builtin.  */
+}
+
+void /* clzd_scalar: reference clzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_clzll (q[i]); /* No zero guard; GCC documents __builtin_clzll (0) as undefined.  */
+}
+
+void /* ffsd_scalar: reference ffsll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ffsll (q[i]); /* One plus the index of the lowest set bit, or 0 for zero.  */
+}
+
+void /* clzd0_scalar: reference zero-guarded clzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 32; /* Zero elements yield 32 without calling the builtin.  */
+}
+
+void /* ctzd0_scalar: reference zero-guarded ctzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 32; /* Zero elements yield 32 without calling the builtin.  */
+}
+
+void /* test_256: run each ll-builtin loop and its _scalar counterpart on one input and abort on any difference.  */
+test_256 () /* NOTE(review): presumably invoked by avx512f-helper.h -- confirm.  */
+{
+ unsigned int src[2048]; /* Shared input.  */
+ unsigned int res[2048]; /* Output of the function under test.  */
+ unsigned int exp[2048]; /* Output of the _scalar reference.  */
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1; /* i == 0 converts -1 to all-ones; i == 1 produces a zero element.  */
+ res[i] = 0;
+ exp[i] = 1; /* res and exp start out different from each other.  */
+ }
+
+ popcntd (&res[0], &src[0]); /* Not defined in this file; presumably from the included pr109011-dq1.c.  */
+ popcntd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0) /* Byte count assumes a 4-byte unsigned int.  */
+ __builtin_abort ();
+
+ clzd (&res[0], &src[0]); /* NOTE(review): src holds a zero, for which __builtin_clzll is documented as undefined -- confirm intent.  */
+ clzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ffsd (&res[0], &src[0]);
+ ffsd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd0 (&res[0], &src[0]);
+ clzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd0 (&res[0], &src[0]);
+ ctzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+} /* The unguarded ctz variant is not exercised by this function.  */
+
+void /* test_128: empty body; performs no checks.  */
+test_128 () /* NOTE(review): presumably a hook expected by avx512f-helper.h -- confirm.  */
+{}
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntq\[ \t\]+" 5 } } */
+
+void /* popcntq: store the number of set bits of each q[i] in p[i].  */
+popcntq (unsigned long long *p, unsigned long long *q) /* p: output; q: input (not restrict-qualified).  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_popcountll (q[i]); /* Element type matches the builtin's unsigned long long argument.  */
+}
+
+void /* clzq: store __builtin_clzll of each q[i] in p[i].  */
+clzq (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_clzll (q[i]); /* No zero guard; GCC documents __builtin_clzll (0) as undefined.  */
+}
+
+void /* ffsq: store __builtin_ffsll of each q[i] in p[i].  */
+ffsq (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ffsll (q[i]); /* One plus the index of the lowest set bit, or 0 for a zero element.  */
+}
+
+void /* ctzq: store __builtin_ctzll of each q[i] in p[i].  */
+ctzq (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ctzll (q[i]); /* No zero guard; GCC documents __builtin_ctzll (0) as undefined.  */
+}
+
+void /* clzq0: leading-zero count of each q[i] with an explicit result for zero.  */
+clzq0 (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 64; /* Zero elements yield 64 and never reach the builtin.  */
+}
+
+void /* ctzq0: trailing-zero count of each q[i] with an explicit result for zero.  */
+ctzq0 (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 64; /* Zero elements yield 64 and never reach the builtin.  */
+}
new file mode 100644
@@ -0,0 +1,118 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-q1.c"
+
+void /* popcntq_scalar: reference popcountll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntq_scalar (unsigned long long *p, unsigned long long *q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_popcountll (q[i]); /* Set-bit count of q[i].  */
+}
+
+void /* clzq_scalar: reference clzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq_scalar (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_clzll (q[i]); /* No zero guard; GCC documents __builtin_clzll (0) as undefined.  */
+}
+
+void /* ffsq_scalar: reference ffsll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsq_scalar (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ffsll (q[i]); /* One plus the index of the lowest set bit, or 0 for zero.  */
+}
+
+void /* ctzq_scalar: reference ctzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq_scalar (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ctzll (q[i]); /* No zero guard; GCC documents __builtin_ctzll (0) as undefined.  */
+}
+
+void /* clzq0_scalar: reference zero-guarded clzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq0_scalar (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 64; /* Zero elements yield 64 without calling the builtin.  */
+}
+
+void /* ctzq0_scalar: reference zero-guarded ctzll loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq0_scalar (unsigned long long *p, unsigned long long* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned long long i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 64; /* Zero elements yield 64 without calling the builtin.  */
+}
+
+void /* test_256: run each unsigned long long loop and its _scalar counterpart on one input and abort on any difference.  */
+test_256 () /* NOTE(review): presumably invoked by avx512f-helper.h -- confirm.  */
+{
+ unsigned long long src[2048]; /* Shared input.  */
+ unsigned long long res[2048]; /* Output of the function under test.  */
+ unsigned long long exp[2048]; /* Output of the _scalar reference.  */
+ for (unsigned long long i = 0; i != 2048ULL; i++)
+ {
+ src[i] = i * i - 1ULL; /* i == 0 wraps to all-ones; i == 1 produces a zero element.  */
+ res[i] = 0;
+ exp[i] = 1; /* res and exp start out different from each other.  */
+ }
+
+ popcntq (&res[0], &src[0]); /* Not defined in this file; presumably from the included pr109011-q1.c.  */
+ popcntq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0) /* Byte count assumes an 8-byte unsigned long long.  */
+ __builtin_abort ();
+
+ clzq (&res[0], &src[0]); /* NOTE(review): src holds a zero, for which __builtin_clzll is documented as undefined -- confirm intent.  */
+ clzq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ffsq (&res[0], &src[0]);
+ ffsq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ctzq (&res[0], &src[0]); /* NOTE(review): same zero element reaches the unguarded __builtin_ctzll -- confirm intent.  */
+ ctzq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ clzq0 (&res[0], &src[0]);
+ clzq0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ctzq0 (&res[0], &src[0]);
+ ctzq0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+}
+
+void /* test_128: empty body; performs no checks.  */
+test_128 () /* NOTE(review): presumably a hook expected by avx512f-helper.h -- confirm.  */
+{}
new file mode 100644
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \t\]+" 4 } } */
+/* 2 vplzcntd come from function clzw, the other 2 come from function clzw0. */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 4 } } */
+
+void /* popcntw: store the number of set bits of each q[i] in p[i].  */
+popcntw (unsigned short *p, unsigned short *q) /* p: output; q: input (not restrict-qualified).  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* The int result is narrowed to unsigned short by the store.  */
+}
+
+void /* clzw: store __builtin_clz of each q[i] in p[i].  */
+clzw (unsigned short *p, unsigned short* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsw: store __builtin_ffs of each q[i] in p[i].  */
+ffsw (unsigned short *p, unsigned short* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for a zero element.  */
+}
+
+void /* ctzw: store __builtin_ctz of each q[i] in p[i].  */
+ctzw (unsigned short *p, unsigned short* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = __builtin_ctz (q[i]); /* No zero guard; GCC documents __builtin_ctz (0) as undefined.  */
+}
+
+void /* clzw0: leading-zero count of each q[i] with an explicit result for zero.  */
+clzw0 (unsigned short *p, unsigned short* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 16; /* Zero elements yield 16 and never reach the builtin.  */
+}
+
+void /* ctzw0: trailing-zero count of each q[i] with an explicit result for zero.  */
+ctzw0 (unsigned short *p, unsigned short* __restrict q) /* p: output; q: restrict-qualified input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* Fixed trip count of 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 16; /* Zero elements yield 16 and never reach the builtin.  */
+}
new file mode 100644
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-w1.c"
+
+void /* popcntw_scalar: reference popcount loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntw_scalar (unsigned short *p, unsigned short *q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_popcount (q[i]); /* Set-bit count of q[i], narrowed to unsigned short.  */
+}
+
+void /* clzw_scalar: reference __builtin_clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw_scalar (unsigned short *p, unsigned short* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_clz (q[i]); /* No zero guard; GCC documents __builtin_clz (0) as undefined.  */
+}
+
+void /* ffsw_scalar: reference __builtin_ffs loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsw_scalar (unsigned short *p, unsigned short* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = __builtin_ffs (q[i]); /* One plus the index of the lowest set bit, or 0 for zero.  */
+}
+
+void /* clzw0_scalar: reference zero-guarded clz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw0_scalar (unsigned short *p, unsigned short* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_clz (q[i]) : 16; /* Zero elements yield 16 without calling the builtin.  */
+}
+
+void /* ctzw0_scalar: reference zero-guarded ctz loop, built with the no-tree-vectorize optimize attribute.  */
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzw0_scalar (unsigned short *p, unsigned short* __restrict q) /* p: output; q: input.  */
+{
+ for (unsigned int i = 0; i < 2048; ++i) /* 2048 elements.  */
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 16; /* Zero elements yield 16 without calling the builtin.  */
+}
+
+void /* test_256: run each unsigned short loop and its _scalar counterpart on one input and abort on any difference.  */
+test_256 () /* NOTE(review): presumably invoked by avx512f-helper.h -- confirm.  */
+{
+ unsigned short src[2048]; /* Shared input.  */
+ unsigned short res[2048]; /* Output of the function under test.  */
+ unsigned short exp[2048]; /* Output of the _scalar reference.  */
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1; /* Truncated to unsigned short; i == 1 produces a zero element.  */
+ res[i] = 0;
+ exp[i] = 1; /* res and exp start out different from each other.  */
+ }
+
+ popcntw (&res[0], &src[0]); /* Not defined in this file; presumably from the included pr109011-w1.c.  */
+ popcntw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0) /* Byte count assumes a 2-byte unsigned short.  */
+ __builtin_abort ();
+
+ clzw (&res[0], &src[0]); /* NOTE(review): src holds a zero, for which __builtin_clz is documented as undefined -- confirm intent.  */
+ clzw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ ffsw (&res[0], &src[0]);
+ ffsw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ clzw0 (&res[0], &src[0]);
+ clzw0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ ctzw0 (&res[0], &src[0]);
+ ctzw0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+} /* The unguarded ctz variant is not exercised by this function.  */
+
+void /* test_128: empty body; performs no checks.  */
+test_128 () /* NOTE(review): presumably a hook expected by avx512f-helper.h -- confirm.  */
+{}