@@ -45,6 +45,44 @@ typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
typedef __bf16 __bfloat16;
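+/* Local copies of the base SSE/AVX cast, shift and convert intrinsics,
+   prefixed with _avx512_, used internally by the intrinsics below in
+   place of the originals.  */
+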
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_castsi128_ps (__m128i __A)
+{
+ return (__m128) __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_castsi256_ps (__m256i __A)
+{
+ return (__m256) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_slli_epi32 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_slli_epi32 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_cvtepi16_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_cvtepi16_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
#define _mm256_cvtneps_pbh(A) \
(__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
#define _mm_cvtneps_pbh(A) \
@@ -182,23 +220,23 @@ extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpbh_ps (__m128bh __A)
{
- return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
- (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
+ return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_avx512_slli_epi32 (
+ (__m128i)_mm_avx512_cvtepi16_epi32 ((__m128i)__A), 16));
}
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpbh_ps (__m128bh __A)
{
- return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
- (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
+ return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_avx512_slli_epi32 (
+ (__m256i)_mm256_avx512_cvtepi16_epi32 ((__m128i)__A), 16));
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
{
- return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+ return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_avx512_slli_epi32 (
(__m128i)_mm_maskz_cvtepi16_epi32 (
(__mmask8)__U, (__m128i)__A), 16));
}
@@ -207,7 +245,7 @@ extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
{
- return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
+ return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_avx512_slli_epi32 (
(__m256i)_mm256_maskz_cvtepi16_epi32 (
(__mmask8)__U, (__m128i)__A), 16));
}
@@ -216,8 +254,8 @@ extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
{
- return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
- (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
+ return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
+ (__m128i)__S, (__mmask8)__U, (__m128i)_mm_avx512_cvtepi16_epi32 (
(__m128i)__A), 16));
}
@@ -225,8 +263,8 @@ extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
{
- return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
- (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
+ return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
+ (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_avx512_cvtepi16_epi32 (
(__m128i)__A), 16));
}
@@ -34,6 +34,32 @@
#define __DISABLE_AVX512BW__
#endif /* __AVX512BW__ */
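+/* Local copies of the base _mm_set_* intrinsics, prefixed with _avx512_,
+   used internally by the intrinsics below in place of the originals.  */
+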
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set_epi32 (int __q3, int __q2, int __q1, int __q0)
+{
+ return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+ short __q3, short __q2, short __q1, short __q0)
+{
+ return __extension__ (__m128i)(__v8hi){
+ __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return __extension__ (__m128i)(__v16qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+ };
+}
+
extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
@@ -1449,7 +1449,7 @@ extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi16_si128 (short __A)
{
- return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
+ return _mm_avx512_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
}
extern __inline short
@@ -34,6 +34,32 @@
#define __DISABLE_AVX512FP16VL__
#endif /* __AVX512FP16VL__ */
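+/* Local copies of the base set1/and intrinsics, prefixed with _avx512_,
+   used internally by the intrinsics below in place of the originals.  */
+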
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set1_ps (float __F)
+{
+ return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set1_ps (float __A)
+{
+ return __extension__ (__m256){ __A, __A, __A, __A,
+ __A, __A, __A, __A };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_and_si128 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A & (__v2du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_and_si256 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A & (__v4du)__B);
+}
+
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_castph_ps (__m128h __a)
@@ -147,15 +173,15 @@ extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zextph128_ph256 (__m128h __A)
{
- return (__m256h) _mm256_insertf128_ps (_mm256_avx512_setzero_ps (),
- (__m128) __A, 0);
+ return (__m256h) _mm256_avx512_insertf128_ps (_mm256_avx512_setzero_ps (),
+ (__m128) __A, 0);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_conj_pch (__m256h __A)
{
- return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31));
+ return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_avx512_set1_epi32 (1<<31));
}
extern __inline __m256h
@@ -183,7 +209,7 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_conj_pch (__m128h __A)
{
- return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31));
+ return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_avx512_set1_epi32 (1<<31));
}
extern __inline __m128h
@@ -482,16 +508,16 @@ extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_ph (__m128h __A)
{
- return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF),
- (__m128i) __A);
+ return (__m128h) _mm_avx512_and_si128 (_mm_avx512_set1_epi32 (0x7FFF7FFF),
+ (__m128i) __A);
}
extern __inline __m256h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_ph (__m256h __A)
{
- return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF),
- (__m256i) __A);
+ return (__m256h) _mm256_avx512_and_si256 (_mm256_avx512_set1_epi32 (0x7FFF7FFF),
+ (__m256i) __A);
}
/* vcmpph */
@@ -3145,8 +3171,8 @@ _mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
}
#define _MM256_REDUCE_OP(op) \
- __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
- __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T1 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 1); \
__m128h __T3 = (__T1 op __T2); \
__m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
(__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
@@ -3172,8 +3198,8 @@ _mm256_reduce_mul_ph (__m256h __A)
#undef _MM256_REDUCE_OP
#define _MM256_REDUCE_OP(op) \
- __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
- __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T1 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 1); \
__m128h __T3 = _mm_##op (__T1, __T2); \
__m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
(__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
@@ -3321,7 +3347,7 @@ _mm256_set1_pch (_Float16 _Complex __A)
float __b;
} __u = { .__a = __A };
- return (__m256h) _mm256_set1_ps (__u.__b);
+ return (__m256h) _mm256_avx512_set1_ps (__u.__b);
}
extern __inline __m128h
@@ -3334,7 +3360,7 @@ _mm_set1_pch (_Float16 _Complex __A)
float __b;
} __u = { .__a = __A };
- return (__m128h) _mm_set1_ps (__u.__b);
+ return (__m128h) _mm_avx512_set1_ps (__u.__b);
}
// intrinsics below are alias for f*mul_*ch
@@ -44,6 +44,126 @@ typedef char __v32qi_u __attribute__ ((__vector_size__ (32), \
typedef char __v16qi_u __attribute__ ((__vector_size__ (16), \
__may_alias__, __aligned__ (1)));
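+/* Local copies of the base set/set1/min/max and insert/extract intrinsics,
+   prefixed with _avx512_, used internally by the reduce intrinsics below
+   in place of the originals.  */
+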
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set1_epi32 (int __A)
+{
+ return _mm_avx512_set_epi32 (__A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set1_epi16 (short __A)
+{
+ return _mm_avx512_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set1_epi8 (char __A)
+{
+ return _mm_avx512_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set_epi16 (short __q15, short __q14, short __q13, short __q12,
+ short __q11, short __q10, short __q09, short __q08,
+ short __q07, short __q06, short __q05, short __q04,
+ short __q03, short __q02, short __q01, short __q00)
+{
+ return __extension__ (__m256i)(__v16hi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+ };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set_epi8 (char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return __extension__ (__m256i)(__v32qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+ __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+ __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+ };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set1_epi16 (short __A)
+{
+ return _mm256_avx512_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set1_epi32 (int __A)
+{
+ return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
+ __A, __A, __A, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set1_epi8 (char __A)
+{
+ return _mm256_avx512_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_max_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_min_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_max_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_min_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_max_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_min_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_max_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_min_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
@@ -53,6 +173,136 @@ _mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
(__mmask32) __U);
}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_max_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_min_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_max_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_min_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
+{
+ return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
+ (__v4sf)__Y,
+ __O);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_extractf128_pd (__m256d __X, const int __N)
+{
+ return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_extracti128_si256 (__m256i __X, const int __M)
+{
+ return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_avx512_insertf128_ps(X, Y, O) \
+ ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(O)))
+
+#define _mm256_avx512_extractf128_pd(X, N) \
+ ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
+ (int)(N)))
+
+#define _mm256_avx512_extracti128_si256(X, M) \
+ ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+#define _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI16(op) \
+ __v8hi __T1 = (__v8hi)_mm256_avx512_extracti128_si256 (__W, 0); \
+ __v8hi __T2 = (__v8hi)_mm256_avx512_extracti128_si256 (__W, 1); \
+ __v8hi __T3 = __T1 op __T2; \
+ __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __v8hi __T5 = __T3 op __T4; \
+ __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T7 = __T5 op __T6; \
+ __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T9 = __T7 op __T8; \
+ return __T9[0]
+
+#define _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP16(op) \
+ __m128i __T1 = _mm256_avx512_extracti128_si256 (__V, 0); \
+ __m128i __T2 = _mm256_avx512_extracti128_si256 (__V, 1); \
+ __m128i __T3 = _mm_avx512_##op (__T1, __T2); \
+ __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, \
+ (__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); \
+ __m128i __T5 = _mm_avx512_##op (__T3, __T4); \
+ __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, \
+ (__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); \
+ __m128i __T7 = _mm_avx512_##op (__T5, __T6); \
+ __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, \
+ (__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); \
+ __v8hi __T9 = (__v8hi)_mm_avx512_##op (__T7, __T8); \
+ return __T9[0]
+
+#define _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI8(op) \
+ __v16qi __T1 = (__v16qi)_mm256_avx512_extracti128_si256 (__W, 0); \
+ __v16qi __T2 = (__v16qi)_mm256_avx512_extracti128_si256 (__W, 1); \
+ __v16qi __T3 = __T1 op __T2; \
+ __v16qi __T4 = __builtin_shufflevector (__T3, __T3, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T5 = __T3 op __T4; \
+ __v16qi __T6 = __builtin_shufflevector (__T5, __T5, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T7 = __T5 op __T6; \
+ __v16qi __T8 = __builtin_shufflevector (__T7, __T7, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T9 = __T7 op __T8; \
+ __v16qi __T10 = __builtin_shufflevector (__T9, __T9, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T11 = __T9 op __T10; \
+ return __T11[0]
+
+#define _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP8(op) \
+ __m128i __T1 = _mm256_avx512_extracti128_si256 (__V, 0); \
+ __m128i __T2 = _mm256_avx512_extracti128_si256 (__V, 1); \
+ __m128i __T3 = _mm_avx512_##op (__T1, __T2); \
+ __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, \
+ (__v16qi)__T3, \
+ 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T5 = _mm_avx512_##op (__T3, __T4); \
+ __m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, \
+ (__v16qi)__T5, \
+ 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T7 = _mm_avx512_##op (__T5, __T6); \
+ __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, \
+ (__v16qi)__T7, \
+ 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __m128i __T9 = _mm_avx512_##op (__T7, __T8); \
+ __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, \
+ (__v16qi)__T9, \
+ 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
+ __v16qi __T11 = (__v16qi)_mm_avx512_##op (__T9, __T10); \
+ return __T11[0]
+
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
@@ -4746,7 +4996,7 @@ extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_mul_epi16 (__mmask8 __M, __m128i __W)
{
- __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (1), __M, __W);
+ __W = _mm_mask_mov_epi16 (_mm_avx512_set1_epi16 (1), __M, __W);
_MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
}
@@ -4754,7 +5004,7 @@ extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_and_epi16 (__mmask8 __M, __m128i __W)
{
- __W = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __W);
+ __W = _mm_mask_mov_epi16 (_mm_avx512_set1_epi16 (-1), __M, __W);
_MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
}
@@ -4770,8 +5020,8 @@ extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_max_epi16 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-32767-1), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+ __V = _mm_mask_mov_epi16 (_mm_avx512_set1_epi16 (-32767-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (avx512_max_epi16);
}
extern __inline unsigned short
@@ -4779,23 +5029,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_max_epu16 (__mmask16 __M, __m128i __V)
{
__V = _mm_maskz_mov_epi16 (__M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (avx512_max_epu16);
}
extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_min_epi16 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (32767), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+ __V = _mm_mask_mov_epi16 (_mm_avx512_set1_epi16 (32767), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (avx512_min_epi16);
}
extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_min_epu16 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi16 (_mm_set1_epi16 (-1), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+ __V = _mm_mask_mov_epi16 (_mm_avx512_set1_epi16 (-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP16 (avx512_min_epu16);
}
extern __inline short
@@ -4803,23 +5053,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_add_epi16 (__mmask16 __M, __m256i __W)
{
__W = _mm256_maskz_mov_epi16 (__M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI16 (+);
}
extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_mul_epi16 (__mmask16 __M, __m256i __W)
{
- __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (1), __M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
+ __W = _mm256_mask_mov_epi16 (_mm256_avx512_set1_epi16 (1), __M, __W);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI16 (*);
}
extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_and_epi16 (__mmask16 __M, __m256i __W)
{
- __W = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
+ __W = _mm256_mask_mov_epi16 (_mm256_avx512_set1_epi16 (-1), __M, __W);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI16 (&);
}
extern __inline short
@@ -4827,15 +5077,15 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_or_epi16 (__mmask16 __M, __m256i __W)
{
__W = _mm256_maskz_mov_epi16 (__M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI16 (|);
}
extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_max_epi16 (__mmask16 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-32767-1), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
+ __V = _mm256_mask_mov_epi16 (_mm256_avx512_set1_epi16 (-32767-1), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
}
extern __inline unsigned short
@@ -4843,23 +5093,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_max_epu16 (__mmask16 __M, __m256i __V)
{
__V = _mm256_maskz_mov_epi16 (__M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
}
extern __inline short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_min_epi16 (__mmask16 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (32767), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
+ __V = _mm256_mask_mov_epi16 (_mm256_avx512_set1_epi16 (32767), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
}
extern __inline unsigned short
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_min_epu16 (__mmask16 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi16 (_mm256_set1_epi16 (-1), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
+ __V = _mm256_mask_mov_epi16 (_mm256_avx512_set1_epi16 (-1), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
}
extern __inline char
@@ -4874,7 +5124,7 @@ extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_mul_epi8 (__mmask16 __M, __m128i __W)
{
- __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (1), __M, __W);
+ __W = _mm_mask_mov_epi8 (_mm_avx512_set1_epi8 (1), __M, __W);
_MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
}
@@ -4882,7 +5132,7 @@ extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_and_epi8 (__mmask16 __M, __m128i __W)
{
- __W = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __W);
+ __W = _mm_mask_mov_epi8 (_mm_avx512_set1_epi8 (-1), __M, __W);
_MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
}
@@ -4898,8 +5148,8 @@ extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_max_epi8 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-127-1), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+ __V = _mm_mask_mov_epi8 (_mm_avx512_set1_epi8 (-127-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (avx512_max_epi8);
}
extern __inline unsigned char
@@ -4907,23 +5157,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_max_epu8 (__mmask16 __M, __m128i __V)
{
__V = _mm_maskz_mov_epi8 (__M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (avx512_max_epu8);
}
extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_min_epi8 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (127), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+ __V = _mm_mask_mov_epi8 (_mm_avx512_set1_epi8 (127), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (avx512_min_epi8);
}
extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_reduce_min_epu8 (__mmask16 __M, __m128i __V)
{
- __V = _mm_mask_mov_epi8 (_mm_set1_epi8 (-1), __M, __V);
- _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+ __V = _mm_mask_mov_epi8 (_mm_avx512_set1_epi8 (-1), __M, __V);
+ _MM_REDUCE_OPERATOR_MAX_MIN_EP8 (avx512_min_epu8);
}
extern __inline char
@@ -4931,23 +5181,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_add_epi8 (__mmask32 __M, __m256i __W)
{
__W = _mm256_maskz_mov_epi8 (__M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI8 (+);
}
extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_mul_epi8 (__mmask32 __M, __m256i __W)
{
- __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (1), __M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
+ __W = _mm256_mask_mov_epi8 (_mm256_avx512_set1_epi8 (1), __M, __W);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI8 (*);
}
extern __inline char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_and_epi8 (__mmask32 __M, __m256i __W)
{
- __W = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
+ __W = _mm256_mask_mov_epi8 (_mm256_avx512_set1_epi8 (-1), __M, __W);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI8 (&);
}
extern __inline char
@@ -4955,15 +5205,15 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_or_epi8 (__mmask32 __M, __m256i __W)
{
__W = _mm256_maskz_mov_epi8 (__M, __W);
- _MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
+ _MM256_AVX512_REDUCE_OPERATOR_BASIC_EPI8 (|);
}
extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_max_epi8 (__mmask32 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-127-1), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
+ __V = _mm256_mask_mov_epi8 (_mm256_avx512_set1_epi8 (-127-1), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
}
extern __inline unsigned char
@@ -4971,23 +5221,23 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_max_epu8 (__mmask32 __M, __m256i __V)
{
__V = _mm256_maskz_mov_epi8 (__M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
}
extern __inline signed char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_min_epi8 (__mmask32 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (127), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
+ __V = _mm256_mask_mov_epi8 (_mm256_avx512_set1_epi8 (127), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
}
extern __inline unsigned char
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_reduce_min_epu8 (__mmask32 __M, __m256i __V)
{
- __V = _mm256_mask_mov_epi8 (_mm256_set1_epi8 (-1), __M, __V);
- _MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
+ __V = _mm256_mask_mov_epi8 (_mm256_avx512_set1_epi8 (-1), __M, __V);
+ _MM256_AVX512_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
}
#ifdef __DISABLE_AVX512VLBW__