@@ -51,6 +51,424 @@ _mm_cvtsbh_ss (__bfloat16 __A)
return __tmp.a;
}
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_pbf16 (void)
+{
+ return (__m512bf16)(__v32bf) _mm512_setzero_ps ();
+}
+
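+/* The self-initialization of __Y below is the usual GCC idiom for
+   producing an indeterminate value without generating any code.  */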
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_pbf16 (void)
+{
+ __m512bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pbf16 (__bf16 __h)
+{
+ return (__m512bf16)(__v32bf) {__h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_pbf16 (__bf16 __h1, __bf16 __h2, __bf16 __h3, __bf16 __h4,
+ __bf16 __h5, __bf16 __h6, __bf16 __h7, __bf16 __h8,
+ __bf16 __h9, __bf16 __h10, __bf16 __h11, __bf16 __h12,
+ __bf16 __h13, __bf16 __h14, __bf16 __h15, __bf16 __h16,
+ __bf16 __h17, __bf16 __h18, __bf16 __h19, __bf16 __h20,
+ __bf16 __h21, __bf16 __h22, __bf16 __h23, __bf16 __h24,
+ __bf16 __h25, __bf16 __h26, __bf16 __h27, __bf16 __h28,
+ __bf16 __h29, __bf16 __h30, __bf16 __h31, __bf16 __h32)
+{
+ return
+ (__m512bf16)(__v32bf) {__h32, __h31, __h30, __h29, __h28, __h27, __h26,
+ __h25, __h24, __h23, __h22, __h21, __h20, __h19,
+ __h18, __h17, __h16, __h15, __h14, __h13, __h12,
+ __h11, __h10, __h9, __h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+#define _mm512_setr_pbf16(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, \
+ h13, h14, h15, h16, h17, h18, h19, h20, h21, h22, \
+ h23, h24, h25, h26, h27, h28, h29, h30, h31, h32) \
+ _mm512_set_pbf16 ((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), \
+ (h24), (h23), (h22), (h21), (h20), (h19), (h18), (h17), \
+ (h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), \
+ (h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
+
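+/* Bit-casts between bf16 vectors and float, double and integer vectors
+   of the same width.  These compile to no instructions.  */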
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_ps (__m128bf16 __a)
+{
+ return (__m128) __a;
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_ps (__m256bf16 __a)
+{
+ return (__m256) __a;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_ps (__m512bf16 __a)
+{
+ return (__m512) __a;
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_pd (__m128bf16 __a)
+{
+ return (__m128d) __a;
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_pd (__m256bf16 __a)
+{
+ return (__m256d) __a;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_pd (__m512bf16 __a)
+{
+ return (__m512d) __a;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpbf16_si128 (__m128bf16 __a)
+{
+ return (__m128i) __a;
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16_si256 (__m256bf16 __a)
+{
+ return (__m256i) __a;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16_si512 (__m512bf16 __a)
+{
+ return (__m512i) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pbf16 (__m128 __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_pbf16 (__m256 __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_pbf16 (__m512 __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_pbf16 (__m128d __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_pbf16 (__m256d __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_pbf16 (__m512d __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pbf16 (__m128i __a)
+{
+ return (__m128bf16) __a;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_pbf16 (__m256i __a)
+{
+ return (__m256bf16) __a;
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_pbf16 (__m512i __a)
+{
+ return (__m512bf16) __a;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16256_pbf16128 (__m256bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16512_pbf16128 (__m512bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16512_pbf16256 (__m512bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpbf16128_pbf16256 (__m128bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16128_pbf16512 (__m128bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpbf16256_pbf16512 (__m256bf16 __a)
+{
+ return __builtin_shufflevector (__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
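+/* Unlike the casts above, which leave the upper elements undefined when
+   widening, the zext variants zero the upper elements.  */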
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextpbf16128_pbf16256 (__m128bf16 __A)
+{
+ return (__m256bf16) _mm256_insertf128_ps (_mm256_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpbf16128_pbf16512 (__m128bf16 __A)
+{
+ return (__m512bf16) _mm512_insertf32x4 (_mm512_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpbf16256_pbf16512 (__m256bf16 __A)
+{
+ return (__m512bf16) _mm512_insertf64x4 (_mm512_setzero_pd (),
+ (__m256d) __A, 0);
+}
+
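+/* Absolute value only needs the sign bit cleared: AND each 16-bit
+   element with 0x7FFF.  */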
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_pbf16 (__m512bf16 __A)
+{
+ return
+ (__m512bf16) _mm512_and_epi32 (_mm512_set1_epi32 (0x7FFF7FFF),
+ (__m512i) __A);
+}
+
+/* Loads; vmovsh is used when AVX512FP16 is enabled.  */
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_pbf16 (void const *__p)
+{
+ return *(const __m512bf16 *) __p;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_pbf16 (void const *__p)
+{
+ return *(const __m256bf16 *) __p;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pbf16 (void const *__p)
+{
+ return *(const __m128bf16 *) __p;
+}
+
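+/* Unaligned loads and stores go through a packed, may_alias struct so
+   that the compiler emits an unaligned access.  */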
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m512bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m256bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pbf16 (void const *__p)
+{
+ struct __loadu_pbf16
+ {
+ __m128bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pbf16 *) __p)->__v;
+}
+
+/* Stores; vmovsh is used when AVX512FP16 is enabled.  */
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sbf16 (void *__dp, __m128bf16 __a)
+{
+ struct __mm_store_sbf16_struct
+ {
+ __bf16 __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_sbf16_struct *) __dp)->__u = __a[0];
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_pbf16 (void *__P, __m512bf16 __A)
+{
+ *(__m512bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_pbf16 (void *__P, __m256bf16 __A)
+{
+ *(__m256bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pbf16 (void *__P, __m128bf16 __A)
+{
+ *(__m128bf16 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_pbf16 (void *__P, __m512bf16 __A)
+{
+ struct __storeu_pbf16
+ {
+ __m512bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_pbf16 (void *__P, __m256bf16 __A)
+{
+ struct __storeu_pbf16
+ {
+ __m256bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pbf16 (void *__P, __m128bf16 __A)
+{
+ struct __storeu_pbf16
+ {
+ __m128bf16_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pbf16 *) __P)->__v = __A;
+}
+
+/* Moves; vmovsh is used when AVX512FP16 is enabled.  */
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sbf16 (__m128bf16 __a, __m128bf16 __b)
+{
+ __a[0] = __b[0];
+ return __a;
+}
+
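+/* Blend: element I of the result is taken from __W when bit I of __U
+   is set, and from __A otherwise.  */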
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pbf16 (__mmask32 __U, __m512bf16 __A, __m512bf16 __W)
+{
+ return (__m512bf16) __builtin_ia32_movdquhi512_mask ((__v32hi) __W,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_pbf16 (__m512bf16 __A, __m512i __I, __m512bf16 __B)
+{
+ return (__m512bf16) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+ (__v32hi) __I,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
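+/* Permute across the full register; e.g. an all-zero index vector in
+   __A broadcasts element 0 of __B.  */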
+extern __inline __m512bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pbf16 (__m512i __A, __m512bf16 __B)
+{
+ return (__m512bf16) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ (_mm512_setzero_si512 ()),
+ (__mmask32) -1);
+}
+
/* vcvtne2ps2bf16 */
extern __inline __m512bh
@@ -44,6 +44,183 @@ typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
typedef unsigned short __bfloat16;
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsbf16_bf16 (__m128bf16 __a)
+{
+ return __a[0];
+}
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsbf16_bf16 (__m256bf16 __a)
+{
+ return __a[0];
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_pbf16 (void)
+{
+ __m256bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pbf16 (void)
+{
+ __m128bf16 __Y = __Y;
+ return __Y;
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pbf16 (void)
+{
+ return (__m128bf16)(__v8bf) _mm_setzero_ps ();
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_pbf16 (void)
+{
+ return (__m256bf16)(__v16bf) _mm256_setzero_ps ();
+}
+
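+/* Set the lowest element of the result to __h and zero the rest.  */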
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sbf16 (__bf16 __h)
+{
+ return (__m128bf16)
+ __builtin_shufflevector ((__v8bf) {__h, __h, __h, __h,
+ __h, __h, __h, __h},
+ (__v8bf) _mm_setzero_pbf16 (), 0,
+ 8, 8, 8, 8, 8, 8, 8);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pbf16 (__bf16 __h)
+{
+ return (__m128bf16)(__v8bf) {__h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pbf16 (__bf16 __h)
+{
+ return (__m256bf16)(__v16bf) {__h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h};
+}
+
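+/* As with the 512-bit variants, the set intrinsics place their last
+   argument in element 0; the setr macros place their first argument
+   there.  */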
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pbf16 (__bf16 __h1, __bf16 __h2, __bf16 __h3, __bf16 __h4,
+ __bf16 __h5, __bf16 __h6, __bf16 __h7, __bf16 __h8)
+{
+ return (__m128bf16)(__v8bf) {__h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_pbf16 (__bf16 __h1, __bf16 __h2, __bf16 __h3, __bf16 __h4,
+ __bf16 __h5, __bf16 __h6, __bf16 __h7, __bf16 __h8,
+ __bf16 __h9, __bf16 __h10, __bf16 __h11, __bf16 __h12,
+ __bf16 __h13, __bf16 __h14, __bf16 __h15, __bf16 __h16)
+{
+ return (__m256bf16)(__v16bf) {__h16, __h15, __h14, __h13, __h12, __h11,
+ __h10, __h9, __h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+#define _mm_setr_pbf16(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8) \
+ _mm_set_pbf16 ((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1))
+
+#define _mm256_setr_pbf16(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10, \
+ bf11, bf12, bf13, bf14, bf15, bf16) \
+ _mm256_set_pbf16 ((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10), \
+ (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), \
+ (bf1))
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_pbf16 (__m256bf16 __A)
+{
+ return (__m256bf16) _mm256_and_si256 (_mm256_set1_epi32 (0x7FFF7FFF),
+ (__m256i) __A);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pbf16 (__m128bf16 __A)
+{
+ return (__m128bf16) _mm_and_si128 (_mm_set1_epi32 (0x7FFF7FFF),
+ (__m128i) __A);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pbf16 (__mmask8 __U, __m128bf16 __A, __m128bf16 __W)
+{
+ return (__m128bf16)
+ __builtin_ia32_movdquhi128_mask ((__v8hi) __W,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pbf16 (__mmask16 __U, __m256bf16 __A, __m256bf16 __W)
+{
+ return (__m256bf16)
+ __builtin_ia32_movdquhi256_mask ((__v16hi) __W,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_pbf16 (__m128bf16 __A, __m128i __I, __m128bf16 __B)
+{
+ return (__m128bf16)
+ __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+ (__v8hi) __I,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_pbf16 (__m256bf16 __A, __m256i __I, __m256bf16 __B)
+{
+ return (__m256bf16) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+ (__v16hi) __I,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_pbf16 (__m128i __A, __m128bf16 __B)
+{
+ return (__m128bf16) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ (_mm_setzero_si128 ()),
+ (__mmask8) -1);
+}
+
+extern __inline __m256bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_pbf16 (__m256i __A, __m256bf16 __B)
+{
+ return (__m256bf16) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ (_mm256_setzero_si256 ()),
+ (__mmask16) -1);
+}
/* vcvtne2ps2bf16 */
extern __inline __m256bh
@@ -53,6 +53,18 @@ typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \
typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \
__may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the bf16 intrinsics.  */
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16), __aligned__ (16)));
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32), __aligned__ (32)));
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64), __aligned__ (64)));
+typedef __bf16 __m128bf16 __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (16)));
+typedef __bf16 __m256bf16 __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (32)));
+typedef __bf16 __m512bf16 __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (64)));
+typedef __bf16 __m128bf16_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef __bf16 __m256bf16_u __attribute__ ((__vector_size__ (32), __may_alias__, __aligned__ (1)));
+typedef __bf16 __m512bf16_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
@@ -2771,6 +2783,44 @@ _mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C)
__builtin_ia32_storesh_mask (__A, __C, __B);
}
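+/* The scalar bf16 load/store intrinsics reuse the AVX512FP16 vmovsh
+   builtins; they move raw 16-bit values, so no format conversion
+   takes place.  */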
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sbf16 (void const *__dp)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __dp,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sbf16 (__m128bf16 __A, __mmask8 __B, const void *__C)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __C,
+ (__v8hf) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sbf16 (__mmask8 __A, const void *__B)
+{
+ return (__m128bf16)
+ __builtin_ia32_loadsh_mask ((_Float16 const*) __B,
+ _mm_setzero_ph (),
+ (__mmask8) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sbf16 (void *__A, __mmask8 __B, __m128bf16 __C)
+{
+ __builtin_ia32_storesh_mask ((_Float16 *) __A,
+ (__v8hf) __C, (__mmask8) __B);
+}
+
extern __inline __m128h
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sh (__m128h __A, __m128h __B)
@@ -2793,6 +2843,26 @@ _mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C)
return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A);
}
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sbf16 (__m128bf16 __A, __mmask8 __B,
+ __m128bf16 __C, __m128bf16 __D)
+{
+ return (__m128bf16)
+ __builtin_ia32_vmovsh_mask ((__v8hf) __C, (__v8hf) __D,
+ (__v8hf) __A, (__mmask8) __B);
+}
+
+extern __inline __m128bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sbf16 (__mmask8 __A, __m128bf16 __B, __m128bf16 __C)
+{
+ return (__m128bf16)
+ __builtin_ia32_vmovsh_mask ((__v8hf) __B, (__v8hf) __C,
+ _mm_setzero_ph (),
+ (__mmask8) __A);
+}
+
/* Intrinsics vcvtph2dq. */
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -118,9 +118,11 @@
#include <vpclmulqdqintrin.h>
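+/* The bf16 intrinsics use the __bf16 type, which GCC only provides on
+   x86 when SSE2 is enabled, hence the guard.  */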
+#ifdef __SSE2__
#include <avx512bf16vlintrin.h>
#include <avx512bf16intrin.h>
+#endif
#include <amxtileintrin.h>