[v2,x86] Fix incorrect _mm_cvtsbh_ss.
Checks
Commit Message
After supporting real __bf16, the implementation of _mm_cvtsbh_ss went
wrong.
The patch add a builtin to generate pslld for the intrinsic, also
extendbfsf2 is supported with pslld when !flag_signaling_nans &&
!HONOR_NANS (BFmode).
truncsfbf2 is supported with vcvtneps2bf16 when !flag_signaling_nans &&
!HONOR_NANS (BFmode) && flag_unsafe_math_optimizations.
Here's updated patch.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?
gcc/ChangeLog:
PR target/107748
* config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Refined.
* config/i386/i386-builtin-types.def (FLOAT_FTYPE_BFLOAT16):
New function type.
* config/i386/i386-builtin.def (BDESC): New builtin.
* config/i386/i386-expand.cc (ix86_expand_args_builtin):
Handle the builtin.
* config/i386/i386.md (extendbfsf2): New expander.
(extendbfsf2_1): New define_insn.
(truncsfbf2): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Scan pslld.
* gcc.target/i386/extendbfsf.c: New test.
---
gcc/config/i386/avx512bf16intrin.h | 4 +-
gcc/config/i386/i386-builtin-types.def | 1 +
gcc/config/i386/i386-builtin.def | 2 +
gcc/config/i386/i386-expand.cc | 1 +
gcc/config/i386/i386.md | 41 ++++++++++++++++++-
.../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c | 3 +-
gcc/testsuite/gcc.target/i386/extendbfsf.c | 16 ++++++++
7 files changed, 62 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/extendbfsf.c
Comments
On Thu, Nov 24, 2022 at 09:22:00AM +0800, liuhongt via Gcc-patches wrote:
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -130,6 +130,7 @@ (define_c_enum "unspec" [
> ;; For AVX/AVX512F support
> UNSPEC_SCALEF
> UNSPEC_PCMP
> + UNSPEC_CVTBFSF
>
> ;; Generic math support
> UNSPEC_IEEE_MIN ; not commutative
> @@ -4961,6 +4962,31 @@ (define_insn "*extendhf<mode>2"
> (set_attr "prefix" "evex")
> (set_attr "mode" "<MODE>")])
>
> +(define_expand "extendbfsf2"
> + [(set (match_operand:SF 0 "register_operand")
> + (unspec:SF
> + [(match_operand:BF 1 "register_operand")]
> + UNSPEC_CVTBFSF))]
> + "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_signaling_nans")
I think if !HONOR_NANS (BFmode), then flag_signaling_nans doesn't matter,
the former says that no NaNs may appear in a valid program,
so just testing !HONOR_NANS (BFmode) should be enough.
What I'm not sure about, my memory is weak, is whether one can
safely use the fast math related tests in define_expand conditions.
I vaguely remember init_all_optabs remembers the conditions, for
changes say in the ISA options optabs are reinited, but not sure if
that happens for optimization option changes like the fast math related
options are. So it would be perhaps safer to use just TARGET_SSE2
as the expand condition and in the C code body do
if (HONOR_NANS (BFmode) FAIL;
(similarly for truncsfbf2).
On the other side brief look at x86 insn-flags.h shows several fast math
related checks in HAVE_* macros.
PR92791 I found related to this was actually about
optimize_function_for_{size,speed}_p (cfun)
so maybe fast math related stuff is fine, just not the optimization for
speed or size.
Jakub
On Thu, Nov 24, 2022 at 4:53 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Thu, Nov 24, 2022 at 09:22:00AM +0800, liuhongt via Gcc-patches wrote:
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -130,6 +130,7 @@ (define_c_enum "unspec" [
> > ;; For AVX/AVX512F support
> > UNSPEC_SCALEF
> > UNSPEC_PCMP
> > + UNSPEC_CVTBFSF
> >
> > ;; Generic math support
> > UNSPEC_IEEE_MIN ; not commutative
> > @@ -4961,6 +4962,31 @@ (define_insn "*extendhf<mode>2"
> > (set_attr "prefix" "evex")
> > (set_attr "mode" "<MODE>")])
> >
> > +(define_expand "extendbfsf2"
> > + [(set (match_operand:SF 0 "register_operand")
> > + (unspec:SF
> > + [(match_operand:BF 1 "register_operand")]
> > + UNSPEC_CVTBFSF))]
> > + "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_signaling_nans")
>
> I think if !HONOR_NANS (BFmode), then flag_signaling_nans doesn't matter,
> the former says that no NaNs may appear in a valid program,
> so just testing !HONOR_NANS (BFmode) should be enough.
I'll remove flag_signaling_nans.
>
> What I'm not sure about, my memory is weak, is whether one can
> safely use the fast math related tests in define_expand conditions.
> I vaguely remember init_all_optabs remembers the conditions, for
> changes say in the ISA options optabs are reinited, but not sure if
> that happens for optimization option changes like the fast math related
> options are. So it would be perhaps safer to use just TARGET_SSE2
> as the expand condition and in the C code body do
> if (HONOR_NANS (BFmode) FAIL;
> (similarly for truncsfbf2).
> On the other side brief look at x86 insn-flags.h shows several fast math
> related checks in HAVE_* macros.
> PR92791 I found related to this was actually about
Oh, good to know that, thanks.
> optimize_function_for_{size,speed}_p (cfun)
> so maybe fast math related stuff is fine, just not the optimization for
> speed or size.
I saw many backends(riscv,rs6000,mips,loongarch) already used HONOR_*
stuff in the expander conditions.
>
> Jakub
>
@@ -46,9 +46,7 @@ extern __inline float
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsbh_ss (__bf16 __A)
{
- union{ float a; unsigned int b;} __tmp;
- __tmp.b = ((unsigned int)(__A)) << 16;
- return __tmp.a;
+ return __builtin_ia32_cvtbf2sf (__A);
}
/* vcvtne2ps2bf16 */
@@ -1281,6 +1281,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
# BF16 builtins
+DEF_FUNCTION_TYPE (FLOAT, BFLOAT16)
DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
@@ -2838,6 +2838,8 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__
BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, "__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF)
BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, "__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, "__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, "__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) FLOAT_FTYPE_BFLOAT16)
+
/* AVX512FP16. */
BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
@@ -10423,6 +10423,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
return ix86_expand_sse_ptest (d, exp, target);
case FLOAT128_FTYPE_FLOAT128:
case FLOAT_FTYPE_FLOAT:
+ case FLOAT_FTYPE_BFLOAT16:
case INT_FTYPE_INT:
case UINT_FTYPE_UINT:
case UINT16_FTYPE_UINT16:
@@ -130,6 +130,7 @@ (define_c_enum "unspec" [
;; For AVX/AVX512F support
UNSPEC_SCALEF
UNSPEC_PCMP
+ UNSPEC_CVTBFSF
;; Generic math support
UNSPEC_IEEE_MIN ; not commutative
@@ -4961,6 +4962,31 @@ (define_insn "*extendhf<mode>2"
(set_attr "prefix" "evex")
(set_attr "mode" "<MODE>")])
+(define_expand "extendbfsf2"
+ [(set (match_operand:SF 0 "register_operand")
+ (unspec:SF
+ [(match_operand:BF 1 "register_operand")]
+ UNSPEC_CVTBFSF))]
+ "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_signaling_nans")
+
+;; Don't use float_extend since psrlld doesn't raise
+;; exceptions and turn a sNaN into a qNaN.
+(define_insn "extendbfsf2_1"
+ [(set (match_operand:SF 0 "register_operand" "=x,Yw")
+ (unspec:SF
+ [(match_operand:BF 1 "register_operand" " 0,Yw")]
+ UNSPEC_CVTBFSF))]
+ "TARGET_SSE2"
+ "@
+ pslld\t{$16, %0|%0, 16}
+ vpslld\t{$16, %1, %0|%0, %1, 16}"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "sseishft")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix_data16" "1,*")
+ (set_attr "prefix" "orig,vex")
+ (set_attr "mode" "TI")
+ (set_attr "memory" "none")])
(define_expand "extend<mode>xf2"
[(set (match_operand:XF 0 "nonimmediate_operand")
@@ -5177,7 +5203,20 @@ (define_insn "*trunc<mode>hf2"
[(set_attr "type" "ssecvt")
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
-
+
+(define_insn "truncsfbf2"
+ [(set (match_operand:BF 0 "register_operand" "=x, v")
+ (float_truncate:BF
+ (match_operand:SF 1 "register_operand" "x,v")))]
+ "((TARGET_AVX512BF16 && TARGET_AVX512VL) || TARGET_AVXNECONVERT)
+ && !HONOR_NANS (BFmode) && flag_unsafe_math_optimizations
+ && !flag_signaling_nans"
+ "@
+ %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
+ vcvtneps2bf16\t{%1, %0|%0, %1}"
+ [(set_attr "isa" "avxneconvert,avx512bf16vl")
+ (set_attr "prefix" "vex,evex")])
+
;; Signed conversion to DImode.
(define_expand "fix_truncxfdi2"
@@ -1,8 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-mavx512bf16 -O2" } */
/* { dg-additional-options "-fno-PIE -mfpmath=sse" { target ia32 } } */
-/* { dg-final { scan-assembler-times "sall\[ \\t\]+\[^\{\n\]*16" 1 } } */
-/* { dg-final { scan-assembler-times "movl" 1 } } */
+/* { dg-final { scan-assembler-times "pslld" 1 } } */
#include <immintrin.h>
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -mavx512vl -O2 -ffast-math" } */
+/* { dg-final { scan-assembler-times "pslld" 1 } } */
+/* { dg-final { scan-assembler-times "vcvtneps2bf16" 1 } } */
+
+float
+extendsfbf (__bf16 a)
+{
+ return a;
+}
+
+__bf16
+truncsfbf (float a)
+{
+ return a;
+}