i386: Clear upper bits of XMM register for V4HFmode/V2HFmode operations [PR110762]
Checks
Commit Message
Similar to r14-2786-gade30fad6669e5, this patch is for V4HF/V2HFmode.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/110762
* config/i386/mmx.md (<insn><mode>3): Change from define_insn
to define_expand and break into ..
(<insn>v4hf3): .. this.
(divv4hf3): .. this.
(<insn>v2hf3): .. this.
(divv2hf3): .. this.
(movd_v2hf_to_sse): New define_expand.
(movq_<mode>_to_sse): Extend to V4HFmode.
(mmxdoublevecmode): Ditto.
(V2FI_V4HF): New mode iterator.
* config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF
by using mode iterator V4SF_V8HF, renamed to ..
(*vec_concat<mode>): .. this.
(*vec_concatv4sf_0): Extend to handle V8HF by using mode
iterator V4SF_V8HF, renamed to ..
(*vec_concat<mode>_0): .. this.
(*vec_concatv8hf_movss): New define_insn.
(V4SF_V8HF): New mode iterator.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110762-v4hf.c: New test.
---
gcc/config/i386/mmx.md | 109 +++++++++++++++---
gcc/config/i386/sse.md | 40 +++++--
gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++
3 files changed, 177 insertions(+), 29 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
Comments
On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Similar like r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/110762
> * config/i386/mmx.md (<insn><mode>3): Changed from define_insn
> to define_expand and break into ..
> (<insn>v4hf3): .. this.
> (divv4hf3): .. this.
> (<insn>v2hf3): .. this.
> (divv2hf3): .. this.
> (movd_v2hf_to_sse): New define_expand.
> (movq_<mode>_to_sse): Extend to V4HFmode.
> (mmxdoublevecmode): Ditto.
> (V2FI_V4HF): New mode iterator.
> * config/i386/sse.md (*vec_concatv4sf): Extend to hanlde V8HF
> by using mode iterator V4SF_V8HF, renamed to ..
> (*vec_concat<mode>): .. this.
> (*vec_concatv4sf_0): Extend to handle V8HF by using mode
> iterator V4SF_V8HF, renamed to ..
> (*vec_concat<mode>_0): .. this.
> (*vec_concatv8hf_movss): New define_insn.
> (V4SF_V8HF): New mode iterator.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr110762-v4hf.c: New test.
LGTM.
Please also note the RFC patch [1] that relaxes clears for V2SFmode
with -fno-trapping-math. The patched compiler will then emit the same
code as clang does for -O2. Which raises another question - should gcc
default to -fno-trapping-math?
[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html
Thanks,
Uros.
> ---
> gcc/config/i386/mmx.md | 109 +++++++++++++++---
> gcc/config/i386/sse.md | 40 +++++--
> gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++
> 3 files changed, 177 insertions(+), 29 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 896af76a33f..88bdf084f54 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
> ;; V2S* modes
> (define_mode_iterator V2FI [V2SF V2SI])
>
> -;; 4-byte and 8-byte float16 vector modes
> -(define_mode_iterator VHF_32_64 [V4HF V2HF])
> -
> +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
> ;; Mapping from integer vector mode to mnemonic suffix
> (define_mode_attr mmxvecsize
> [(V8QI "b") (V4QI "b") (V2QI "b")
> @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
>
> ;; Mapping of vector modes to a vector mode of double size
> (define_mode_attr mmxdoublevecmode
> - [(V2SF "V4SF") (V2SI "V4SI")])
> + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
>
> ;; Mapping of vector modes back to the scalar modes
> (define_mode_attr mmxscalarmode
> @@ -594,7 +592,7 @@ (define_insn "sse_movntq"
> (define_expand "movq_<mode>_to_sse"
> [(set (match_operand:<mmxdoublevecmode> 0 "register_operand")
> (vec_concat:<mmxdoublevecmode>
> - (match_operand:V2FI 1 "nonimmediate_operand")
> + (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
> (match_dup 2)))]
> "TARGET_SSE2"
> "operands[2] = CONST0_RTX (<MODE>mode);")
> @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
> ;;
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
> -(define_insn "<insn><mode>3"
> - [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> - (plusminusmultdiv:VHF_32_64
> - (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> - (match_operand:VHF_32_64 2 "register_operand" "v")))]
> +(define_expand "<insn>v4hf3"
> + [(set (match_operand:V4HF 0 "register_operand")
> + (plusminusmult:V4HF
> + (match_operand:V4HF 1 "nonimmediate_operand")
> + (match_operand:V4HF 2 "nonimmediate_operand")))]
> "TARGET_AVX512FP16 && TARGET_AVX512VL"
> - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> - [(set (attr "type")
> - (cond [(match_test "<CODE> == MULT")
> - (const_string "ssemul")
> - (match_test "<CODE> == DIV")
> - (const_string "ssediv")]
> - (const_string "sseadd")))
> - (set_attr "prefix" "evex")
> - (set_attr "mode" "V8HF")])
> +{
> + rtx op2 = gen_reg_rtx (V8HFmode);
> + rtx op1 = gen_reg_rtx (V8HFmode);
> + rtx op0 = gen_reg_rtx (V8HFmode);
> +
> + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
> + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> +
> + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> +
> + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> + DONE;
> +})
> +
> +(define_expand "divv4hf3"
> + [(set (match_operand:V4HF 0 "register_operand")
> + (div:V4HF
> + (match_operand:V4HF 1 "nonimmediate_operand")
> + (match_operand:V4HF 2 "nonimmediate_operand")))]
> + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> +{
> + rtx op2 = gen_reg_rtx (V8HFmode);
> + rtx op1 = gen_reg_rtx (V8HFmode);
> + rtx op0 = gen_reg_rtx (V8HFmode);
> +
> + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
> + force_reg (V4HFmode, CONST1_RTX (V4HFmode)));
> + emit_insn (gen_rtx_SET (op2, tmp));
> + emit_insn (gen_divv8hf3 (op0, op1, op2));
> + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> + DONE;
> +})
> +
> +(define_expand "movd_v2hf_to_sse"
> + [(set (match_operand:V8HF 0 "register_operand")
> + (vec_merge:V8HF
> + (vec_duplicate:V8HF
> + (match_operand:V2HF 1 "nonimmediate_operand"))
> + (match_operand:V8HF 2 "reg_or_0_operand")
> + (const_int 3)))]
> + "TARGET_SSE")
> +
> +(define_expand "<insn>v2hf3"
> + [(set (match_operand:V2HF 0 "register_operand")
> + (plusminusmult:V2HF
> + (match_operand:V2HF 1 "nonimmediate_operand")
> + (match_operand:V2HF 2 "nonimmediate_operand")))]
> + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> +{
> + rtx op2 = gen_reg_rtx (V8HFmode);
> + rtx op1 = gen_reg_rtx (V8HFmode);
> + rtx op0 = gen_reg_rtx (V8HFmode);
> +
> + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode)));
> + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> +
> + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> + DONE;
> +})
> +
> +(define_expand "divv2hf3"
> + [(set (match_operand:V2HF 0 "register_operand")
> + (div:V2HF
> + (match_operand:V2HF 1 "nonimmediate_operand")
> + (match_operand:V2HF 2 "nonimmediate_operand")))]
> + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> +{
> + rtx op2 = gen_reg_rtx (V8HFmode);
> + rtx op1 = gen_reg_rtx (V8HFmode);
> + rtx op0 = gen_reg_rtx (V8HFmode);
> +
> + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2],
> + force_reg (V8HFmode, CONST1_RTX (V8HFmode))));
> + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> + emit_insn (gen_divv8hf3 (op0, op1, op2));
> +
> + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> + DONE;
> +})
> +
>
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> ;;
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index ab455c3e297..7383a50ee0d 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -430,6 +430,9 @@ (define_mode_iterator VF_512
> (define_mode_iterator VFB_512
> [V32HF V16SF V8DF])
>
> +(define_mode_iterator V4SF_V8HF
> + [V4SF V8HF])
> +
> (define_mode_iterator VI48_AVX512VL
> [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
> V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
> @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse"
> (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
> (set_attr "mode" "V4SF,SF,DI,DI")])
>
> -(define_insn "*vec_concatv4sf"
> - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v")
> - (vec_concat:V4SF
> - (match_operand:V2SF 1 "register_operand" " 0,v,0,v")
> - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))]
> +(define_insn "*vec_concat<mode>"
> + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v")
> + (vec_concat:V4SF_V8HF
> + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
> + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
> "TARGET_SSE"
> "@
> movlhps\t{%2, %0|%0, %2}
> @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf"
> (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
> (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
>
> -(define_insn "*vec_concatv4sf_0"
> - [(set (match_operand:V4SF 0 "register_operand" "=v")
> - (vec_concat:V4SF
> - (match_operand:V2SF 1 "nonimmediate_operand" "vm")
> - (match_operand:V2SF 2 "const0_operand")))]
> +(define_insn "*vec_concat<mode>_0"
> + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v")
> + (vec_concat:V4SF_V8HF
> + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
> + (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
> "TARGET_SSE2"
> "%vmovq\t{%1, %0|%0, %1}"
> [(set_attr "type" "ssemov")
> (set_attr "prefix" "maybe_vex")
> (set_attr "mode" "DF")])
>
> +(define_insn "*vec_concatv8hf_movss"
> + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v")
> + (vec_merge:V8HF
> + (vec_duplicate:V8HF
> + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v"))
> + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" )
> + (const_int 3)))]
> + "TARGET_SSE"
> + "@
> + movss\t{%2, %0|%0, %2}
> + %vmovss\t{%2, %0|%0, %2}
> + vmovss\t{%2, %1, %0|%0, %1, %2}"
> + [(set_attr "isa" "noavx,*,avx")
> + (set_attr "type" "ssemov")
> + (set_attr "prefix" "orig,maybe_vex,maybe_vex")
> + (set_attr "mode" "SF")])
> +
> ;; Avoid combining registers from different units in a single alternative,
> ;; see comment above inline_secondary_memory_needed function in i386.cc
> (define_insn "vec_set<mode>_0"
> diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> new file mode 100644
> index 00000000000..332784ac694
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> @@ -0,0 +1,57 @@
> +/* PR target/110762 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */
> +
> +typedef _Float16 v4hf __attribute__((vector_size(8)));
> +typedef _Float16 v2hf __attribute__((vector_size(4)));
> +
> +v4hf
> +foo (v4hf a, v4hf b)
> +{
> + return a + b;
> +}
> +
> +v4hf
> +foo2 (v4hf a, v4hf b)
> +{
> + return a - b;
> +}
> +
> +v4hf
> +foo3 (v4hf a, v4hf b)
> +{
> + return a * b;
> +}
> +
> +v4hf
> +foo1 (v4hf a, v4hf b)
> +{
> + return a / b;
> +}
> +
> +v2hf
> +foo4 (v2hf a, v2hf b)
> +{
> + return a + b;
> +}
> +
> +v2hf
> +foo5 (v2hf a, v2hf b)
> +{
> + return a - b;
> +}
> +
> +v2hf
> +foo6 (v2hf a, v2hf b)
> +{
> + return a * b;
> +}
> +
> +v2hf
> +foo7 (v2hf a, v2hf b)
> +{
> + return a / b;
> +}
> +
> +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */
> +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */
> --
> 2.31.1
>
On Mon, Aug 7, 2023 at 11:20 AM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Similar like r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR target/110762
> > * config/i386/mmx.md (<insn><mode>3): Changed from define_insn
> > to define_expand and break into ..
> > (<insn>v4hf3): .. this.
> > (divv4hf3): .. this.
> > (<insn>v2hf3): .. this.
> > (divv2hf3): .. this.
> > (movd_v2hf_to_sse): New define_expand.
> > (movq_<mode>_to_sse): Extend to V4HFmode.
> > (mmxdoublevecmode): Ditto.
> > (V2FI_V4HF): New mode iterator.
> > * config/i386/sse.md (*vec_concatv4sf): Extend to hanlde V8HF
> > by using mode iterator V4SF_V8HF, renamed to ..
> > (*vec_concat<mode>): .. this.
> > (*vec_concatv4sf_0): Extend to handle V8HF by using mode
> > iterator V4SF_V8HF, renamed to ..
> > (*vec_concat<mode>_0): .. this.
> > (*vec_concatv8hf_movss): New define_insn.
> > (V4SF_V8HF): New mode iterator.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr110762-v4hf.c: New test.
>
> LGTM.
>
> Please also note the RFC patch [1] that relaxes clears for V2SFmode
> with -fno-trapping-math. The patched compiler will then emit the same
> code as clang does for -O2. Which raises another question - should gcc
> default to -fno-trapping-math?
I think we discussed this before and yes, IMHO we should default to
-fno-trapping-math at least for C/C++ to be consistent with our other
handling of the FP environment (default to -fno-rounding-math) and
lack of proper FENV access barriers for inspecting the exceptions.
Note Fortran has the -ffpe-trap= option which would then need to make
sure to also enable -ftrapping-math. Ada might have similar constraints
(it also uses -fnon-call-exceptions, but unless it enables CPU traps for
FP exceptions that would be a no-op). Note this also shows we should
possibly separate two things: maintaining the IEEE exception state, and
treating changes in the IEEE exception state as something that can cause
CPU traps (conflating the two is also a common source of confusion on the
user side).
Richard.
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html
>
> Thanks,
> Uros.
>
> > ---
> > gcc/config/i386/mmx.md | 109 +++++++++++++++---
> > gcc/config/i386/sse.md | 40 +++++--
> > gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++
> > 3 files changed, 177 insertions(+), 29 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> >
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index 896af76a33f..88bdf084f54 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
> > ;; V2S* modes
> > (define_mode_iterator V2FI [V2SF V2SI])
> >
> > -;; 4-byte and 8-byte float16 vector modes
> > -(define_mode_iterator VHF_32_64 [V4HF V2HF])
> > -
> > +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
> > ;; Mapping from integer vector mode to mnemonic suffix
> > (define_mode_attr mmxvecsize
> > [(V8QI "b") (V4QI "b") (V2QI "b")
> > @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
> >
> > ;; Mapping of vector modes to a vector mode of double size
> > (define_mode_attr mmxdoublevecmode
> > - [(V2SF "V4SF") (V2SI "V4SI")])
> > + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
> >
> > ;; Mapping of vector modes back to the scalar modes
> > (define_mode_attr mmxscalarmode
> > @@ -594,7 +592,7 @@ (define_insn "sse_movntq"
> > (define_expand "movq_<mode>_to_sse"
> > [(set (match_operand:<mmxdoublevecmode> 0 "register_operand")
> > (vec_concat:<mmxdoublevecmode>
> > - (match_operand:V2FI 1 "nonimmediate_operand")
> > + (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
> > (match_dup 2)))]
> > "TARGET_SSE2"
> > "operands[2] = CONST0_RTX (<MODE>mode);")
> > @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
> > ;;
> > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >
> > -(define_insn "<insn><mode>3"
> > - [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> > - (plusminusmultdiv:VHF_32_64
> > - (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> > - (match_operand:VHF_32_64 2 "register_operand" "v")))]
> > +(define_expand "<insn>v4hf3"
> > + [(set (match_operand:V4HF 0 "register_operand")
> > + (plusminusmult:V4HF
> > + (match_operand:V4HF 1 "nonimmediate_operand")
> > + (match_operand:V4HF 2 "nonimmediate_operand")))]
> > "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> > - [(set (attr "type")
> > - (cond [(match_test "<CODE> == MULT")
> > - (const_string "ssemul")
> > - (match_test "<CODE> == DIV")
> > - (const_string "ssediv")]
> > - (const_string "sseadd")))
> > - (set_attr "prefix" "evex")
> > - (set_attr "mode" "V8HF")])
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
> > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> > +
> > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "divv4hf3"
> > + [(set (match_operand:V4HF 0 "register_operand")
> > + (div:V4HF
> > + (match_operand:V4HF 1 "nonimmediate_operand")
> > + (match_operand:V4HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> > + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
> > + force_reg (V4HFmode, CONST1_RTX (V4HFmode)));
> > + emit_insn (gen_rtx_SET (op2, tmp));
> > + emit_insn (gen_divv8hf3 (op0, op1, op2));
> > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "movd_v2hf_to_sse"
> > + [(set (match_operand:V8HF 0 "register_operand")
> > + (vec_merge:V8HF
> > + (vec_duplicate:V8HF
> > + (match_operand:V2HF 1 "nonimmediate_operand"))
> > + (match_operand:V8HF 2 "reg_or_0_operand")
> > + (const_int 3)))]
> > + "TARGET_SSE")
> > +
> > +(define_expand "<insn>v2hf3"
> > + [(set (match_operand:V2HF 0 "register_operand")
> > + (plusminusmult:V2HF
> > + (match_operand:V2HF 1 "nonimmediate_operand")
> > + (match_operand:V2HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "divv2hf3"
> > + [(set (match_operand:V2HF 0 "register_operand")
> > + (div:V2HF
> > + (match_operand:V2HF 1 "nonimmediate_operand")
> > + (match_operand:V2HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2],
> > + force_reg (V8HFmode, CONST1_RTX (V8HFmode))));
> > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_divv8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> >
> > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> > ;;
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index ab455c3e297..7383a50ee0d 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -430,6 +430,9 @@ (define_mode_iterator VF_512
> > (define_mode_iterator VFB_512
> > [V32HF V16SF V8DF])
> >
> > +(define_mode_iterator V4SF_V8HF
> > + [V4SF V8HF])
> > +
> > (define_mode_iterator VI48_AVX512VL
> > [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
> > V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
> > @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse"
> > (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
> > (set_attr "mode" "V4SF,SF,DI,DI")])
> >
> > -(define_insn "*vec_concatv4sf"
> > - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v")
> > - (vec_concat:V4SF
> > - (match_operand:V2SF 1 "register_operand" " 0,v,0,v")
> > - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))]
> > +(define_insn "*vec_concat<mode>"
> > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v")
> > + (vec_concat:V4SF_V8HF
> > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
> > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
> > "TARGET_SSE"
> > "@
> > movlhps\t{%2, %0|%0, %2}
> > @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf"
> > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
> > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
> >
> > -(define_insn "*vec_concatv4sf_0"
> > - [(set (match_operand:V4SF 0 "register_operand" "=v")
> > - (vec_concat:V4SF
> > - (match_operand:V2SF 1 "nonimmediate_operand" "vm")
> > - (match_operand:V2SF 2 "const0_operand")))]
> > +(define_insn "*vec_concat<mode>_0"
> > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v")
> > + (vec_concat:V4SF_V8HF
> > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
> > + (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
> > "TARGET_SSE2"
> > "%vmovq\t{%1, %0|%0, %1}"
> > [(set_attr "type" "ssemov")
> > (set_attr "prefix" "maybe_vex")
> > (set_attr "mode" "DF")])
> >
> > +(define_insn "*vec_concatv8hf_movss"
> > + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v")
> > + (vec_merge:V8HF
> > + (vec_duplicate:V8HF
> > + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v"))
> > + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" )
> > + (const_int 3)))]
> > + "TARGET_SSE"
> > + "@
> > + movss\t{%2, %0|%0, %2}
> > + %vmovss\t{%2, %0|%0, %2}
> > + vmovss\t{%2, %1, %0|%0, %1, %2}"
> > + [(set_attr "isa" "noavx,*,avx")
> > + (set_attr "type" "ssemov")
> > + (set_attr "prefix" "orig,maybe_vex,maybe_vex")
> > + (set_attr "mode" "SF")])
> > +
> > ;; Avoid combining registers from different units in a single alternative,
> > ;; see comment above inline_secondary_memory_needed function in i386.cc
> > (define_insn "vec_set<mode>_0"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> > new file mode 100644
> > index 00000000000..332784ac694
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> > @@ -0,0 +1,57 @@
> > +/* PR target/110762 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */
> > +
> > +typedef _Float16 v4hf __attribute__((vector_size(8)));
> > +typedef _Float16 v2hf __attribute__((vector_size(4)));
> > +
> > +v4hf
> > +foo (v4hf a, v4hf b)
> > +{
> > + return a + b;
> > +}
> > +
> > +v4hf
> > +foo2 (v4hf a, v4hf b)
> > +{
> > + return a - b;
> > +}
> > +
> > +v4hf
> > +foo3 (v4hf a, v4hf b)
> > +{
> > + return a * b;
> > +}
> > +
> > +v4hf
> > +foo1 (v4hf a, v4hf b)
> > +{
> > + return a / b;
> > +}
> > +
> > +v2hf
> > +foo4 (v2hf a, v2hf b)
> > +{
> > + return a + b;
> > +}
> > +
> > +v2hf
> > +foo5 (v2hf a, v2hf b)
> > +{
> > + return a - b;
> > +}
> > +
> > +v2hf
> > +foo6 (v2hf a, v2hf b)
> > +{
> > + return a * b;
> > +}
> > +
> > +v2hf
> > +foo7 (v2hf a, v2hf b)
> > +{
> > + return a / b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */
> > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */
> > --
> > 2.31.1
> >
On Mon, Aug 7, 2023 at 5:19 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Aug 7, 2023 at 10:57 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > Similar like r14-2786-gade30fad6669e5, the patch is for V4HF/V2HFmode.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR target/110762
> > * config/i386/mmx.md (<insn><mode>3): Changed from define_insn
> > to define_expand and break into ..
> > (<insn>v4hf3): .. this.
> > (divv4hf3): .. this.
> > (<insn>v2hf3): .. this.
> > (divv2hf3): .. this.
> > (movd_v2hf_to_sse): New define_expand.
> > (movq_<mode>_to_sse): Extend to V4HFmode.
> > (mmxdoublevecmode): Ditto.
> > (V2FI_V4HF): New mode iterator.
> > * config/i386/sse.md (*vec_concatv4sf): Extend to hanlde V8HF
> > by using mode iterator V4SF_V8HF, renamed to ..
> > (*vec_concat<mode>): .. this.
> > (*vec_concatv4sf_0): Extend to handle V8HF by using mode
> > iterator V4SF_V8HF, renamed to ..
> > (*vec_concat<mode>_0): .. this.
> > (*vec_concatv8hf_movss): New define_insn.
> > (V4SF_V8HF): New mode iterator.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr110762-v4hf.c: New test.
>
> LGTM.
>
> Please also note the RFC patch [1] that relaxes clears for V2SFmode
> with -fno-trapping-math. The patched compiler will then emit the same
> code as clang does for -O2. Which raises another question - should gcc
> default to -fno-trapping-math?
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625795.html
>
I can create a separate patch to handle my part of the -fno-trapping-math
optimization.
> Thanks,
> Uros.
>
> > ---
> > gcc/config/i386/mmx.md | 109 +++++++++++++++---
> > gcc/config/i386/sse.md | 40 +++++--
> > gcc/testsuite/gcc.target/i386/pr110762-v4hf.c | 57 +++++++++
> > 3 files changed, 177 insertions(+), 29 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> >
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index 896af76a33f..88bdf084f54 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
> > ;; V2S* modes
> > (define_mode_iterator V2FI [V2SF V2SI])
> >
> > -;; 4-byte and 8-byte float16 vector modes
> > -(define_mode_iterator VHF_32_64 [V4HF V2HF])
> > -
> > +(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
> > ;; Mapping from integer vector mode to mnemonic suffix
> > (define_mode_attr mmxvecsize
> > [(V8QI "b") (V4QI "b") (V2QI "b")
> > @@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
> >
> > ;; Mapping of vector modes to a vector mode of double size
> > (define_mode_attr mmxdoublevecmode
> > - [(V2SF "V4SF") (V2SI "V4SI")])
> > + [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
> >
> > ;; Mapping of vector modes back to the scalar modes
> > (define_mode_attr mmxscalarmode
> > @@ -594,7 +592,7 @@ (define_insn "sse_movntq"
> > (define_expand "movq_<mode>_to_sse"
> > [(set (match_operand:<mmxdoublevecmode> 0 "register_operand")
> > (vec_concat:<mmxdoublevecmode>
> > - (match_operand:V2FI 1 "nonimmediate_operand")
> > + (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
> > (match_dup 2)))]
> > "TARGET_SSE2"
> > "operands[2] = CONST0_RTX (<MODE>mode);")
> > @@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
> > ;;
> > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> >
> > -(define_insn "<insn><mode>3"
> > - [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
> > - (plusminusmultdiv:VHF_32_64
> > - (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
> > - (match_operand:VHF_32_64 2 "register_operand" "v")))]
> > +(define_expand "<insn>v4hf3"
> > + [(set (match_operand:V4HF 0 "register_operand")
> > + (plusminusmult:V4HF
> > + (match_operand:V4HF 1 "nonimmediate_operand")
> > + (match_operand:V4HF 2 "nonimmediate_operand")))]
> > "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > - "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
> > - [(set (attr "type")
> > - (cond [(match_test "<CODE> == MULT")
> > - (const_string "ssemul")
> > - (match_test "<CODE> == DIV")
> > - (const_string "ssediv")]
> > - (const_string "sseadd")))
> > - (set_attr "prefix" "evex")
> > - (set_attr "mode" "V8HF")])
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
> > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> > +
> > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "divv4hf3"
> > + [(set (match_operand:V4HF 0 "register_operand")
> > + (div:V4HF
> > + (match_operand:V4HF 1 "nonimmediate_operand")
> > + (match_operand:V4HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
> > + rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
> > + force_reg (V4HFmode, CONST1_RTX (V4HFmode)));
> > + emit_insn (gen_rtx_SET (op2, tmp));
> > + emit_insn (gen_divv8hf3 (op0, op1, op2));
> > + emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "movd_v2hf_to_sse"
> > + [(set (match_operand:V8HF 0 "register_operand")
> > + (vec_merge:V8HF
> > + (vec_duplicate:V8HF
> > + (match_operand:V2HF 1 "nonimmediate_operand"))
> > + (match_operand:V8HF 2 "reg_or_0_operand")
> > + (const_int 3)))]
> > + "TARGET_SSE")
> > +
> > +(define_expand "<insn>v2hf3"
> > + [(set (match_operand:V2HF 0 "register_operand")
> > + (plusminusmult:V2HF
> > + (match_operand:V2HF 1 "nonimmediate_operand")
> > + (match_operand:V2HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> > +(define_expand "divv2hf3"
> > + [(set (match_operand:V2HF 0 "register_operand")
> > + (div:V2HF
> > + (match_operand:V2HF 1 "nonimmediate_operand")
> > + (match_operand:V2HF 2 "nonimmediate_operand")))]
> > + "TARGET_AVX512FP16 && TARGET_AVX512VL"
> > +{
> > + rtx op2 = gen_reg_rtx (V8HFmode);
> > + rtx op1 = gen_reg_rtx (V8HFmode);
> > + rtx op0 = gen_reg_rtx (V8HFmode);
> > +
> > + emit_insn (gen_movd_v2hf_to_sse (op2, operands[2],
> > + force_reg (V8HFmode, CONST1_RTX (V8HFmode))));
> > + emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
> > + emit_insn (gen_divv8hf3 (op0, op1, op2));
> > +
> > + emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
> > + DONE;
> > +})
> > +
> >
> > ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> > ;;
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index ab455c3e297..7383a50ee0d 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -430,6 +430,9 @@ (define_mode_iterator VF_512
> > (define_mode_iterator VFB_512
> > [V32HF V16SF V8DF])
> >
> > +(define_mode_iterator V4SF_V8HF
> > + [V4SF V8HF])
> > +
> > (define_mode_iterator VI48_AVX512VL
> > [V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
> > V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
> > @@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse"
> > (set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
> > (set_attr "mode" "V4SF,SF,DI,DI")])
> >
> > -(define_insn "*vec_concatv4sf"
> > - [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v")
> > - (vec_concat:V4SF
> > - (match_operand:V2SF 1 "register_operand" " 0,v,0,v")
> > - (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))]
> > +(define_insn "*vec_concat<mode>"
> > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v")
> > + (vec_concat:V4SF_V8HF
> > + (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
> > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
> > "TARGET_SSE"
> > "@
> > movlhps\t{%2, %0|%0, %2}
> > @@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf"
> > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
> > (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
> >
> > -(define_insn "*vec_concatv4sf_0"
> > - [(set (match_operand:V4SF 0 "register_operand" "=v")
> > - (vec_concat:V4SF
> > - (match_operand:V2SF 1 "nonimmediate_operand" "vm")
> > - (match_operand:V2SF 2 "const0_operand")))]
> > +(define_insn "*vec_concat<mode>_0"
> > + [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v")
> > + (vec_concat:V4SF_V8HF
> > + (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
> > + (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
> > "TARGET_SSE2"
> > "%vmovq\t{%1, %0|%0, %1}"
> > [(set_attr "type" "ssemov")
> > (set_attr "prefix" "maybe_vex")
> > (set_attr "mode" "DF")])
> >
> > +(define_insn "*vec_concatv8hf_movss"
> > + [(set (match_operand:V8HF 0 "register_operand" "=x,v,v")
> > + (vec_merge:V8HF
> > + (vec_duplicate:V8HF
> > + (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v"))
> > + (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" )
> > + (const_int 3)))]
> > + "TARGET_SSE"
> > + "@
> > + movss\t{%2, %0|%0, %2}
> > + %vmovss\t{%2, %0|%0, %2}
> > + vmovss\t{%2, %1, %0|%0, %1, %2}"
> > + [(set_attr "isa" "noavx,*,avx")
> > + (set_attr "type" "ssemov")
> > + (set_attr "prefix" "orig,maybe_vex,maybe_vex")
> > + (set_attr "mode" "SF")])
> > +
> > ;; Avoid combining registers from different units in a single alternative,
> > ;; see comment above inline_secondary_memory_needed function in i386.cc
> > (define_insn "vec_set<mode>_0"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> > new file mode 100644
> > index 00000000000..332784ac694
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr110762-v4hf.c
> > @@ -0,0 +1,57 @@
> > +/* PR target/110762 */
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */
> > +
> > +typedef _Float16 v4hf __attribute__((vector_size(8)));
> > +typedef _Float16 v2hf __attribute__((vector_size(4)));
> > +
> > +v4hf
> > +foo (v4hf a, v4hf b)
> > +{
> > + return a + b;
> > +}
> > +
> > +v4hf
> > +foo2 (v4hf a, v4hf b)
> > +{
> > + return a - b;
> > +}
> > +
> > +v4hf
> > +foo3 (v4hf a, v4hf b)
> > +{
> > + return a * b;
> > +}
> > +
> > +v4hf
> > +foo1 (v4hf a, v4hf b)
> > +{
> > + return a / b;
> > +}
> > +
> > +v2hf
> > +foo4 (v2hf a, v2hf b)
> > +{
> > + return a + b;
> > +}
> > +
> > +v2hf
> > +foo5 (v2hf a, v2hf b)
> > +{
> > + return a - b;
> > +}
> > +
> > +v2hf
> > +foo6 (v2hf a, v2hf b)
> > +{
> > + return a * b;
> > +}
> > +
> > +v2hf
> > +foo7 (v2hf a, v2hf b)
> > +{
> > + return a / b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */
> > +/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */
> > --
> > 2.31.1
> >
On Mon, Aug 7, 2023 at 1:20 PM Richard Biener
<richard.guenther@gmail.com> wrote:
> > Please also note the RFC patch [1] that relaxes clears for V2SFmode
> > with -fno-trapping-math. The patched compiler will then emit the same
> > code as clang does for -O2. Which raises another question - should gcc
> > default to -fno-trapping-math?
>
> I think we discussed this before and yes, IMHO we should default to
> -fno-trapping-math at least for C/C++ to be consistent with our other
> handling of the FP environment (default to -fno-rounding-math) and
> lack of proper FENV access barriers for inspecting the exceptions.
>
> Note Fortran has the -ffpe-trap= option which would then need to make
> sure to also enable -ftrapping-math. Ada might have similar constraints
> (it also uses -fnon-call-exceptions, but unless it enables CPU traps for
> FP exceptions that would be a no-op). Note this also shows we should
> possibly separate maintaining the IEEE exception state and considering
> changes in the IEEE exception states to cause CPU traps (that's also
> a source of common confusion on the user side).
FTR: PR54192, "-fno-trapping-math by default?" [1]
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54192
Uros.
@@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
;; V2S* modes
(define_mode_iterator V2FI [V2SF V2SI])
-;; 4-byte and 8-byte float16 vector modes
-(define_mode_iterator VHF_32_64 [V4HF V2HF])
-
+(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
;; Mapping from integer vector mode to mnemonic suffix
(define_mode_attr mmxvecsize
[(V8QI "b") (V4QI "b") (V2QI "b")
@@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
;; Mapping of vector modes to a vector mode of double size
(define_mode_attr mmxdoublevecmode
- [(V2SF "V4SF") (V2SI "V4SI")])
+ [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
;; Mapping of vector modes back to the scalar modes
(define_mode_attr mmxscalarmode
@@ -594,7 +592,7 @@ (define_insn "sse_movntq"
(define_expand "movq_<mode>_to_sse"
[(set (match_operand:<mmxdoublevecmode> 0 "register_operand")
(vec_concat:<mmxdoublevecmode>
- (match_operand:V2FI 1 "nonimmediate_operand")
+ (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
(match_dup 2)))]
"TARGET_SSE2"
"operands[2] = CONST0_RTX (<MODE>mode);")
@@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(define_insn "<insn><mode>3"
- [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
- (plusminusmultdiv:VHF_32_64
- (match_operand:VHF_32_64 1 "register_operand" "<comm>v")
- (match_operand:VHF_32_64 2 "register_operand" "v")))]
+(define_expand "<insn>v4hf3"
+ [(set (match_operand:V4HF 0 "register_operand")
+ (plusminusmult:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "nonimmediate_operand")))]
"TARGET_AVX512FP16 && TARGET_AVX512VL"
- "v<insn>ph\t{%2, %1, %0|%0, %1, %2}"
- [(set (attr "type")
- (cond [(match_test "<CODE> == MULT")
- (const_string "ssemul")
- (match_test "<CODE> == DIV")
- (const_string "ssediv")]
- (const_string "sseadd")))
- (set_attr "prefix" "evex")
- (set_attr "mode" "V8HF")])
+{
+ rtx op2 = gen_reg_rtx (V8HFmode);
+ rtx op1 = gen_reg_rtx (V8HFmode);
+ rtx op0 = gen_reg_rtx (V8HFmode);
+
+ emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+ emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+ emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
+
+ emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+ DONE;
+})
+
+(define_expand "divv4hf3" ; V4HF division carried out as a V8HF division
+ [(set (match_operand:V4HF 0 "register_operand")
+ (div:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "register_operand")))] ; REG only: used as the first vec_concat half below
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ rtx op2 = gen_reg_rtx (V8HFmode);
+ rtx op1 = gen_reg_rtx (V8HFmode);
+ rtx op0 = gen_reg_rtx (V8HFmode);
+
+ emit_insn (gen_movq_v4hf_to_sse (op1, operands[1])); /* Dividend: concatenated with a zero upper half. */
+ rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
+ force_reg (V4HFmode, CONST1_RTX (V4HFmode))); /* Divisor: upper half is CONST1, not zero. */
+ emit_insn (gen_rtx_SET (op2, tmp)); /* Must match *vec_concatv8hf, whose first half is register_operand. */
+ emit_insn (gen_divv8hf3 (op0, op1, op2));
+ emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode)); /* Keep only the low V4HF part. */
+ DONE;
+})
+
+(define_expand "movd_v2hf_to_sse"
+ [(set (match_operand:V8HF 0 "register_operand")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (match_operand:V2HF 1 "nonimmediate_operand"))
+ (match_operand:V8HF 2 "reg_or_0_operand")
+ (const_int 3)))]
+ "TARGET_SSE")
+
+(define_expand "<insn>v2hf3"
+ [(set (match_operand:V2HF 0 "register_operand")
+ (plusminusmult:V2HF
+ (match_operand:V2HF 1 "nonimmediate_operand")
+ (match_operand:V2HF 2 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ rtx op2 = gen_reg_rtx (V8HFmode);
+ rtx op1 = gen_reg_rtx (V8HFmode);
+ rtx op0 = gen_reg_rtx (V8HFmode);
+
+ emit_insn (gen_movd_v2hf_to_sse (op2, operands[2], CONST0_RTX (V8HFmode)));
+ emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
+ emit_insn (gen_<insn>v8hf3 (op0, op1, op2));
+
+ emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
+ DONE;
+})
+
+(define_expand "divv2hf3"
+ [(set (match_operand:V2HF 0 "register_operand")
+ (div:V2HF
+ (match_operand:V2HF 1 "nonimmediate_operand")
+ (match_operand:V2HF 2 "nonimmediate_operand")))]
+ "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+ rtx op2 = gen_reg_rtx (V8HFmode);
+ rtx op1 = gen_reg_rtx (V8HFmode);
+ rtx op0 = gen_reg_rtx (V8HFmode);
+
+ emit_insn (gen_movd_v2hf_to_sse (op2, operands[2],
+ force_reg (V8HFmode, CONST1_RTX (V8HFmode))));
+ emit_insn (gen_movd_v2hf_to_sse (op1, operands[1], CONST0_RTX (V8HFmode)));
+ emit_insn (gen_divv8hf3 (op0, op1, op2));
+
+ emit_move_insn (operands[0], lowpart_subreg (V2HFmode, op0, V8HFmode));
+ DONE;
+})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
@@ -430,6 +430,9 @@ (define_mode_iterator VF_512
(define_mode_iterator VFB_512
[V32HF V16SF V8DF])
+(define_mode_iterator V4SF_V8HF
+ [V4SF V8HF])
+
(define_mode_iterator VI48_AVX512VL
[V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
@@ -10873,11 +10876,11 @@ (define_insn "*vec_concatv2sf_sse"
(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov")
(set_attr "mode" "V4SF,SF,DI,DI")])
-(define_insn "*vec_concatv4sf"
- [(set (match_operand:V4SF 0 "register_operand" "=x,v,x,v")
- (vec_concat:V4SF
- (match_operand:V2SF 1 "register_operand" " 0,v,0,v")
- (match_operand:V2SF 2 "nonimmediate_operand" " x,v,m,m")))]
+(define_insn "*vec_concat<mode>"
+ [(set (match_operand:V4SF_V8HF 0 "register_operand" "=x,v,x,v")
+ (vec_concat:V4SF_V8HF
+ (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
+ (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
"TARGET_SSE"
"@
movlhps\t{%2, %0|%0, %2}
@@ -10889,17 +10892,34 @@ (define_insn "*vec_concatv4sf"
(set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
-(define_insn "*vec_concatv4sf_0"
- [(set (match_operand:V4SF 0 "register_operand" "=v")
- (vec_concat:V4SF
- (match_operand:V2SF 1 "nonimmediate_operand" "vm")
- (match_operand:V2SF 2 "const0_operand")))]
+(define_insn "*vec_concat<mode>_0"
+ [(set (match_operand:V4SF_V8HF 0 "register_operand" "=v")
+ (vec_concat:V4SF_V8HF
+ (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
+ (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
"TARGET_SSE2"
"%vmovq\t{%1, %0|%0, %1}"
[(set_attr "type" "ssemov")
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "DF")])
+(define_insn "*vec_concatv8hf_movss"
+ [(set (match_operand:V8HF 0 "register_operand" "=x,v,v")
+ (vec_merge:V8HF
+ (vec_duplicate:V8HF
+ (match_operand:V2HF 2 "nonimmediate_operand" "x,m,v"))
+ (match_operand:V8HF 1 "reg_or_0_operand" "0,C,v" )
+ (const_int 3)))]
+ "TARGET_SSE"
+ "@
+ movss\t{%2, %0|%0, %2}
+ %vmovss\t{%2, %0|%0, %2}
+ vmovss\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,*,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix" "orig,maybe_vex,maybe_vex")
+ (set_attr "mode" "SF")])
+
;; Avoid combining registers from different units in a single alternative,
;; see comment above inline_secondary_memory_needed function in i386.cc
(define_insn "vec_set<mode>_0"
new file mode 100644
@@ -0,0 +1,57 @@
+/* PR target/110762 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl -dp" } */
+
+typedef _Float16 v4hf __attribute__((vector_size(8)));
+typedef _Float16 v2hf __attribute__((vector_size(4)));
+
+v4hf
+foo (v4hf a, v4hf b)
+{
+ return a + b;
+}
+
+v4hf
+foo2 (v4hf a, v4hf b)
+{
+ return a - b;
+}
+
+v4hf
+foo3 (v4hf a, v4hf b)
+{
+ return a * b;
+}
+
+v4hf
+foo1 (v4hf a, v4hf b)
+{
+ return a / b;
+}
+
+v2hf
+foo4 (v2hf a, v2hf b)
+{
+ return a + b;
+}
+
+v2hf
+foo5 (v2hf a, v2hf b)
+{
+ return a - b;
+}
+
+v2hf
+foo6 (v2hf a, v2hf b)
+{
+ return a * b;
+}
+
+v2hf
+foo7 (v2hf a, v2hf b)
+{
+ return a / b;
+}
+
+/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_0" 7 } } */
+/* { dg-final { scan-assembler-times "\\*vec_concatv8hf_movss" 8 } } */