Support 64-bit vectorization for single-precision floating-point rounding operations.
Checks
Commit Message
Here is the list of operations this patch supports:
rint/nearbyint/ceil/floor/trunc/lrint/lceil/lfloor/round/lround.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?
gcc/ChangeLog:
PR target/106910
* config/i386/mmx.md (nearbyintv2sf2): New expander.
(rintv2sf2): Ditto.
(ceilv2sf2): Ditto.
(lceilv2sfv2si2): Ditto.
(floorv2sf2): Ditto.
(lfloorv2sfv2si2): Ditto.
(btruncv2sf2): Ditto.
(lrintv2sfv2si2): Ditto.
(roundv2sf2): Ditto.
(lroundv2sfv2si2): Ditto.
(*mmx_roundv2sf2): New define_insn.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr106910-1.c: New test.
---
gcc/config/i386/mmx.md | 154 +++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr106910-1.c | 77 +++++++++++
2 files changed, 231 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr106910-1.c
Comments
On Tue, Sep 20, 2022 at 10:14 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Here's list the patch supported.
> rint/nearbyint/ceil/floor/trunc/lrint/lceil/lfloor/round/lround.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/106910
> * config/i386/mmx.md (nearbyintv2sf2): New expander.
> (rintv2sf2): Ditto.
> (ceilv2sf2): Ditto.
> (lceilv2sfv2si2): Ditto.
> (floorv2sf2): Ditto.
> (lfloorv2sfv2si2): Ditto.
> (btruncv2sf2): Ditto.
> (lrintv2sfv2si2): Ditto.
> (roundv2sf2): Ditto.
> (lroundv2sfv2si2): Ditto.
> (*mmx_roundv2sf2): New define_insn.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr106910-1.c: New test.
> ---
> gcc/config/i386/mmx.md | 154 +++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106910-1.c | 77 +++++++++++
> 2 files changed, 231 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106910-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index dda4b43f5c1..222a041de58 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -1627,6 +1627,160 @@ (define_expand "vec_initv2sfsf"
> DONE;
> })
>
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;; Parallel single-precision floating point rounding operations.
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +(define_expand "nearbyintv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
> +
> +(define_expand "rintv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_MXCSR);")
> +
> +(define_expand "ceilv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_CEIL | ROUND_NO_EXC);")
> +
> +(define_expand "lceilv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_ceilv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +(define_expand "floorv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "vector_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_FLOOR | ROUND_NO_EXC);")
> +
> +(define_expand "lfloorv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_floorv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +(define_expand "btruncv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math"
> + "operands[2] = GEN_INT (ROUND_TRUNC | ROUND_NO_EXC);")
> +
> +(define_insn "*mmx_roundv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,v")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand" "Yr,x,v")
> + (match_operand:SI 2 "const_0_to_15_operand")]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "%vroundps\t{%2, %1, %0|%0, %1, %2}"
> + [(set_attr "isa" "noavx,noavx,avx")
> + (set_attr "type" "ssecvt")
> + (set_attr "prefix_data16" "1,1,*")
> + (set_attr "prefix_extra" "1")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "orig,orig,vex")
> + (set_attr "mode" "V4SF")])
> +
> +(define_insn "lrintv2sfv2si2"
> + [(set (match_operand:V2SI 0 "register_operand" "=v")
> + (unspec:V2SI
> + [(match_operand:V2SF 1 "register_operand" "v")]
> + UNSPEC_FIX_NOTRUNC))]
> + "TARGET_MMX_WITH_SSE"
> + "%vcvtps2dq\t{%1, %0|%0, %1}"
> + [(set_attr "type" "ssecvt")
> + (set (attr "prefix_data16")
> + (if_then_else
> + (match_test "TARGET_AVX")
> + (const_string "*")
> + (const_string "1")))
> + (set_attr "prefix" "maybe_vex")
> + (set_attr "mode" "TI")])
> +
> +(define_expand "roundv2sf2"
> + [(set (match_dup 3)
> + (plus:V2SF
> + (match_operand:V2SF 1 "register_operand")
> + (match_dup 2)))
> + (set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_dup 3) (match_dup 4)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + const struct real_format *fmt;
> + REAL_VALUE_TYPE pred_half, half_minus_pred_half;
> + rtx half, vec_half;
> +
> + /* load nextafter (0.5, 0.0) */
> + fmt = REAL_MODE_FORMAT (SFmode);
> + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, SFmode);
> + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
> + half = const_double_from_real_value (pred_half, SFmode);
> +
> + vec_half = ix86_build_const_vector (V2SFmode, true, half);
> + vec_half = force_reg (V2SFmode, vec_half);
> +
> + operands[2] = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_copysignv2sf3 (operands[2], vec_half, operands[1]));
> +
> + operands[3] = gen_reg_rtx (V2SFmode);
> + operands[4] = GEN_INT (ROUND_TRUNC);
> +})
> +
> +(define_expand "lroundv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_roundv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> ;;
> ;; Parallel half-precision floating point arithmetic
> diff --git a/gcc/testsuite/gcc.target/i386/pr106910-1.c b/gcc/testsuite/gcc.target/i386/pr106910-1.c
> new file mode 100644
> index 00000000000..c7685a32183
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106910-1.c
> @@ -0,0 +1,77 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse4.1 -O2 -Ofast" } */
> +/* { dg-final { scan-assembler-times "roundps" 9 } } */
> +/* { dg-final { scan-assembler-times "cvtps2dq" 1 } } */
> +/* { dg-final { scan-assembler-times "cvttps2dq" 3 } } */
> +
> +#include<math.h>
> +
> +void
> +foo (float* p, float* __restrict q)
> +{
> + p[0] = truncf (q[0]);
> + p[1] = truncf (q[1]);
> +}
> +
> +void
> +foo1 (float* p, float* __restrict q)
> +{
> + p[0] = floorf (q[0]);
> + p[1] = floorf (q[1]);
> +}
> +
> +void
> +foo1i (int* p, float* __restrict q)
> +{
> + p[0] = (int) floorf (q[0]);
> + p[1] = (int) floorf (q[1]);
> +}
> +
> +void
> +foo2 (float* p, float* __restrict q)
> +{
> + p[0] = ceilf (q[0]);
> + p[1] = ceilf (q[1]);
> +}
> +
> +void
> +foo2i (int* p, float* __restrict q)
> +{
> + p[0] = (int) ceilf (q[0]);
> + p[1] = (int) ceilf (q[1]);
> +}
> +
> +void
> +foo3 (float* p, float* __restrict q)
> +{
> + p[0] = rintf (q[0]);
> + p[1] = rintf (q[1]);
> +}
> +
> +void
> +foo3i (int* p, float* __restrict q)
> +{
> + p[0] = (int) rintf (q[0]);
> + p[1] = (int) rintf (q[1]);
> +}
> +
> +void
> +foo4 (float* p, float* __restrict q)
> +{
> + p[0] = nearbyintf (q[0]);
> + p[1] = nearbyintf (q[1]);
> +}
> +
> +void
> +foo5(float* p, float* __restrict q)
> +{
> + p[0] = roundf (q[0]);
> + p[1] = roundf (q[1]);
> +}
> +
> +void
> +foo5i(int* p, float* __restrict q)
> +{
> + p[0] = (int) roundf (q[0]);
> + p[1] = (int) roundf (q[1]);
> +}
> --
> 2.27.0
>
On Tue, Sep 20, 2022 at 4:15 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Here's list the patch supported.
> rint/nearbyint/ceil/floor/trunc/lrint/lceil/lfloor/round/lround.
>
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/106910
> * config/i386/mmx.md (nearbyintv2sf2): New expander.
> (rintv2sf2): Ditto.
> (ceilv2sf2): Ditto.
> (lceilv2sfv2si2): Ditto.
> (floorv2sf2): Ditto.
> (lfloorv2sfv2si2): Ditto.
> (btruncv2sf2): Ditto.
> (lrintv2sfv2si2): Ditto.
> (roundv2sf2): Ditto.
> (lroundv2sfv2si2): Ditto.
> (*mmx_roundv2sf2): New define_insn.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr106910-1.c: New test.
OK.
Thanks,
Uros.
> ---
> gcc/config/i386/mmx.md | 154 +++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr106910-1.c | 77 +++++++++++
> 2 files changed, 231 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr106910-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index dda4b43f5c1..222a041de58 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -1627,6 +1627,160 @@ (define_expand "vec_initv2sfsf"
> DONE;
> })
>
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +;;
> +;; Parallel single-precision floating point rounding operations.
> +;;
> +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> +
> +(define_expand "nearbyintv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
> +
> +(define_expand "rintv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_MXCSR);")
> +
> +(define_expand "ceilv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_CEIL | ROUND_NO_EXC);")
> +
> +(define_expand "lceilv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_ceilv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +(define_expand "floorv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "vector_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> + "operands[2] = GEN_INT (ROUND_FLOOR | ROUND_NO_EXC);")
> +
> +(define_expand "lfloorv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_floorv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +(define_expand "btruncv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand")
> + (match_dup 2)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math"
> + "operands[2] = GEN_INT (ROUND_TRUNC | ROUND_NO_EXC);")
> +
> +(define_insn "*mmx_roundv2sf2"
> + [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,v")
> + (unspec:V2SF
> + [(match_operand:V2SF 1 "register_operand" "Yr,x,v")
> + (match_operand:SI 2 "const_0_to_15_operand")]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
> + "%vroundps\t{%2, %1, %0|%0, %1, %2}"
> + [(set_attr "isa" "noavx,noavx,avx")
> + (set_attr "type" "ssecvt")
> + (set_attr "prefix_data16" "1,1,*")
> + (set_attr "prefix_extra" "1")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "orig,orig,vex")
> + (set_attr "mode" "V4SF")])
> +
> +(define_insn "lrintv2sfv2si2"
> + [(set (match_operand:V2SI 0 "register_operand" "=v")
> + (unspec:V2SI
> + [(match_operand:V2SF 1 "register_operand" "v")]
> + UNSPEC_FIX_NOTRUNC))]
> + "TARGET_MMX_WITH_SSE"
> + "%vcvtps2dq\t{%1, %0|%0, %1}"
> + [(set_attr "type" "ssecvt")
> + (set (attr "prefix_data16")
> + (if_then_else
> + (match_test "TARGET_AVX")
> + (const_string "*")
> + (const_string "1")))
> + (set_attr "prefix" "maybe_vex")
> + (set_attr "mode" "TI")])
> +
> +(define_expand "roundv2sf2"
> + [(set (match_dup 3)
> + (plus:V2SF
> + (match_operand:V2SF 1 "register_operand")
> + (match_dup 2)))
> + (set (match_operand:V2SF 0 "register_operand")
> + (unspec:V2SF
> + [(match_dup 3) (match_dup 4)]
> + UNSPEC_ROUND))]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + const struct real_format *fmt;
> + REAL_VALUE_TYPE pred_half, half_minus_pred_half;
> + rtx half, vec_half;
> +
> + /* load nextafter (0.5, 0.0) */
> + fmt = REAL_MODE_FORMAT (SFmode);
> + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, SFmode);
> + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
> + half = const_double_from_real_value (pred_half, SFmode);
> +
> + vec_half = ix86_build_const_vector (V2SFmode, true, half);
> + vec_half = force_reg (V2SFmode, vec_half);
> +
> + operands[2] = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_copysignv2sf3 (operands[2], vec_half, operands[1]));
> +
> + operands[3] = gen_reg_rtx (V2SFmode);
> + operands[4] = GEN_INT (ROUND_TRUNC);
> +})
> +
> +(define_expand "lroundv2sfv2si2"
> + [(match_operand:V2SI 0 "register_operand")
> + (match_operand:V2SF 1 "register_operand")]
> + "TARGET_SSE4_1 && !flag_trapping_math
> + && TARGET_MMX_WITH_SSE"
> +{
> + rtx tmp = gen_reg_rtx (V2SFmode);
> + emit_insn (gen_roundv2sf2 (tmp, operands[1]));
> + emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
> + DONE;
> +})
> +
> +
> ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
> ;;
> ;; Parallel half-precision floating point arithmetic
> diff --git a/gcc/testsuite/gcc.target/i386/pr106910-1.c b/gcc/testsuite/gcc.target/i386/pr106910-1.c
> new file mode 100644
> index 00000000000..c7685a32183
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106910-1.c
> @@ -0,0 +1,77 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-msse4.1 -O2 -Ofast" } */
> +/* { dg-final { scan-assembler-times "roundps" 9 } } */
> +/* { dg-final { scan-assembler-times "cvtps2dq" 1 } } */
> +/* { dg-final { scan-assembler-times "cvttps2dq" 3 } } */
> +
> +#include<math.h>
> +
> +void
> +foo (float* p, float* __restrict q)
> +{
> + p[0] = truncf (q[0]);
> + p[1] = truncf (q[1]);
> +}
> +
> +void
> +foo1 (float* p, float* __restrict q)
> +{
> + p[0] = floorf (q[0]);
> + p[1] = floorf (q[1]);
> +}
> +
> +void
> +foo1i (int* p, float* __restrict q)
> +{
> + p[0] = (int) floorf (q[0]);
> + p[1] = (int) floorf (q[1]);
> +}
> +
> +void
> +foo2 (float* p, float* __restrict q)
> +{
> + p[0] = ceilf (q[0]);
> + p[1] = ceilf (q[1]);
> +}
> +
> +void
> +foo2i (int* p, float* __restrict q)
> +{
> + p[0] = (int) ceilf (q[0]);
> + p[1] = (int) ceilf (q[1]);
> +}
> +
> +void
> +foo3 (float* p, float* __restrict q)
> +{
> + p[0] = rintf (q[0]);
> + p[1] = rintf (q[1]);
> +}
> +
> +void
> +foo3i (int* p, float* __restrict q)
> +{
> + p[0] = (int) rintf (q[0]);
> + p[1] = (int) rintf (q[1]);
> +}
> +
> +void
> +foo4 (float* p, float* __restrict q)
> +{
> + p[0] = nearbyintf (q[0]);
> + p[1] = nearbyintf (q[1]);
> +}
> +
> +void
> +foo5(float* p, float* __restrict q)
> +{
> + p[0] = roundf (q[0]);
> + p[1] = roundf (q[1]);
> +}
> +
> +void
> +foo5i(int* p, float* __restrict q)
> +{
> + p[0] = (int) roundf (q[0]);
> + p[1] = (int) roundf (q[1]);
> +}
> --
> 2.27.0
>
@@ -1627,6 +1627,160 @@ (define_expand "vec_initv2sfsf"
DONE;
})
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Parallel single-precision floating point rounding operations.
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_expand "nearbyintv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "register_operand")
+ (match_dup 2)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "operands[2] = GEN_INT (ROUND_MXCSR | ROUND_NO_EXC);")
+
+(define_expand "rintv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "register_operand")
+ (match_dup 2)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "operands[2] = GEN_INT (ROUND_MXCSR);")
+
+(define_expand "ceilv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "register_operand")
+ (match_dup 2)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+ "operands[2] = GEN_INT (ROUND_CEIL | ROUND_NO_EXC);")
+
+(define_expand "lceilv2sfv2si2"
+ [(match_operand:V2SI 0 "register_operand")
+ (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+{
+ rtx tmp = gen_reg_rtx (V2SFmode);
+ emit_insn (gen_ceilv2sf2 (tmp, operands[1]));
+ emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "floorv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "vector_operand")
+ (match_dup 2)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+ "operands[2] = GEN_INT (ROUND_FLOOR | ROUND_NO_EXC);")
+
+(define_expand "lfloorv2sfv2si2"
+ [(match_operand:V2SI 0 "register_operand")
+ (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+{
+ rtx tmp = gen_reg_rtx (V2SFmode);
+ emit_insn (gen_floorv2sf2 (tmp, operands[1]));
+ emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+ DONE;
+})
+
+(define_expand "btruncv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "register_operand")
+ (match_dup 2)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && !flag_trapping_math"
+ "operands[2] = GEN_INT (ROUND_TRUNC | ROUND_NO_EXC);")
+
+(define_insn "*mmx_roundv2sf2"
+ [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,v")
+ (unspec:V2SF
+ [(match_operand:V2SF 1 "register_operand" "Yr,x,v")
+ (match_operand:SI 2 "const_0_to_15_operand")]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+ "%vroundps\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,noavx,avx")
+ (set_attr "type" "ssecvt")
+ (set_attr "prefix_data16" "1,1,*")
+ (set_attr "prefix_extra" "1")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "orig,orig,vex")
+ (set_attr "mode" "V4SF")])
+
+(define_insn "lrintv2sfv2si2"
+ [(set (match_operand:V2SI 0 "register_operand" "=v")
+ (unspec:V2SI
+ [(match_operand:V2SF 1 "register_operand" "v")]
+ UNSPEC_FIX_NOTRUNC))]
+ "TARGET_MMX_WITH_SSE"
+ "%vcvtps2dq\t{%1, %0|%0, %1}"
+ [(set_attr "type" "ssecvt")
+ (set (attr "prefix_data16")
+ (if_then_else
+ (match_test "TARGET_AVX")
+ (const_string "*")
+ (const_string "1")))
+ (set_attr "prefix" "maybe_vex")
+ (set_attr "mode" "TI")])
+
+(define_expand "roundv2sf2"
+ [(set (match_dup 3)
+ (plus:V2SF
+ (match_operand:V2SF 1 "register_operand")
+ (match_dup 2)))
+ (set (match_operand:V2SF 0 "register_operand")
+ (unspec:V2SF
+ [(match_dup 3) (match_dup 4)]
+ UNSPEC_ROUND))]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+{
+ const struct real_format *fmt;
+ REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+ rtx half, vec_half;
+
+ /* load nextafter (0.5, 0.0) */
+ fmt = REAL_MODE_FORMAT (SFmode);
+ real_2expN (&half_minus_pred_half, -(fmt->p) - 1, SFmode);
+ real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
+ half = const_double_from_real_value (pred_half, SFmode);
+
+ vec_half = ix86_build_const_vector (V2SFmode, true, half);
+ vec_half = force_reg (V2SFmode, vec_half);
+
+ operands[2] = gen_reg_rtx (V2SFmode);
+ emit_insn (gen_copysignv2sf3 (operands[2], vec_half, operands[1]));
+
+ operands[3] = gen_reg_rtx (V2SFmode);
+ operands[4] = GEN_INT (ROUND_TRUNC);
+})
+
+(define_expand "lroundv2sfv2si2"
+ [(match_operand:V2SI 0 "register_operand")
+ (match_operand:V2SF 1 "register_operand")]
+ "TARGET_SSE4_1 && !flag_trapping_math
+ && TARGET_MMX_WITH_SSE"
+{
+ rtx tmp = gen_reg_rtx (V2SFmode);
+ emit_insn (gen_roundv2sf2 (tmp, operands[1]));
+ emit_insn (gen_fix_truncv2sfv2si2 (operands[0], tmp));
+ DONE;
+})
+
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel half-precision floating point arithmetic
new file mode 100644
@@ -0,0 +1,77 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse4.1 -O2 -Ofast" } */
+/* { dg-final { scan-assembler-times "roundps" 9 } } */
+/* { dg-final { scan-assembler-times "cvtps2dq" 1 } } */
+/* { dg-final { scan-assembler-times "cvttps2dq" 3 } } */
+
+#include<math.h>
+
+void
+foo (float* p, float* __restrict q)
+{
+ p[0] = truncf (q[0]);
+ p[1] = truncf (q[1]);
+}
+
+void
+foo1 (float* p, float* __restrict q)
+{
+ p[0] = floorf (q[0]);
+ p[1] = floorf (q[1]);
+}
+
+void
+foo1i (int* p, float* __restrict q)
+{
+ p[0] = (int) floorf (q[0]);
+ p[1] = (int) floorf (q[1]);
+}
+
+void
+foo2 (float* p, float* __restrict q)
+{
+ p[0] = ceilf (q[0]);
+ p[1] = ceilf (q[1]);
+}
+
+void
+foo2i (int* p, float* __restrict q)
+{
+ p[0] = (int) ceilf (q[0]);
+ p[1] = (int) ceilf (q[1]);
+}
+
+void
+foo3 (float* p, float* __restrict q)
+{
+ p[0] = rintf (q[0]);
+ p[1] = rintf (q[1]);
+}
+
+void
+foo3i (int* p, float* __restrict q)
+{
+ p[0] = (int) rintf (q[0]);
+ p[1] = (int) rintf (q[1]);
+}
+
+void
+foo4 (float* p, float* __restrict q)
+{
+ p[0] = nearbyintf (q[0]);
+ p[1] = nearbyintf (q[1]);
+}
+
+void
+foo5(float* p, float* __restrict q)
+{
+ p[0] = roundf (q[0]);
+ p[1] = roundf (q[1]);
+}
+
+void
+foo5i(int* p, float* __restrict q)
+{
+ p[0] = (int) roundf (q[0]);
+ p[1] = (int) roundf (q[1]);
+}