i386: Improve code generation for vector __builtin_signbit (x.x[i]) ? -1 : 0 [PR112816]
Checks
Commit Message
Hi!
On the testcase I've recently fixed I've noticed bad code generation,
we emit
pxor %xmm1, %xmm1
psrld $31, %xmm0
pcmpeqd %xmm1, %xmm0
pcmpeqd %xmm1, %xmm0
or
vpxor %xmm1, %xmm1, %xmm1
vpsrld $31, %xmm0, %xmm0
vpcmpeqd %xmm1, %xmm0, %xmm0
vpcmpeqd %xmm1, %xmm0, %xmm2
rather than
psrad $31, %xmm2
or
vpsrad $31, %xmm1, %xmm2
The following patch fixes that using a combiner splitter.
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2023-12-04 Jakub Jelinek <jakub@redhat.com>
PR target/112816
* config/i386/sse.md ((eq (eq (lshiftrt x elt_bits-1) 0) 0)): New
splitter to turn psrld $31; pcmpeq; pcmpeq into psrad $31.
* gcc.target/i386/pr112816.c: New test.
Jakub
Comments
> -----Original Message-----
> From: Jakub Jelinek <jakub@redhat.com>
> Sent: Tuesday, December 5, 2023 3:01 PM
> To: Uros Bizjak <ubizjak@gmail.com>; Liu, Hongtao <hongtao.liu@intel.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: [PATCH] i386: Improve code generation for vector __builtin_signbit
> (x.x[i]) ? -1 : 0 [PR112816]
>
> Hi!
>
> On the testcase I've recently fixed I've noticed bad code generation, we emit
> pxor %xmm1, %xmm1
> psrld $31, %xmm0
> pcmpeqd %xmm1, %xmm0
> pcmpeqd %xmm1, %xmm0
> or
> vpxor %xmm1, %xmm1, %xmm1
> vpsrld $31, %xmm0, %xmm0
> vpcmpeqd %xmm1, %xmm0, %xmm0
> vpcmpeqd %xmm1, %xmm0, %xmm2
> rather than
> psrad $31, %xmm2
> or
> vpsrad $31, %xmm1, %xmm2
> The following patch fixes that using a combiner splitter.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Ok.
>
> 2023-12-04 Jakub Jelinek <jakub@redhat.com>
>
> PR target/112816
> * config/i386/sse.md ((eq (eq (lshiftrt x elt_bits-1) 0) 0)): New
> splitter to turn psrld $31; pcmpeq; pcmpeq into psrad $31.
>
> * gcc.target/i386/pr112816.c: New test.
>
> --- gcc/config/i386/sse.md.jj 2023-12-04 09:00:12.722437462 +0100
> +++ gcc/config/i386/sse.md 2023-12-04 13:22:38.565833465 +0100
> @@ -16614,6 +16614,18 @@ (define_insn_and_split "*ashrv1ti3_inter
> DONE;
> })
>
> +(define_split
> + [(set (match_operand:VI248_AVX2 0 "register_operand")
> + (eq:VI248_AVX2
> + (eq:VI248_AVX2
> + (lshiftrt:VI248_AVX2
> + (match_operand:VI248_AVX2 1 "register_operand")
> + (match_operand:SI 2 "const_int_operand"))
> + (match_operand:VI248_AVX2 3 "const0_operand"))
> + (match_operand:VI248_AVX2 4 "const0_operand")))]
> + "INTVAL (operands[2]) == GET_MODE_PRECISION (<ssescalarmode>mode) - 1"
> + [(set (match_dup 0) (ashiftrt:VI248_AVX2 (match_dup 1) (match_dup 2)))])
> +
> (define_expand "rotlv1ti3"
> [(set (match_operand:V1TI 0 "register_operand")
> (rotate:V1TI
> --- gcc/testsuite/gcc.target/i386/pr112816.c.jj 2023-12-04 13:31:51.215061445 +0100
> +++ gcc/testsuite/gcc.target/i386/pr112816.c 2023-12-04 13:34:14.008053097 +0100
> @@ -0,0 +1,27 @@
> +/* PR target/112816 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mno-avx512f -masm=att" } */
> +/* { dg-final { scan-assembler-times "psrad\t\\\$31," 2 } } */
> +/* { dg-final { scan-assembler-not "pcmpeqd\t" } } */
> +
> +#define N 4
> +struct S { float x[N]; };
> +struct T { int x[N]; };
> +
> +__attribute__((target ("no-sse3,sse2"))) struct T
> +foo (struct S x)
> +{
> + struct T res;
> + for (int i = 0; i < N; ++i)
> + res.x[i] = __builtin_signbit (x.x[i]) ? -1 : 0;
> + return res;
> +}
> +
> +__attribute__((target ("avx2"))) struct T
> +bar (struct S x)
> +{
> + struct T res;
> + for (int i = 0; i < N; ++i)
> + res.x[i] = __builtin_signbit (x.x[i]) ? -1 : 0;
> + return res;
> +}
>
> Jakub
@@ -16614,6 +16614,18 @@ (define_insn_and_split "*ashrv1ti3_inter
DONE;
})
+(define_split
+ [(set (match_operand:VI248_AVX2 0 "register_operand")
+ (eq:VI248_AVX2
+ (eq:VI248_AVX2
+ (lshiftrt:VI248_AVX2
+ (match_operand:VI248_AVX2 1 "register_operand")
+ (match_operand:SI 2 "const_int_operand"))
+ (match_operand:VI248_AVX2 3 "const0_operand"))
+ (match_operand:VI248_AVX2 4 "const0_operand")))]
+ "INTVAL (operands[2]) == GET_MODE_PRECISION (<ssescalarmode>mode) - 1"
+ [(set (match_dup 0) (ashiftrt:VI248_AVX2 (match_dup 1) (match_dup 2)))])
+
(define_expand "rotlv1ti3"
[(set (match_operand:V1TI 0 "register_operand")
(rotate:V1TI
@@ -0,0 +1,27 @@
+/* PR target/112816 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-avx512f -masm=att" } */
+/* { dg-final { scan-assembler-times "psrad\t\\\$31," 2 } } */
+/* { dg-final { scan-assembler-not "pcmpeqd\t" } } */
+
+#define N 4
+struct S { float x[N]; };
+struct T { int x[N]; };
+
+__attribute__((target ("no-sse3,sse2"))) struct T
+foo (struct S x)
+{
+ struct T res;
+ for (int i = 0; i < N; ++i)
+ res.x[i] = __builtin_signbit (x.x[i]) ? -1 : 0;
+ return res;
+}
+
+__attribute__((target ("avx2"))) struct T
+bar (struct S x)
+{
+ struct T res;
+ for (int i = 0; i < N; ++i)
+ res.x[i] = __builtin_signbit (x.x[i]) ? -1 : 0;
+ return res;
+}