Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'
Checks
Commit Message
False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
gcc/ChangeLog:
PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov<mode>_constm1_pternlog_false_dep): New
define_insn.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep):
Ditto.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Adjust to
define_insn_and_split to avoid false dependence.
(*<avx512>_cvtmask2<ssemodesuffix><mode>): Ditto.
(<mask_codefor>one_cmpl<mode>2<mask_name>): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot<mode>3): Ditto.
(iornot<mode>3): Ditto.
(*<nlogic><mode>3): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr110438.c: New test.
---
gcc/config/i386/predicates.md | 8 +-
gcc/config/i386/sse.md | 113 ++++++++++++++++++++---
gcc/testsuite/gcc.target/i386/pr110438.c | 30 ++++++
3 files changed, 135 insertions(+), 16 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c
Comments
On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
> False dependency happens when destination is only updated by
> pternlog. There is no false dependency when destination is also used
> in source. So either a pxor should be inserted, or input operand
> should be set with constraint '0'.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ready to push to trunk.
Shouldn't this patch also remove uses of vpternlog in
standard_sse_constant_opcode?
A couple more questions below:
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
> ]
> (symbol_ref "true")))])
>
> +; False dependency happens on destination register which is not really
> +; used when moving all ones to vector register
> +(define_split
> + [(set (match_operand:VMOVE 0 "register_operand")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> + "TARGET_AVX512F && reload_completed
> + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> + && optimize_function_for_speed_p (cfun)"
Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
Doesn't it work here as well?
> + [(set (match_dup 0) (match_dup 2))
> + (parallel
> + [(set (match_dup 0) (match_dup 1))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[2] = CONST0_RTX (<MODE>mode);")
> +
> +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> + [(set (match_operand:VMOVE 0 "register_operand" "=v")
> + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512VL || <MODE_SIZE> == 64"
> + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> + [(set_attr "type" "sselog1")
> + (set_attr "prefix" "evex")])
> +
> ;; If mem_addr points to a memory region with less than whole vector size bytes
> ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
> operands[3] = CONST0_RTX (<MODE>mode);
> }")
>
> -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> (vec_merge:VI48_AVX512VL
> (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> "@
> vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
> vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> + "&& !TARGET_AVX512DQ && reload_completed
> + && optimize_function_for_speed_p (cfun)"
> + [(set (match_dup 0) (match_dup 4))
> + (parallel
> + [(set (match_dup 0)
> + (vec_merge:VI48_AVX512VL
> + (match_dup 2)
> + (match_dup 3)
> + (match_dup 1)))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[4] = CONST0_RTX (<MODE>mode);"
> [(set_attr "isa" "avx512dq,*")
> (set_attr "length_immediate" "0,1")
> (set_attr "prefix" "evex")
> (set_attr "mode" "<sseinsnmode>")])
>
> +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> + (vec_merge:VI48_AVX512VL
> + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> + (match_operand:VI48_AVX512VL 3 "const0_operand")
> + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512F && !TARGET_AVX512DQ"
> + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> + [(set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set_attr "mode" "<sseinsnmode>")])
> +
> (define_expand "extendv2sfv2df2"
> [(set (match_operand:V2DF 0 "register_operand")
> (float_extend:V2DF
> @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
> operands[2] = force_reg (<MODE>mode, operands[2]);
> })
>
> -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> - [(set (match_operand:VI 0 "register_operand" "=v,v")
> - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> + [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
> + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
> "TARGET_AVX512F
> && (!<mask_applied>
> || <ssescalarmode>mode == SImode
> || <ssescalarmode>mode == DImode)"
> {
> + if (!<mask_applied> && which_alternative
> + && optimize_function_for_speed_p (cfun))
> + return "#";
> +
> if (TARGET_AVX512VL)
> return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> else
> return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> }
> + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> + && optimize_function_for_speed_p (cfun)"
> + [(set (match_dup 0) (match_dup 3))
> + (parallel
> + [(set (match_dup 0)
> + (xor:VI (match_dup 1) (match_dup 2)))
> + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> + "operands[3] = CONST0_RTX (<MODE>mode);"
Perhaps I'm misreading this, but this seems to result in
vpxor zmm0, zmm0
vpternlog zmm0, zmm0, [mem], 0x55
while in the PR the agreement was to emit
vmovdq? zmm0, [mem]
vpternlog zmm0, zmm0, zmm0, 0x55
when the source is in memory, because the former has three uops in fused domain?
> [(set_attr "type" "sselog")
> (set_attr "prefix" "evex")
> (set (attr "mode")
> @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_int 1)))])
>
> +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
> + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> + (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> + UNSPEC_INSN_FALSE_DEP)]
> + "TARGET_AVX512F"
> +{
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> + else
> + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> +}
> + [(set_attr "type" "sselog")
> + (set_attr "prefix" "evex")
> + (set (attr "mode")
> + (if_then_else (match_test "TARGET_AVX512VL")
> + (const_string "<sseinsnmode>")
> + (const_string "XI")))
> + (set (attr "enabled")
> + (if_then_else (eq_attr "alternative" "0")
> + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> + (const_int 1)))])
> +
> (define_split
> [(set (match_operand:VI48_AVX512F 0 "register_operand")
> (vec_duplicate:VI48_AVX512F
> @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
> (and:VI
> (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
> "TARGET_SSE
> && (register_operand (operands[1], <MODE>mode)
> || register_operand (operands[2], <MODE>mode))"
> @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> (ior:VI
> (not:VI
> - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
> + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "2,3")
> + (if_then_else (eq_attr "alternative" "0,1")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v")
> (not:VI
> (xor:VI
> - (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> + (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "1")
> + (if_then_else (eq_attr "alternative" "0")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> (define_insn "*<nlogic><mode>3"
> [(set (match_operand:VI 0 "register_operand" "=v,v")
> (andor:VI
> - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
> "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> && (register_operand (operands[1], <MODE>mode)
> @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
> (const_string "<sseinsnmode>")
> (const_string "XI")))
> (set (attr "enabled")
> - (if_then_else (eq_attr "alternative" "1")
> + (if_then_else (eq_attr "alternative" "0")
> (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> (const_string "*")))])
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> new file mode 100644
> index 00000000000..11b8cc59fd2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> +
> +
> +#include <immintrin.h>
> +
> +__m512i g(void)
> +{
> + return (__m512i){ 0 } - 1;
> +}
> +
> +__m512i g1(__m512i* a)
> +{
> + return ~(*a);
> +}
> +
> +void
> +foo (int* a, int* __restrict b)
> +{
> + for (int i = 0; i != 16; i++)
> + {
> + if (b[i])
> + a[i] = -1;
> + else
> + a[i] = 0;
> + }
> +}
>
On Tue, Jul 11, 2023 at 12:24 AM Alexander Monakov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On Mon, 10 Jul 2023, liuhongt via Gcc-patches wrote:
>
> > False dependency happens when destination is only updated by
> > pternlog. There is no false dependency when destination is also used
> > in source. So either a pxor should be inserted, or input operand
> > should be set with constraint '0'.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ready to push to trunk.
>
> Shouldn't this patch also remove uses of vpternlog in
> standard_sse_constant_opcode?
It's still needed when !optimize_function_for_speed_p (cfun).
>
> A couple more questions below:
>
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
> > ]
> > (symbol_ref "true")))])
> >
> > +; False dependency happens on destination register which is not really
> > +; used when moving all ones to vector register
> > +(define_split
> > + [(set (match_operand:VMOVE 0 "register_operand")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
> > + "TARGET_AVX512F && reload_completed
> > + && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
> > + && optimize_function_for_speed_p (cfun)"
>
> Yan's patch used optimize_insn_for_speed_p (), which looks more appropriate.
> Doesn't it work here as well?
I'm just aligned with lzcnt/popcnt case, the difference between
option_insn_for_speed_p and optimized_function_for_speed_p is the
former will consider
!crtl->maybe_hot_insn_p but the latter just returns
!optimize_function_for_size_p (cfun). It looks
optimize_insn_for_speed_p() is more reasonable for single insn.
350optimize_insn_for_size_p (void)
351{
352 enum optimize_size_level ret = optimize_function_for_size_p (cfun);
353 if (ret < OPTIMIZE_SIZE_BALANCED && !crtl->maybe_hot_insn_p)
354 ret = OPTIMIZE_SIZE_BALANCED;
355 return ret;
>
> > + [(set (match_dup 0) (match_dup 2))
> > + (parallel
> > + [(set (match_dup 0) (match_dup 1))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[2] = CONST0_RTX (<MODE>mode);")
> > +
> > +(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
> > + [(set (match_operand:VMOVE 0 "register_operand" "=v")
> > + (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
> > + (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512VL || <MODE_SIZE> == 64"
> > + "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
> > + [(set_attr "type" "sselog1")
> > + (set_attr "prefix" "evex")])
> > +
> > ;; If mem_addr points to a memory region with less than whole vector size bytes
> > ;; of accessible memory and k is a mask that would prevent reading the inaccessible
> > ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
> > @@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
> > operands[3] = CONST0_RTX (<MODE>mode);
> > }")
> >
> > -(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > +(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
> > (vec_merge:VI48_AVX512VL
> > (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > @@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
> > "@
> > vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
> > vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > + "&& !TARGET_AVX512DQ && reload_completed
> > + && optimize_function_for_speed_p (cfun)"
> > + [(set (match_dup 0) (match_dup 4))
> > + (parallel
> > + [(set (match_dup 0)
> > + (vec_merge:VI48_AVX512VL
> > + (match_dup 2)
> > + (match_dup 3)
> > + (match_dup 1)))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[4] = CONST0_RTX (<MODE>mode);"
> > [(set_attr "isa" "avx512dq,*")
> > (set_attr "length_immediate" "0,1")
> > (set_attr "prefix" "evex")
> > (set_attr "mode" "<sseinsnmode>")])
> >
> > +(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
> > + [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
> > + (vec_merge:VI48_AVX512VL
> > + (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
> > + (match_operand:VI48_AVX512VL 3 "const0_operand")
> > + (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
> > + (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512F && !TARGET_AVX512DQ"
> > + "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
> > + [(set_attr "length_immediate" "1")
> > + (set_attr "prefix" "evex")
> > + (set_attr "mode" "<sseinsnmode>")])
> > +
> > (define_expand "extendv2sfv2df2"
> > [(set (match_operand:V2DF 0 "register_operand")
> > (float_extend:V2DF
> > @@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
> > operands[2] = force_reg (<MODE>mode, operands[2]);
> > })
> >
> > -(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > - [(set (match_operand:VI 0 "register_operand" "=v,v")
> > - (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
> > - (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
> > +(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > + [(set (match_operand:VI 0 "register_operand" "=v,v,v")
> > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
> > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
> > "TARGET_AVX512F
> > && (!<mask_applied>
> > || <ssescalarmode>mode == SImode
> > || <ssescalarmode>mode == DImode)"
> > {
> > + if (!<mask_applied> && which_alternative
> > + && optimize_function_for_speed_p (cfun))
> > + return "#";
> > +
> > if (TARGET_AVX512VL)
> > return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> > else
> > return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> > }
> > + "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
> > + && optimize_function_for_speed_p (cfun)"
> > + [(set (match_dup 0) (match_dup 3))
> > + (parallel
> > + [(set (match_dup 0)
> > + (xor:VI (match_dup 1) (match_dup 2)))
> > + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
> > + "operands[3] = CONST0_RTX (<MODE>mode);"
>
> Perhaps I'm misreading this, but this seems to result in
>
> vpxor zmm0, zmm0
> vpternlog zmm0, zmm0, [mem], 0x55
>
I thought the first alternative (v,0,BC) would handle that, looks not,
i'll adjust the splitter to explicitly put operands[1] into
operands[0] when it's memory.
> while in the PR the agreement was to emit
>
> vmovdq? zmm0, [mem]
> vpternlog zmm0, zmm0, zmm0, 0x55
>
> when the source is in memory, because the former has three uops in fused domain?
>
> > [(set_attr "type" "sselog")
>
> > (set_attr "prefix" "evex")
> > (set (attr "mode")
> > @@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_int 1)))])
> >
> > +(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
> > + [(set (match_operand:VI 0 "register_operand" "=v,v")
> > + (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
> > + (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
> > + (unspec [(match_operand:VI 3 "register_operand" "0,0")]
> > + UNSPEC_INSN_FALSE_DEP)]
> > + "TARGET_AVX512F"
> > +{
> > + if (TARGET_AVX512VL)
> > + return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
> > + else
> > + return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
> > +}
> > + [(set_attr "type" "sselog")
> > + (set_attr "prefix" "evex")
> > + (set (attr "mode")
> > + (if_then_else (match_test "TARGET_AVX512VL")
> > + (const_string "<sseinsnmode>")
> > + (const_string "XI")))
> > + (set (attr "enabled")
> > + (if_then_else (eq_attr "alternative" "0")
> > + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > + (const_int 1)))])
> > +
> > (define_split
> > [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > (vec_duplicate:VI48_AVX512F
> > @@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
> > (and:VI
> > (not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
> > - (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
> > + (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
> > "TARGET_SSE
> > && (register_operand (operands[1], <MODE>mode)
> > || register_operand (operands[2], <MODE>mode))"
> > @@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> > (ior:VI
> > (not:VI
> > - (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> > - (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> > + (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
> > + (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "2,3")
> > + (if_then_else (eq_attr "alternative" "0,1")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > @@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v")
> > (not:VI
> > (xor:VI
> > - (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> > - (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > + (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
> > + (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "1")
> > + (if_then_else (eq_attr "alternative" "0")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > @@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > (define_insn "*<nlogic><mode>3"
> > [(set (match_operand:VI 0 "register_operand" "=v,v")
> > (andor:VI
> > - (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > - (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
> > + (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
> > "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> > || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> > && (register_operand (operands[1], <MODE>mode)
> > @@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
> > (const_string "<sseinsnmode>")
> > (const_string "XI")))
> > (set (attr "enabled")
> > - (if_then_else (eq_attr "alternative" "1")
> > + (if_then_else (eq_attr "alternative" "0")
> > (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> > (const_string "*")))])
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr110438.c b/gcc/testsuite/gcc.target/i386/pr110438.c
> > new file mode 100644
> > index 00000000000..11b8cc59fd2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr110438.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
> > +/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
> > +/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
> > +
> > +
> > +#include <immintrin.h>
> > +
> > +__m512i g(void)
> > +{
> > + return (__m512i){ 0 } - 1;
> > +}
> > +
> > +__m512i g1(__m512i* a)
> > +{
> > + return ~(*a);
> > +}
> > +
> > +void
> > +foo (int* a, int* __restrict b)
> > +{
> > + for (int i = 0; i != 16; i++)
> > + {
> > + if (b[i])
> > + a[i] = -1;
> > + else
> > + a[i] = 0;
> > + }
> > +}
> >
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
return false;
})
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
(define_predicate "vector_all_ones_operand"
(and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+ (ior (match_operand 0 "vector_all_ones_operand")
+ (match_operand 0 "float_vector_all_ones_operand")
+ (match_test "op == constm1_rtx")))
+
/* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit. */
(define_predicate "vector_all_ones_zero_extend_half_operand"
@@ -1382,6 +1382,29 @@ (define_insn "mov<mode>_internal"
]
(symbol_ref "true")))])
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+ [(set (match_operand:VMOVE 0 "register_operand")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+ "TARGET_AVX512F && reload_completed
+ && (<MODE_SIZE> == 64 || EXT_REX_SSE_REG_P (operands[0]))
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 2))
+ (parallel
+ [(set (match_dup 0) (match_dup 1))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[2] = CONST0_RTX (<MODE>mode);")
+
+(define_insn "*vmov<mode>_constm1_pternlog_false_dep"
+ [(set (match_operand:VMOVE 0 "register_operand" "=v")
+ (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" "<sseconstm1>"))
+ (unspec [(match_operand:VMOVE 2 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512VL || <MODE_SIZE> == 64"
+ "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+ [(set_attr "type" "sselog1")
+ (set_attr "prefix" "evex")])
+
;; If mem_addr points to a memory region with less than whole vector size bytes
;; of accessible memory and k is a mask that would prevent reading the inaccessible
;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "<avx512>_cvtmask2<ssemodesuffix><mode>"
operands[3] = CONST0_RTX (<MODE>mode);
}")
-(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
+(define_insn_and_split "*<avx512>_cvtmask2<ssemodesuffix><mode>"
[(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
(match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>"
"@
vpmovm2<ssemodesuffix>\t{%1, %0|%0, %1}
vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ "&& !TARGET_AVX512DQ && reload_completed
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 4))
+ (parallel
+ [(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+ (match_dup 2)
+ (match_dup 3)
+ (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[4] = CONST0_RTX (<MODE>mode);"
[(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*<avx512>_cvtmask2<ssemodesuffix><mode>_pternlog_false_dep"
+ [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+ (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand" "Yk")))
+ (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F && !TARGET_AVX512DQ"
+ "vpternlog<ssemodesuffix>\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, %0, 0x81}"
+ [(set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "extendv2sfv2df2"
[(set (match_operand:V2DF 0 "register_operand")
(float_extend:V2DF
@@ -17166,20 +17213,32 @@ (define_expand "one_cmpl<mode>2"
operands[2] = force_reg (<MODE>mode, operands[2]);
})
-(define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
- [(set (match_operand:VI 0 "register_operand" "=v,v")
- (xor:VI (match_operand:VI 1 "bcst_vector_operand" "vBr,m")
- (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))]
+(define_insn_and_split "<mask_codefor>one_cmpl<mode>2<mask_name>"
+ [(set (match_operand:VI 0 "register_operand" "=v,v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" " 0, m,Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC,BC")))]
"TARGET_AVX512F
&& (!<mask_applied>
|| <ssescalarmode>mode == SImode
|| <ssescalarmode>mode == DImode)"
{
+ if (!<mask_applied> && which_alternative
+ && optimize_function_for_speed_p (cfun))
+ return "#";
+
if (TARGET_AVX512VL)
return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
else
return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
}
+ "&& reload_completed && !REG_P (operands[1]) && !<mask_applied>
+ && optimize_function_for_speed_p (cfun)"
+ [(set (match_dup 0) (match_dup 3))
+ (parallel
+ [(set (match_dup 0)
+ (xor:VI (match_dup 1) (match_dup 2)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+ "operands[3] = CONST0_RTX (<MODE>mode);"
[(set_attr "type" "sselog")
(set_attr "prefix" "evex")
(set (attr "mode")
@@ -17191,6 +17250,30 @@ (define_insn "<mask_codefor>one_cmpl<mode>2<mask_name>"
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_int 1)))])
+(define_insn "*one_cmpl<mode>2_pternlog_false_dep"
+ [(set (match_operand:VI 0 "register_operand" "=v,v")
+ (xor:VI (match_operand:VI 1 "bcst_vector_operand" "m, Br")
+ (match_operand:VI 2 "vector_all_ones_operand" "BC,BC")))
+ (unspec [(match_operand:VI 3 "register_operand" "0,0")]
+ UNSPEC_INSN_FALSE_DEP)]
+ "TARGET_AVX512F"
+{
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0x55, %1, %0, %0<mask_operand3>|%0<mask_operand3>, %0, %1, 0x55}";
+ else
+ return "vpternlog<ternlogsuffix>\t{$0x55, %g1, %g0, %g0<mask_operand3>|%g0<mask_operand3>, %g0, %g1, 0x55}";
+}
+ [(set_attr "type" "sselog")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "0")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_int 1)))])
+
(define_split
[(set (match_operand:VI48_AVX512F 0 "register_operand")
(vec_duplicate:VI48_AVX512F
@@ -17226,7 +17309,7 @@ (define_insn "*andnot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=x,x,v,v,v")
(and:VI
(not:VI (match_operand:VI 1 "bcst_vector_operand" "0,x,v,m,Br"))
- (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,v,v")))]
+ (match_operand:VI 2 "bcst_vector_operand" "xBm,xm,vmBr,0,0")))]
"TARGET_SSE
&& (register_operand (operands[1], <MODE>mode)
|| register_operand (operands[2], <MODE>mode))"
@@ -17685,8 +17768,8 @@ (define_insn "*iornot<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
(ior:VI
(not:VI
- (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
- (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+ (match_operand:VI 1 "bcst_vector_operand" "0,m, 0,vBr"))
+ (match_operand:VI 2 "bcst_vector_operand" "m,0,vBr, 0")))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17710,7 +17793,7 @@ (define_insn "*iornot<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "2,3")
+ (if_then_else (eq_attr "alternative" "0,1")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17718,8 +17801,8 @@ (define_insn "*xnor<mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(not:VI
(xor:VI
- (match_operand:VI 1 "bcst_vector_operand" "%v,v")
- (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (match_operand:VI 1 "bcst_vector_operand" "%0, 0")
+ (match_operand:VI 2 "bcst_vector_operand" " m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17738,7 +17821,7 @@ (define_insn "*xnor<mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
@@ -17749,8 +17832,8 @@ (define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
(define_insn "*<nlogic><mode>3"
[(set (match_operand:VI 0 "register_operand" "=v,v")
(andor:VI
- (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
- (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ (not:VI (match_operand:VI 1 "bcst_vector_operand" "%0, 0"))
+ (not:VI (match_operand:VI 2 "bcst_vector_operand" "m,vBr"))))]
"(<MODE_SIZE> == 64 || TARGET_AVX512VL
|| (TARGET_AVX512F && !TARGET_PREFER_AVX256))
&& (register_operand (operands[1], <MODE>mode)
@@ -17769,7 +17852,7 @@ (define_insn "*<nlogic><mode>3"
(const_string "<sseinsnmode>")
(const_string "XI")))
(set (attr "enabled")
- (if_then_else (eq_attr "alternative" "1")
+ (if_then_else (eq_attr "alternative" "0")
(symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
(const_string "*")))])
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2 -ftree-vectorize -mno-avx512dq -dp -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {cvtmask2.*_pternlog} "1" } } */
+/* { dg-final { scan-assembler-times {constm1_pternlog} "1" } } */
+/* { dg-final { scan-assembler-not {(?n)vpternlogd.*\(} } } */
+
+
+#include <immintrin.h>
+
+__m512i g(void)
+{
+ return (__m512i){ 0 } - 1;
+}
+
+__m512i g1(__m512i* a)
+{
+ return ~(*a);
+}
+
+void
+foo (int* a, int* __restrict b)
+{
+ for (int i = 0; i != 16; i++)
+ {
+ if (b[i])
+ a[i] = -1;
+ else
+ a[i] = 0;
+ }
+}