[1/5] x86: use VPTERNLOG for further bitwise two-vector operations
Checks
Commit Message
All combinations of and, ior, xor, and not involving two operands can be
expressed that way in a single insn.
gcc/
PR target/93768
* config/i386/i386.cc (ix86_rtx_costs): Further special-case
bitwise vector operations.
* config/i386/sse.md (*iornot<mode>3): New insn.
(*xnor<mode>3): Likewise.
(*<nlogic><mode>3): Likewise.
(andor): New code iterator.
(nlogic): New code attribute.
(ternlog_nlogic): Likewise.
gcc/testsuite/
PR target/93768
* gcc.target/i386/avx512-binop-not-1.h: New.
* gcc.target/i386/avx512-binop-not-2.h: New.
* gcc.target/i386/avx512f-orn-si-zmm-1.c: New test.
* gcc.target/i386/avx512f-orn-si-zmm-2.c: New test.
---
The use of VI matches that in e.g. one_cmpl<mode>2 /
<mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite
(here and there)
- V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled,
- V<n>TI not being covered,
- vector modes more narrow than 16 bytes not being covered.
Comments
On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> All combinations of and, ior, xor, and not involving two operands can be
> expressed that way in a single insn.
>
> gcc/
>
> PR target/93768
> * config/i386/i386.cc (ix86_rtx_costs): Further special-case
> bitwise vector operations.
> * config/i386/sse.md (*iornot<mode>3): New insn.
> (*xnor<mode>3): Likewise.
> (*<nlogic><mode>3): Likewise.
> (andor): New code iterator.
> (nlogic): New code attribute.
> (ternlog_nlogic): Likewise.
>
> gcc/testsuite/
>
> PR target/93768
> gcc.target/i386/avx512-binop-not-1.h: New.
> gcc.target/i386/avx512-binop-not-2.h: New.
> gcc.target/i386/avx512f-orn-si-zmm-1.c: New test.
> gcc.target/i386/avx512f-orn-si-zmm-2.c: New test.
> ---
> The use of VI matches that in e.g. one_cmpl<mode>2 /
> <mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite
> (here and there)
> - V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled,
> - V<n>TI not being covered,
> - vector modes more narrow than 16 bytes not being covered.
>
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode
> return false;
>
> case IOR:
> + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> + {
> + /* (ior (not ...) ...) can be a single insn in AVX512. */
> + if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
> + && (GET_MODE_SIZE (mode) == 64
> + || (TARGET_AVX512VL
> + && (GET_MODE_SIZE (mode) == 32
> + || GET_MODE_SIZE (mode) == 16))))
> + {
> + rtx right = GET_CODE (XEXP (x, 1)) != NOT
> + ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0);
> +
> + *total = ix86_vec_cost (mode, cost->sse_op)
> + + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
> + outer_code, opno, speed)
> + + rtx_cost (right, mode, outer_code, opno, speed);
> + return true;
> + }
> + *total = ix86_vec_cost (mode, cost->sse_op);
> + }
> + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> + *total = cost->add * 2;
> + else
> + *total = cost->add;
> + return false;
> +
> case XOR:
> if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> *total = ix86_vec_cost (mode, cost->sse_op);
> @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode
> /* pandn is a single instruction. */
> if (GET_CODE (XEXP (x, 0)) == NOT)
> {
> + rtx right = XEXP (x, 1);
> +
> + /* (and (not ...) (not ...)) can be a single insn in AVX512. */
> + if (GET_CODE (right) == NOT && TARGET_AVX512F
> + && (GET_MODE_SIZE (mode) == 64
> + || (TARGET_AVX512VL
> + && (GET_MODE_SIZE (mode) == 32
> + || GET_MODE_SIZE (mode) == 16))))
> + right = XEXP (right, 0);
> +
> *total = ix86_vec_cost (mode, cost->sse_op)
> + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
> outer_code, opno, speed)
> - + rtx_cost (XEXP (x, 1), mode,
> - outer_code, opno, speed);
> + + rtx_cost (right, mode, outer_code, opno, speed);
> return true;
> }
> else if (GET_CODE (XEXP (x, 1)) == NOT)
> @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode
>
> case NOT:
> if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> - // vnot is pxor -1.
> - *total = ix86_vec_cost (mode, cost->sse_op) + 1;
> + {
> + /* (not (xor ...)) can be a single insn in AVX512. */
> + if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
> + && (GET_MODE_SIZE (mode) == 64
> + || (TARGET_AVX512VL
> + && (GET_MODE_SIZE (mode) == 32
> + || GET_MODE_SIZE (mode) == 16))))
> + {
> + *total = ix86_vec_cost (mode, cost->sse_op)
> + + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
> + outer_code, opno, speed)
> + + rtx_cost (XEXP (XEXP (x, 0), 1), mode,
> + outer_code, opno, speed);
> + return true;
> + }
> +
> + // vnot is pxor -1.
> + *total = ix86_vec_cost (mode, cost->sse_op) + 1;
> + }
> else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> *total = cost->add * 2;
> else
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17616,6 +17616,98 @@
> operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
> })
>
> +(define_insn "*iornot<mode>3"
> + [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> + (ior:VI
> + (not:VI
> + (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> + (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> + "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> + && (register_operand (operands[1], <MODE>mode)
> + || register_operand (operands[2], <MODE>mode))"
> +{
> + if (!register_operand (operands[1], <MODE>mode))
> + {
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}";
> + return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}";
> + }
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}";
> + return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}";
> +}
> + [(set_attr "type" "sselog")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set (attr "mode")
> + (if_then_else (match_test "TARGET_AVX512VL")
> + (const_string "<sseinsnmode>")
> + (const_string "XI")))
> + (set (attr "enabled")
> + (if_then_else (eq_attr "alternative" "2,3")
> + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> + (const_string "*")))])
> +
> +(define_insn "*xnor<mode>3"
> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> + (not:VI
> + (xor:VI
> + (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> + (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> + "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> + && (register_operand (operands[1], <MODE>mode)
> + || register_operand (operands[2], <MODE>mode))"
> +{
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}";
> + else
> + return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}";
> +}
> + [(set_attr "type" "sselog")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set (attr "mode")
> + (if_then_else (match_test "TARGET_AVX512VL")
> + (const_string "<sseinsnmode>")
> + (const_string "XI")))
> + (set (attr "enabled")
> + (if_then_else (eq_attr "alternative" "1")
> + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> + (const_string "*")))])
> +
> +(define_code_iterator andor [and ior])
> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> +
> +(define_insn "*<nlogic><mode>3"
> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> + (andor:VI
> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
(and (not op1) (not op2)) -> (not (ior op1 op2))
(ior (not op1) (not op2)) -> (not (and op1 op2))
Even w/o avx512f, the transformation should also be beneficial since it
takes fewer logic operations, 3 -> 2 (or 2 -> 2 for pandn).
The other two patterns, *xnor<mode>3 and *iornot<mode>3, LGTM.
> + "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> + || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> + && (register_operand (operands[1], <MODE>mode)
> + || register_operand (operands[2], <MODE>mode))"
> +{
> + if (TARGET_AVX512VL)
> + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}";
> + else
> + return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}";
> +}
> + [(set_attr "type" "sselog")
> + (set_attr "length_immediate" "1")
> + (set_attr "prefix" "evex")
> + (set (attr "mode")
> + (if_then_else (match_test "TARGET_AVX512VL")
> + (const_string "<sseinsnmode>")
> + (const_string "XI")))
> + (set (attr "enabled")
> + (if_then_else (eq_attr "alternative" "1")
> + (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> + (const_string "*")))])
> +
> (define_mode_iterator AVX512ZEXTMASK
> [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
>
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h
> @@ -0,0 +1,13 @@
> +#include <immintrin.h>
> +
> +#define PASTER2(x,y) x##y
> +#define PASTER3(x,y,z) _mm##x##_##y##_##z
> +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix)
> +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val)
> +
> +type
> +foo (type x, SCALAR *f)
> +{
> + return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f),
> + DUP (vec, suffix, ~0)));
> +}
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h
> @@ -0,0 +1,13 @@
> +#include <immintrin.h>
> +
> +#define PASTER2(x,y) x##y
> +#define PASTER3(x,y,z) _mm##x##_##y##_##z
> +#define OP(vec, op, suffix) PASTER3 (vec, op, suffix)
> +#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val)
> +
> +type
> +foo (type x, SCALAR *f)
> +{
> + return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)),
> + DUP (vec, suffix, *f));
> +}
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vpbroadcast" } } */
> +
> +#define type __m512i
> +#define vec 512
> +#define op or
> +#define suffix epi32
> +#define SCALAR int
> +
> +#include "avx512-binop-not-1.h"
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vpbroadcast" } } */
> +
> +#define type __m512i
> +#define vec 512
> +#define op or
> +#define suffix epi32
> +#define SCALAR int
> +
> +#include "avx512-binop-not-2.h"
>
On 25.06.2023 06:42, Hongtao Liu wrote:
> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> +(define_code_iterator andor [and ior])
>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
>> +
>> +(define_insn "*<nlogic><mode>3"
>> + [(set (match_operand:VI 0 "register_operand" "=v,v")
>> + (andor:VI
>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
This wouldn't be a win (not + andn) -> (or + not), but what's
more important is ...
> (ior (not op1) (not op2)) -> (not : (and op1 op2))
>
> Even w/o avx512f, the transformation should also benefit since it
> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
... that these transformations (from the, as per the doc,
canonical representation of nand and nor) are already occurring
in common code, _if_ no suitable insn can be found. That was at
least the conclusion I drew from looking around a lot, supported
by the code that's generated prior to this change.
Jan
On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 06:42, Hongtao Liu wrote:
> > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> +(define_code_iterator andor [and ior])
> >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> >> +
> >> +(define_insn "*<nlogic><mode>3"
> >> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> >> + (andor:VI
> >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
>
> This wouldn't be a win (not + andn) -> (or + not), but what's
> more important is ...
>
> > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> >
> > Even w/o avx512f, the transformation should also benefit since it
> > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
>
> ... that these transformations (from the, as per the doc,
> canonical representation of nand and nor) are already occurring
I see, there are already such simplifications in the gimple phase, so
the question is: is there any need for the and/ior-of-two-nots pattern?
Can you provide a testcase demonstrating that the and/ior-of-two-nots
pattern is needed?
> in common code, _if_ no suitable insn can be found. That was at
> least the conclusion I drew from looking around a lot, supported
> by the code that's generated prior to this change.
>
> Jan
On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> >
> > On 25.06.2023 06:42, Hongtao Liu wrote:
> > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> +(define_code_iterator andor [and ior])
> > >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > >> +
> > >> +(define_insn "*<nlogic><mode>3"
> > >> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> > >> + (andor:VI
> > >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > > (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
> >
> > This wouldn't be a win (not + andn) -> (or + not), but what's
> > more important is ...
> >
> > > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> > >
> > > Even w/o avx512f, the transformation should also benefit since it
> > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> >
> > ... that these transformations (from the, as per the doc,
> > canonical representation of nand and nor) are already occurring
> I see, there're already such simplifications in the gimple phase, so
> the question: is there any need for and/ior:not not pattern?
> Can you provide a testcase to demonstrate that and/ior: not not
> pattern is needed?
typedef int v4si __attribute__((vector_size(16)));
v4si
foo1 (v4si a, v4si b)
{
return ~a & ~b;
}
I see gimple has optimized it to
<bb 2> [local count: 1073741824]:
# DEBUG BEGIN_STMT
_1 = a_2(D) | b_3(D);
_4 = ~_1;
return _4;
But rtl still try to match
(set (reg:V4SI 86)
(and:V4SI (not:V4SI (reg:V4SI 88))
(not:V4SI (reg:V4SI 89))))
Hmm.
> > in common code, _if_ no suitable insn can be found. That was at
> > least the conclusion I drew from looking around a lot, supported
> > by the code that's generated prior to this change.
> >
> > Jan
>
>
>
> --
> BR,
> Hongtao
On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> > >
> > > On 25.06.2023 06:42, Hongtao Liu wrote:
> > > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > >>
> > > >> +(define_code_iterator andor [and ior])
> > > >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> > > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > > >> +
> > > >> +(define_insn "*<nlogic><mode>3"
> > > >> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> > > >> + (andor:VI
> > > >> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > > >> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > > > (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
> > >
> > > This wouldn't be a win (not + andn) -> (or + not), but what's
> > > more important is ...
> > >
> > > > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> > > >
> > > > Even w/o avx512f, the transformation should also benefit since it
> > > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> > >
> > > ... that these transformations (from the, as per the doc,
> > > canonical representation of nand and nor) are already occurring
> > I see, there're already such simplifications in the gimple phase, so
> > the question: is there any need for and/ior:not not pattern?
> > Can you provide a testcase to demonstrate that and/ior: not not
> > pattern is needed?
>
> typedef int v4si __attribute__((vector_size(16)));
> v4si
> foo1 (v4si a, v4si b)
> {
> return ~a & ~b;
> }
>
> I only gimple have optimized it to
>
> <bb 2> [local count: 1073741824]:
> # DEBUG BEGIN_STMT
> _1 = a_2(D) | b_3(D);
> _4 = ~_1;
> return _4;
>
>
> But rtl still try to match
>
> (set (reg:V4SI 86)
> (and:V4SI (not:V4SI (reg:V4SI 88))
> (not:V4SI (reg:V4SI 89))))
>
> Hmm.
In rtl, we're using xor -1 for not, so it's
(insn 8 7 9 2 (set (reg:V4SI 87)
(ior:V4SI (reg:V4SI 88)
(reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
(expr_list:REG_DEAD (reg:V4SI 89)
(expr_list:REG_DEAD (reg:V4SI 88)
(nil))))
(insn 9 8 14 2 (set (reg:V4SI 86)
(xor:V4SI (reg:V4SI 87)
(const_vector:V4SI [
(const_int -1 [0xffffffffffffffff]) repeated x4
]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}
Then simplified to
> (set (reg:V4SI 86)
> (and:V4SI (not:V4SI (reg:V4SI 88))
> (not:V4SI (reg:V4SI 89))))
>
by
3565 case XOR:
3566 if (trueop1 == CONST0_RTX (mode))
3567 return op0;
3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
3569 return simplify_gen_unary (NOT, mode, op0, mode);
and
1018 /* Apply De Morgan's laws to reduce number of patterns for machines
1019 with negating logical insns (and-not, nand, etc.). If result has
1020 only one NOT, put it first, since that is how the patterns are
1021 coded. */
1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
1023 {
1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
1025 machine_mode op_mode;
1026
1027 op_mode = GET_MODE (in1);
1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
1029
1030 op_mode = GET_MODE (in2);
1031 if (op_mode == VOIDmode)
1032 op_mode = mode;
1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
1034
1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
1036 std::swap (in1, in2);
1037
1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
1039 mode, in1, in2);
1040 }
Ok, got it, and/ior:not not pattern LGTM then.
> > > in common code, _if_ no suitable insn can be found. That was at
> > > least the conclusion I drew from looking around a lot, supported
> > > by the code that's generated prior to this change.
> > >
> > > Jan
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao
On 25.06.2023 09:30, Hongtao Liu wrote:
> On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
>>
>> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>>>
>>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>
>>>> On 25.06.2023 06:42, Hongtao Liu wrote:
>>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>
>>>>>> +(define_code_iterator andor [and ior])
>>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
>>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
>>>>>> +
>>>>>> +(define_insn "*<nlogic><mode>3"
>>>>>> + [(set (match_operand:VI 0 "register_operand" "=v,v")
>>>>>> + (andor:VI
>>>>>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
>>>>>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
>>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
>>>>> (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
>>>>
>>>> This wouldn't be a win (not + andn) -> (or + not), but what's
>>>> more important is ...
>>>>
>>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2))
>>>>>
>>>>> Even w/o avx512f, the transformation should also benefit since it
>>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
>>>>
>>>> ... that these transformations (from the, as per the doc,
>>>> canonical representation of nand and nor) are already occurring
>>> I see, there're already such simplifications in the gimple phase, so
>>> the question: is there any need for and/ior:not not pattern?
>>> Can you provide a testcase to demonstrate that and/ior: not not
>>> pattern is needed?
>>
>> typedef int v4si __attribute__((vector_size(16)));
>> v4si
>> foo1 (v4si a, v4si b)
>> {
>> return ~a & ~b;
>> }
>>
>> I only gimple have optimized it to
>>
>> <bb 2> [local count: 1073741824]:
>> # DEBUG BEGIN_STMT
>> _1 = a_2(D) | b_3(D);
>> _4 = ~_1;
>> return _4;
>>
>>
>> But rtl still try to match
>>
>> (set (reg:V4SI 86)
>> (and:V4SI (not:V4SI (reg:V4SI 88))
>> (not:V4SI (reg:V4SI 89))))
>>
>> Hmm.
> In rtl, we're using xor -1 for not, so it's
>
> (insn 8 7 9 2 (set (reg:V4SI 87)
> (ior:V4SI (reg:V4SI 88)
> (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
> (expr_list:REG_DEAD (reg:V4SI 89)
> (expr_list:REG_DEAD (reg:V4SI 88)
> (nil))))
> (insn 9 8 14 2 (set (reg:V4SI 86)
> (xor:V4SI (reg:V4SI 87)
> (const_vector:V4SI [
> (const_int -1 [0xffffffffffffffff]) repeated x4
> ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}
>
> Then simplified to
>> (set (reg:V4SI 86)
>> (and:V4SI (not:V4SI (reg:V4SI 88))
>> (not:V4SI (reg:V4SI 89))))
>>
>
> by
>
> 3565 case XOR:
> 3566 if (trueop1 == CONST0_RTX (mode))
> 3567 return op0;
> 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
> 3569 return simplify_gen_unary (NOT, mode, op0, mode);
>
> and
>
> 1018 /* Apply De Morgan's laws to reduce number of patterns for machines
> 1019 with negating logical insns (and-not, nand, etc.). If result has
> 1020 only one NOT, put it first, since that is how the patterns are
> 1021 coded. */
> 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
> 1023 {
> 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
> 1025 machine_mode op_mode;
> 1026
> 1027 op_mode = GET_MODE (in1);
> 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
> 1029
> 1030 op_mode = GET_MODE (in2);
> 1031 if (op_mode == VOIDmode)
> 1032 op_mode = mode;
> 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
> 1034
> 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
> 1036 std::swap (in1, in2);
> 1037
> 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
> 1039 mode, in1, in2);
> 1040 }
>
>
> Ok, got it, and/ior:not not pattern LGTM then.
Just to avoid misunderstandings - together with your initial
reply that's then an "okay" to the patch as a whole, right?
Thanks, Jan
On Sun, Jun 25, 2023 at 9:35 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 09:30, Hongtao Liu wrote:
> > On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >>
> >> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >>>
> >>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>
> >>>> On 25.06.2023 06:42, Hongtao Liu wrote:
> >>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> >>>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>>>
> >>>>>> +(define_code_iterator andor [and ior])
> >>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> >>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> >>>>>> +
> >>>>>> +(define_insn "*<nlogic><mode>3"
> >>>>>> + [(set (match_operand:VI 0 "register_operand" "=v,v")
> >>>>>> + (andor:VI
> >>>>>> + (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> >>>>>> + (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> >>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> >>>>> (and (not op1)) (not op2)) -> (not: (ior: op1 op2))
> >>>>
> >>>> This wouldn't be a win (not + andn) -> (or + not), but what's
> >>>> more important is ...
> >>>>
> >>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2))
> >>>>>
> >>>>> Even w/o avx512f, the transformation should also benefit since it
> >>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> >>>>
> >>>> ... that these transformations (from the, as per the doc,
> >>>> canonical representation of nand and nor) are already occurring
> >>> I see, there're already such simplifications in the gimple phase, so
> >>> the question: is there any need for and/ior:not not pattern?
> >>> Can you provide a testcase to demonstrate that and/ior: not not
> >>> pattern is needed?
> >>
> >> typedef int v4si __attribute__((vector_size(16)));
> >> v4si
> >> foo1 (v4si a, v4si b)
> >> {
> >> return ~a & ~b;
> >> }
> >>
> >> I only gimple have optimized it to
> >>
> >> <bb 2> [local count: 1073741824]:
> >> # DEBUG BEGIN_STMT
> >> _1 = a_2(D) | b_3(D);
> >> _4 = ~_1;
> >> return _4;
> >>
> >>
> >> But rtl still try to match
> >>
> >> (set (reg:V4SI 86)
> >> (and:V4SI (not:V4SI (reg:V4SI 88))
> >> (not:V4SI (reg:V4SI 89))))
> >>
> >> Hmm.
> > In rtl, we're using xor -1 for not, so it's
> >
> > (insn 8 7 9 2 (set (reg:V4SI 87)
> > (ior:V4SI (reg:V4SI 88)
> > (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
> > (expr_list:REG_DEAD (reg:V4SI 89)
> > (expr_list:REG_DEAD (reg:V4SI 88)
> > (nil))))
> > (insn 9 8 14 2 (set (reg:V4SI 86)
> > (xor:V4SI (reg:V4SI 87)
> > (const_vector:V4SI [
> > (const_int -1 [0xffffffffffffffff]) repeated x4
> > ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}
> >
> > Then simplified to
> >> (set (reg:V4SI 86)
> >> (and:V4SI (not:V4SI (reg:V4SI 88))
> >> (not:V4SI (reg:V4SI 89))))
> >>
> >
> > by
> >
> > 3565 case XOR:
> > 3566 if (trueop1 == CONST0_RTX (mode))
> > 3567 return op0;
> > 3568 if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
> > 3569 return simplify_gen_unary (NOT, mode, op0, mode);
> >
> > and
> >
> > 1018 /* Apply De Morgan's laws to reduce number of patterns for machines
> > 1019 with negating logical insns (and-not, nand, etc.). If result has
> > 1020 only one NOT, put it first, since that is how the patterns are
> > 1021 coded. */
> > 1022 if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
> > 1023 {
> > 1024 rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
> > 1025 machine_mode op_mode;
> > 1026
> > 1027 op_mode = GET_MODE (in1);
> > 1028 in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
> > 1029
> > 1030 op_mode = GET_MODE (in2);
> > 1031 if (op_mode == VOIDmode)
> > 1032 op_mode = mode;
> > 1033 in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
> > 1034
> > 1035 if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
> > 1036 std::swap (in1, in2);
> > 1037
> > 1038 return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
> > 1039 mode, in1, in2);
> > 1040 }
> >
> >
> > Ok, got it, and/ior:not not pattern LGTM then.
>
> Just to avoid misunderstandings - together with your initial
> reply that's then an "okay" to the patch as a whole, right?
Yes.
>
> Thanks, Jan
@@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode
return false;
case IOR:
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* (ior (not ...) ...) can be a single insn in AVX512. */
+ if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
+ && (GET_MODE_SIZE (mode) == 64
+ || (TARGET_AVX512VL
+ && (GET_MODE_SIZE (mode) == 32
+ || GET_MODE_SIZE (mode) == 16))))
+ {
+ rtx right = GET_CODE (XEXP (x, 1)) != NOT
+ ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0);
+
+ *total = ix86_vec_cost (mode, cost->sse_op)
+ + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed)
+ + rtx_cost (right, mode, outer_code, opno, speed);
+ return true;
+ }
+ *total = ix86_vec_cost (mode, cost->sse_op);
+ }
+ else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+ *total = cost->add * 2;
+ else
+ *total = cost->add;
+ return false;
+
case XOR:
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
*total = ix86_vec_cost (mode, cost->sse_op);
@@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode
/* pandn is a single instruction. */
if (GET_CODE (XEXP (x, 0)) == NOT)
{
+ rtx right = XEXP (x, 1);
+
+ /* (and (not ...) (not ...)) can be a single insn in AVX512. */
+ if (GET_CODE (right) == NOT && TARGET_AVX512F
+ && (GET_MODE_SIZE (mode) == 64
+ || (TARGET_AVX512VL
+ && (GET_MODE_SIZE (mode) == 32
+ || GET_MODE_SIZE (mode) == 16))))
+ right = XEXP (right, 0);
+
*total = ix86_vec_cost (mode, cost->sse_op)
+ rtx_cost (XEXP (XEXP (x, 0), 0), mode,
outer_code, opno, speed)
- + rtx_cost (XEXP (x, 1), mode,
- outer_code, opno, speed);
+ + rtx_cost (right, mode, outer_code, opno, speed);
return true;
}
else if (GET_CODE (XEXP (x, 1)) == NOT)
@@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode
case NOT:
if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
- // vnot is pxor -1.
- *total = ix86_vec_cost (mode, cost->sse_op) + 1;
+ {
+ /* (not (xor ...)) can be a single insn in AVX512. */
+ if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
+ && (GET_MODE_SIZE (mode) == 64
+ || (TARGET_AVX512VL
+ && (GET_MODE_SIZE (mode) == 32
+ || GET_MODE_SIZE (mode) == 16))))
+ {
+ *total = ix86_vec_cost (mode, cost->sse_op)
+ + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+ outer_code, opno, speed)
+ + rtx_cost (XEXP (XEXP (x, 0), 1), mode,
+ outer_code, opno, speed);
+ return true;
+ }
+
+ // vnot is pxor -1.
+ *total = ix86_vec_cost (mode, cost->sse_op) + 1;
+ }
else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
else
@@ -17616,6 +17616,98 @@
operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
})
+(define_insn "*iornot<mode>3"
+ [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
+ (ior:VI
+ (not:VI
+ (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
+ (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+ && (register_operand (operands[1], <MODE>mode)
+ || register_operand (operands[2], <MODE>mode))"
+{
+ if (!register_operand (operands[1], <MODE>mode))
+ {
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}";
+ return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}";
+ }
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}";
+ return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}";
+}
+ [(set_attr "type" "sselog")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "2,3")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_string "*")))])
+
+(define_insn "*xnor<mode>3"
+ [(set (match_operand:VI 0 "register_operand" "=v,v")
+ (not:VI
+ (xor:VI
+ (match_operand:VI 1 "bcst_vector_operand" "%v,v")
+ (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+ && (register_operand (operands[1], <MODE>mode)
+ || register_operand (operands[2], <MODE>mode))"
+{
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}";
+ else
+ return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}";
+}
+ [(set_attr "type" "sselog")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "1")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_string "*")))])
+
+(define_code_iterator andor [and ior])
+(define_code_attr nlogic [(and "nor") (ior "nand")])
+(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
+
+(define_insn "*<nlogic><mode>3"
+ [(set (match_operand:VI 0 "register_operand" "=v,v")
+ (andor:VI
+ (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
+ (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+ "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+ || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+ && (register_operand (operands[1], <MODE>mode)
+ || register_operand (operands[2], <MODE>mode))"
+{
+ if (TARGET_AVX512VL)
+ return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}";
+ else
+ return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}";
+}
+ [(set_attr "type" "sselog")
+ (set_attr "length_immediate" "1")
+ (set_attr "prefix" "evex")
+ (set (attr "mode")
+ (if_then_else (match_test "TARGET_AVX512VL")
+ (const_string "<sseinsnmode>")
+ (const_string "XI")))
+ (set (attr "enabled")
+ (if_then_else (eq_attr "alternative" "1")
+ (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+ (const_string "*")))])
+
(define_mode_iterator AVX512ZEXTMASK
[(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
@@ -0,0 +1,13 @@
+#include <immintrin.h>
+
+#define PASTER2(x,y) x##y
+#define PASTER3(x,y,z) _mm##x##_##y##_##z
+#define OP(vec, op, suffix) PASTER3 (vec, op, suffix)
+#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val)
+
+type
+foo (type x, SCALAR *f)
+{
+ return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f),
+ DUP (vec, suffix, ~0)));
+}
@@ -0,0 +1,13 @@
+#include <immintrin.h>
+
+#define PASTER2(x,y) x##y
+#define PASTER3(x,y,z) _mm##x##_##y##_##z
+#define OP(vec, op, suffix) PASTER3 (vec, op, suffix)
+#define DUP(vec, suffix, val) PASTER3 (vec, set1, suffix) (val)
+
+type
+foo (type x, SCALAR *f)
+{
+ return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)),
+ DUP (vec, suffix, *f));
+}
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
+/* { dg-final { scan-assembler-not "vpbroadcast" } } */
+
+#define type __m512i
+#define vec 512
+#define op or
+#define suffix epi32
+#define SCALAR int
+
+#include "avx512-binop-not-1.h"
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
+/* { dg-final { scan-assembler-not "vpbroadcast" } } */
+
+#define type __m512i
+#define vec 512
+#define op or
+#define suffix epi32
+#define SCALAR int
+
+#include "avx512-binop-not-2.h"