[1/5] x86: use VPTERNLOG for further bitwise two-vector operations

Message ID 457ffad0-9ecd-3e19-f5ab-6153ce4b8bad@suse.com
State Accepted
Headers
Series x86: make better use of VPTERNLOG{D,Q} |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Jan Beulich June 21, 2023, 6:25 a.m. UTC
  All combinations of and, ior, xor, and not involving two operands can be
expressed that way in a single insn.

gcc/

	PR target/93768
	* config/i386/i386.cc (ix86_rtx_costs): Further special-case
	bitwise vector operations.
	* config/i386/sse.md (*iornot<mode>3): New insn.
	(*xnor<mode>3): Likewise.
	(*<nlogic><mode>3): Likewise.
	(andor): New code iterator.
	(nlogic): New code attribute.
	(ternlog_nlogic): Likewise.

gcc/testsuite/

	PR target/93768
	gcc.target/i386/avx512-binop-not-1.h: New.
	gcc.target/i386/avx512-binop-not-2.h: New.
	gcc.target/i386/avx512f-orn-si-zmm-1.c: New test.
	gcc.target/i386/avx512f-orn-si-zmm-2.c: New test.
---
The use of VI matches that in e.g. one_cmpl<mode>2 /
<mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite
(here and there)
- V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled,
- V<n>TI not being covered,
- vector modes more narrow than 16 bytes not being covered.
  

Comments

Hongtao Liu June 25, 2023, 4:42 a.m. UTC | #1
On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> All combinations of and, ior, xor, and not involving two operands can be
> expressed that way in a single insn.
>
> gcc/
>
>         PR target/93768
>         * config/i386/i386.cc (ix86_rtx_costs): Further special-case
>         bitwise vector operations.
>         * config/i386/sse.md (*iornot<mode>3): New insn.
>         (*xnor<mode>3): Likewise.
>         (*<nlogic><mode>3): Likewise.
>         (andor): New code iterator.
>         (nlogic): New code attribute.
>         (ternlog_nlogic): Likewise.
>
> gcc/testsuite/
>
>         PR target/93768
>         gcc.target/i386/avx512-binop-not-1.h: New.
>         gcc.target/i386/avx512-binop-not-2.h: New.
>         gcc.target/i386/avx512f-orn-si-zmm-1.c: New test.
>         gcc.target/i386/avx512f-orn-si-zmm-2.c: New test.
> ---
> The use of VI matches that in e.g. one_cmpl<mode>2 /
> <mask_codefor>one_cmpl<mode>2<mask_name> and *andnot<mode>3, despite
> (here and there)
> - V64QI and V32HI being needlessly excluded when AVX512BW isn't enabled,
> - V<n>TI not being covered,
> - vector modes more narrow than 16 bytes not being covered.
>
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21178,6 +21178,32 @@ ix86_rtx_costs (rtx x, machine_mode mode
>        return false;
>
>      case IOR:
> +      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> +       {
> +         /* (ior (not ...) ...) can be a single insn in AVX512.  */
> +         if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
> +             && (GET_MODE_SIZE (mode) == 64
> +                 || (TARGET_AVX512VL
> +                     && (GET_MODE_SIZE (mode) == 32
> +                         || GET_MODE_SIZE (mode) == 16))))
> +           {
> +             rtx right = GET_CODE (XEXP (x, 1)) != NOT
> +                         ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0);
> +
> +             *total = ix86_vec_cost (mode, cost->sse_op)
> +                      + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
> +                                  outer_code, opno, speed)
> +                      + rtx_cost (right, mode, outer_code, opno, speed);
> +             return true;
> +           }
> +         *total = ix86_vec_cost (mode, cost->sse_op);
> +       }
> +      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
> +       *total = cost->add * 2;
> +      else
> +       *total = cost->add;
> +      return false;
> +
>      case XOR:
>        if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
>         *total = ix86_vec_cost (mode, cost->sse_op);
> @@ -21198,11 +21224,20 @@ ix86_rtx_costs (rtx x, machine_mode mode
>           /* pandn is a single instruction.  */
>           if (GET_CODE (XEXP (x, 0)) == NOT)
>             {
> +             rtx right = XEXP (x, 1);
> +
> +             /* (and (not ...) (not ...)) can be a single insn in AVX512.  */
> +             if (GET_CODE (right) == NOT && TARGET_AVX512F
> +                 && (GET_MODE_SIZE (mode) == 64
> +                     || (TARGET_AVX512VL
> +                         && (GET_MODE_SIZE (mode) == 32
> +                             || GET_MODE_SIZE (mode) == 16))))
> +               right = XEXP (right, 0);
> +
>               *total = ix86_vec_cost (mode, cost->sse_op)
>                        + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
>                                    outer_code, opno, speed)
> -                      + rtx_cost (XEXP (x, 1), mode,
> -                                  outer_code, opno, speed);
> +                      + rtx_cost (right, mode, outer_code, opno, speed);
>               return true;
>             }
>           else if (GET_CODE (XEXP (x, 1)) == NOT)
> @@ -21260,8 +21295,25 @@ ix86_rtx_costs (rtx x, machine_mode mode
>
>      case NOT:
>        if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
> -       // vnot is pxor -1.
> -       *total = ix86_vec_cost (mode, cost->sse_op) + 1;
> +       {
> +         /* (not (xor ...)) can be a single insn in AVX512.  */
> +         if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
> +             && (GET_MODE_SIZE (mode) == 64
> +                 || (TARGET_AVX512VL
> +                     && (GET_MODE_SIZE (mode) == 32
> +                         || GET_MODE_SIZE (mode) == 16))))
> +           {
> +             *total = ix86_vec_cost (mode, cost->sse_op)
> +                      + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
> +                                  outer_code, opno, speed)
> +                      + rtx_cost (XEXP (XEXP (x, 0), 1), mode,
> +                                  outer_code, opno, speed);
> +             return true;
> +           }
> +
> +         // vnot is pxor -1.
> +         *total = ix86_vec_cost (mode, cost->sse_op) + 1;
> +       }
>        else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
>         *total = cost->add * 2;
>        else
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17616,6 +17616,98 @@
>    operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
>  })
>
> +(define_insn "*iornot<mode>3"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
> +       (ior:VI
> +         (not:VI
> +           (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
> +         (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> +   && (register_operand (operands[1], <MODE>mode)
> +       || register_operand (operands[2], <MODE>mode))"
> +{
> +  if (!register_operand (operands[1], <MODE>mode))
> +    {
> +      if (TARGET_AVX512VL)
> +       return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}";
> +      return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}";
> +    }
> +  if (TARGET_AVX512VL)
> +    return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}";
> +  return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set (attr "mode")
> +        (if_then_else (match_test "TARGET_AVX512VL")
> +                     (const_string "<sseinsnmode>")
> +                     (const_string "XI")))
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "2,3")
> +                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> +                     (const_string "*")))])
> +
> +(define_insn "*xnor<mode>3"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> +       (not:VI
> +         (xor:VI
> +           (match_operand:VI 1 "bcst_vector_operand" "%v,v")
> +           (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> +   && (register_operand (operands[1], <MODE>mode)
> +       || register_operand (operands[2], <MODE>mode))"
> +{
> +  if (TARGET_AVX512VL)
> +    return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}";
> +  else
> +    return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set (attr "mode")
> +        (if_then_else (match_test "TARGET_AVX512VL")
> +                     (const_string "<sseinsnmode>")
> +                     (const_string "XI")))
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "1")
> +                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> +                     (const_string "*")))])
> +
> +(define_code_iterator andor [and ior])
> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> +
> +(define_insn "*<nlogic><mode>3"
> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> +       (andor:VI
> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
(and (not op1) (not op2)) -> (not (ior op1 op2))
(ior (not op1) (not op2)) -> (not (and op1 op2))

Even w/o avx512f, the transformation should also be a benefit since it
takes fewer logic operations, 3 -> 2 (or 2 -> 2 for pandn).

The other 2 patterns: *xnor<mode>3 and iornot<mode>3  LGTM.

> +  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
> +    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
> +   && (register_operand (operands[1], <MODE>mode)
> +       || register_operand (operands[2], <MODE>mode))"
> +{
> +  if (TARGET_AVX512VL)
> +    return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}";
> +  else
> +    return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "evex")
> +   (set (attr "mode")
> +        (if_then_else (match_test "TARGET_AVX512VL")
> +                     (const_string "<sseinsnmode>")
> +                     (const_string "XI")))
> +   (set (attr "enabled")
> +       (if_then_else (eq_attr "alternative" "1")
> +                     (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
> +                     (const_string "*")))])
> +
>  (define_mode_iterator AVX512ZEXTMASK
>    [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
>
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h
> @@ -0,0 +1,13 @@
> +#include <immintrin.h>
> +
> +#define PASTER2(x,y)           x##y
> +#define PASTER3(x,y,z)         _mm##x##_##y##_##z
> +#define OP(vec, op, suffix)    PASTER3 (vec, op, suffix)
> +#define DUP(vec, suffix, val)  PASTER3 (vec, set1, suffix) (val)
> +
> +type
> +foo (type x, SCALAR *f)
> +{
> +  return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f),
> +                                                        DUP (vec, suffix, ~0)));
> +}
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h
> @@ -0,0 +1,13 @@
> +#include <immintrin.h>
> +
> +#define PASTER2(x,y)           x##y
> +#define PASTER3(x,y,z)         _mm##x##_##y##_##z
> +#define OP(vec, op, suffix)    PASTER3 (vec, op, suffix)
> +#define DUP(vec, suffix, val)  PASTER3 (vec, set1, suffix) (val)
> +
> +type
> +foo (type x, SCALAR *f)
> +{
> +  return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)),
> +                              DUP (vec, suffix, *f));
> +}
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vpbroadcast" } } */
> +
> +#define type __m512i
> +#define vec 512
> +#define op or
> +#define suffix epi32
> +#define SCALAR int
> +
> +#include "avx512-binop-not-1.h"
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
> +/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
> +/* { dg-final { scan-assembler-not "vpbroadcast" } } */
> +
> +#define type __m512i
> +#define vec 512
> +#define op or
> +#define suffix epi32
> +#define SCALAR int
> +
> +#include "avx512-binop-not-2.h"
>
  
Jan Beulich June 25, 2023, 5:52 a.m. UTC | #2
On 25.06.2023 06:42, Hongtao Liu wrote:
> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> +(define_code_iterator andor [and ior])
>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
>> +
>> +(define_insn "*<nlogic><mode>3"
>> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
>> +       (andor:VI
>> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
>> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> (and (not op1))  (not op2)) -> (not: (ior: op1 op2))

This wouldn't be a win (not + andn) -> (or + not), but what's
more important is ...

> (ior (not op1) (not op2)) -> (not : (and op1 op2))
> 
> Even w/o avx512f, the transformation should also benefit since it
> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).

... that these transformations (from the, as per the doc,
canonical representation of nand and nor) are already occurring
in common code, _if_ no suitable insn can be found. That was at
least the conclusion I drew from looking around a lot, supported
by the code that's generated prior to this change.

Jan
  
Hongtao Liu June 25, 2023, 7:13 a.m. UTC | #3
On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 06:42, Hongtao Liu wrote:
> > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> +(define_code_iterator andor [and ior])
> >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> >> +
> >> +(define_insn "*<nlogic><mode>3"
> >> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> >> +       (andor:VI
> >> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> >> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > (and (not op1))  (not op2)) -> (not: (ior: op1 op2))
>
> This wouldn't be a win (not + andn) -> (or + not), but what's
> more important is ...
>
> > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> >
> > Even w/o avx512f, the transformation should also benefit since it
> > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
>
> ... that these transformations (from the, as per the doc,
> canonical representation of nand and nor) are already occurring
I see, there are already such simplifications in the gimple phase, so
the question is: is there any need for the and/ior-of-two-nots pattern?
Can you provide a testcase to demonstrate that the and/ior-of-two-nots
pattern is needed?
> in common code, _if_ no suitable insn can be found. That was at
> least the conclusion I drew from looking around a lot, supported
> by the code that's generated prior to this change.
>
> Jan
  
Hongtao Liu June 25, 2023, 7:23 a.m. UTC | #4
On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> >
> > On 25.06.2023 06:42, Hongtao Liu wrote:
> > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> +(define_code_iterator andor [and ior])
> > >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > >> +
> > >> +(define_insn "*<nlogic><mode>3"
> > >> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> > >> +       (andor:VI
> > >> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > >> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > > (and (not op1))  (not op2)) -> (not: (ior: op1 op2))
> >
> > This wouldn't be a win (not + andn) -> (or + not), but what's
> > more important is ...
> >
> > > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> > >
> > > Even w/o avx512f, the transformation should also benefit since it
> > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> >
> > ... that these transformations (from the, as per the doc,
> > canonical representation of nand and nor) are already occurring
> I see, there're already such simplifications in the gimple phase, so
> the question: is there any need for and/ior:not not pattern?
> Can you provide a testcase to demonstrate that and/ior: not not
> pattern is needed?

typedef int v4si __attribute__((vector_size(16)));
v4si
foo1 (v4si a, v4si b)
{
    return ~a & ~b;
}

I see gimple has optimized it to

  <bb 2> [local count: 1073741824]:
  # DEBUG BEGIN_STMT
  _1 = a_2(D) | b_3(D);
  _4 = ~_1;
  return _4;


But rtl still tries to match

(set (reg:V4SI 86)
    (and:V4SI (not:V4SI (reg:V4SI 88))
        (not:V4SI (reg:V4SI 89))))

Hmm.
> > in common code, _if_ no suitable insn can be found. That was at
> > least the conclusion I drew from looking around a lot, supported
> > by the code that's generated prior to this change.
> >
> > Jan
>
>
>
> --
> BR,
> Hongtao
  
Hongtao Liu June 25, 2023, 7:30 a.m. UTC | #5
On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> > >
> > > On 25.06.2023 06:42, Hongtao Liu wrote:
> > > > On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > >>
> > > >> +(define_code_iterator andor [and ior])
> > > >> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> > > >> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> > > >> +
> > > >> +(define_insn "*<nlogic><mode>3"
> > > >> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> > > >> +       (andor:VI
> > > >> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> > > >> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> > > > I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> > > > (and (not op1))  (not op2)) -> (not: (ior: op1 op2))
> > >
> > > This wouldn't be a win (not + andn) -> (or + not), but what's
> > > more important is ...
> > >
> > > > (ior (not op1) (not op2)) -> (not : (and op1 op2))
> > > >
> > > > Even w/o avx512f, the transformation should also benefit since it
> > > > takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> > >
> > > ... that these transformations (from the, as per the doc,
> > > canonical representation of nand and nor) are already occurring
> > I see, there're already such simplifications in the gimple phase, so
> > the question: is there any need for and/ior:not not pattern?
> > Can you provide a testcase to demonstrate that and/ior: not not
> > pattern is needed?
>
> typedef int v4si __attribute__((vector_size(16)));
> v4si
> foo1 (v4si a, v4si b)
> {
>     return ~a & ~b;
> }
>
> I only gimple have optimized it to
>
>   <bb 2> [local count: 1073741824]:
>   # DEBUG BEGIN_STMT
>   _1 = a_2(D) | b_3(D);
>   _4 = ~_1;
>   return _4;
>
>
> But rtl still try to match
>
> (set (reg:V4SI 86)
>     (and:V4SI (not:V4SI (reg:V4SI 88))
>         (not:V4SI (reg:V4SI 89))))
>
> Hmm.
In rtl, we're using xor -1 for not, so it's

(insn 8 7 9 2 (set (reg:V4SI 87)
        (ior:V4SI (reg:V4SI 88)
            (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
     (expr_list:REG_DEAD (reg:V4SI 89)
        (expr_list:REG_DEAD (reg:V4SI 88)
            (nil))))
(insn 9 8 14 2 (set (reg:V4SI 86)
        (xor:V4SI (reg:V4SI 87)
            (const_vector:V4SI [
                    (const_int -1 [0xffffffffffffffff]) repeated x4
                ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}

Then simplified to
> (set (reg:V4SI 86)
>     (and:V4SI (not:V4SI (reg:V4SI 88))
>         (not:V4SI (reg:V4SI 89))))
>

by

3565    case XOR:
3566      if (trueop1 == CONST0_RTX (mode))
3567        return op0;
3568      if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
3569        return simplify_gen_unary (NOT, mode, op0, mode);

and

1018      /* Apply De Morgan's laws to reduce number of patterns for machines
1019         with negating logical insns (and-not, nand, etc.).  If result has
1020         only one NOT, put it first, since that is how the patterns are
1021         coded.  */
1022      if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
1023        {
1024          rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
1025          machine_mode op_mode;
1026
1027          op_mode = GET_MODE (in1);
1028          in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
1029
1030          op_mode = GET_MODE (in2);
1031          if (op_mode == VOIDmode)
1032            op_mode = mode;
1033          in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
1034
1035          if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
1036            std::swap (in1, in2);
1037
1038          return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
1039                                 mode, in1, in2);
1040        }


Ok, got it, and/ior:not not pattern LGTM then.

> > > in common code, _if_ no suitable insn can be found. That was at
> > > least the conclusion I drew from looking around a lot, supported
> > > by the code that's generated prior to this change.
> > >
> > > Jan
> >
> >
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao
  
Jan Beulich June 25, 2023, 1:35 p.m. UTC | #6
On 25.06.2023 09:30, Hongtao Liu wrote:
> On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
>>
>> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
>>>
>>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>
>>>> On 25.06.2023 06:42, Hongtao Liu wrote:
>>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>
>>>>>> +(define_code_iterator andor [and ior])
>>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
>>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
>>>>>> +
>>>>>> +(define_insn "*<nlogic><mode>3"
>>>>>> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
>>>>>> +       (andor:VI
>>>>>> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
>>>>>> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
>>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
>>>>> (and (not op1))  (not op2)) -> (not: (ior: op1 op2))
>>>>
>>>> This wouldn't be a win (not + andn) -> (or + not), but what's
>>>> more important is ...
>>>>
>>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2))
>>>>>
>>>>> Even w/o avx512f, the transformation should also benefit since it
>>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
>>>>
>>>> ... that these transformations (from the, as per the doc,
>>>> canonical representation of nand and nor) are already occurring
>>> I see, there're already such simplifications in the gimple phase, so
>>> the question: is there any need for and/ior:not not pattern?
>>> Can you provide a testcase to demonstrate that and/ior: not not
>>> pattern is needed?
>>
>> typedef int v4si __attribute__((vector_size(16)));
>> v4si
>> foo1 (v4si a, v4si b)
>> {
>>     return ~a & ~b;
>> }
>>
>> I only gimple have optimized it to
>>
>>   <bb 2> [local count: 1073741824]:
>>   # DEBUG BEGIN_STMT
>>   _1 = a_2(D) | b_3(D);
>>   _4 = ~_1;
>>   return _4;
>>
>>
>> But rtl still try to match
>>
>> (set (reg:V4SI 86)
>>     (and:V4SI (not:V4SI (reg:V4SI 88))
>>         (not:V4SI (reg:V4SI 89))))
>>
>> Hmm.
> In rtl, we're using xor -1 for not, so it's
> 
> (insn 8 7 9 2 (set (reg:V4SI 87)
>         (ior:V4SI (reg:V4SI 88)
>             (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
>      (expr_list:REG_DEAD (reg:V4SI 89)
>         (expr_list:REG_DEAD (reg:V4SI 88)
>             (nil))))
> (insn 9 8 14 2 (set (reg:V4SI 86)
>         (xor:V4SI (reg:V4SI 87)
>             (const_vector:V4SI [
>                     (const_int -1 [0xffffffffffffffff]) repeated x4
>                 ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}
> 
> Then simplified to
>> (set (reg:V4SI 86)
>>     (and:V4SI (not:V4SI (reg:V4SI 88))
>>         (not:V4SI (reg:V4SI 89))))
>>
> 
> by
> 
> 3565    case XOR:
> 3566      if (trueop1 == CONST0_RTX (mode))
> 3567        return op0;
> 3568      if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
> 3569        return simplify_gen_unary (NOT, mode, op0, mode);
> 
> and
> 
> 1018      /* Apply De Morgan's laws to reduce number of patterns for machines
> 1019         with negating logical insns (and-not, nand, etc.).  If result has
> 1020         only one NOT, put it first, since that is how the patterns are
> 1021         coded.  */
> 1022      if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
> 1023        {
> 1024          rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
> 1025          machine_mode op_mode;
> 1026
> 1027          op_mode = GET_MODE (in1);
> 1028          in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
> 1029
> 1030          op_mode = GET_MODE (in2);
> 1031          if (op_mode == VOIDmode)
> 1032            op_mode = mode;
> 1033          in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
> 1034
> 1035          if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
> 1036            std::swap (in1, in2);
> 1037
> 1038          return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
> 1039                                 mode, in1, in2);
> 1040        }
> 
> 
> Ok, got it, and/ior:not not pattern LGTM then.

Just to avoid misunderstandings - together with your initial
reply that's then an "okay" to the patch as a whole, right?

Thanks, Jan
  
Hongtao Liu June 26, 2023, 12:42 a.m. UTC | #7
On Sun, Jun 25, 2023 at 9:35 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 25.06.2023 09:30, Hongtao Liu wrote:
> > On Sun, Jun 25, 2023 at 3:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >>
> >> On Sun, Jun 25, 2023 at 3:13 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >>>
> >>> On Sun, Jun 25, 2023 at 1:52 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>
> >>>> On 25.06.2023 06:42, Hongtao Liu wrote:
> >>>>> On Wed, Jun 21, 2023 at 2:26 PM Jan Beulich via Gcc-patches
> >>>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>>>
> >>>>>> +(define_code_iterator andor [and ior])
> >>>>>> +(define_code_attr nlogic [(and "nor") (ior "nand")])
> >>>>>> +(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
> >>>>>> +
> >>>>>> +(define_insn "*<nlogic><mode>3"
> >>>>>> +  [(set (match_operand:VI 0 "register_operand" "=v,v")
> >>>>>> +       (andor:VI
> >>>>>> +         (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
> >>>>>> +         (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
> >>>>> I'm thinking of doing it in simplify_rtx or gimple match.pd to transform
> >>>>> (and (not op1))  (not op2)) -> (not: (ior: op1 op2))
> >>>>
> >>>> This wouldn't be a win (not + andn) -> (or + not), but what's
> >>>> more important is ...
> >>>>
> >>>>> (ior (not op1) (not op2)) -> (not : (and op1 op2))
> >>>>>
> >>>>> Even w/o avx512f, the transformation should also benefit since it
> >>>>> takes less logic operations 3 -> 2.(or 2 -> 2 for pandn).
> >>>>
> >>>> ... that these transformations (from the, as per the doc,
> >>>> canonical representation of nand and nor) are already occurring
> >>> I see, there're already such simplifications in the gimple phase, so
> >>> the question: is there any need for and/ior:not not pattern?
> >>> Can you provide a testcase to demonstrate that and/ior: not not
> >>> pattern is needed?
> >>
> >> typedef int v4si __attribute__((vector_size(16)));
> >> v4si
> >> foo1 (v4si a, v4si b)
> >> {
> >>     return ~a & ~b;
> >> }
> >>
> >> I only gimple have optimized it to
> >>
> >>   <bb 2> [local count: 1073741824]:
> >>   # DEBUG BEGIN_STMT
> >>   _1 = a_2(D) | b_3(D);
> >>   _4 = ~_1;
> >>   return _4;
> >>
> >>
> >> But rtl still try to match
> >>
> >> (set (reg:V4SI 86)
> >>     (and:V4SI (not:V4SI (reg:V4SI 88))
> >>         (not:V4SI (reg:V4SI 89))))
> >>
> >> Hmm.
> > In rtl, we're using xor -1 for not, so it's
> >
> > (insn 8 7 9 2 (set (reg:V4SI 87)
> >         (ior:V4SI (reg:V4SI 88)
> >             (reg:V4SI 89))) "/app/example.cpp":6:15 6830 {*iorv4si3}
> >      (expr_list:REG_DEAD (reg:V4SI 89)
> >         (expr_list:REG_DEAD (reg:V4SI 88)
> >             (nil))))
> > (insn 9 8 14 2 (set (reg:V4SI 86)
> >         (xor:V4SI (reg:V4SI 87)
> >             (const_vector:V4SI [
> >                     (const_int -1 [0xffffffffffffffff]) repeated x4
> >                 ]))) "/app/example.cpp":6:18 6792 {*one_cmplv4si2}
> >
> > Then simplified to
> >> (set (reg:V4SI 86)
> >>     (and:V4SI (not:V4SI (reg:V4SI 88))
> >>         (not:V4SI (reg:V4SI 89))))
> >>
> >
> > by
> >
> > 3565    case XOR:
> > 3566      if (trueop1 == CONST0_RTX (mode))
> > 3567        return op0;
> > 3568      if (INTEGRAL_MODE_P (mode) && trueop1 == CONSTM1_RTX (mode))
> > 3569        return simplify_gen_unary (NOT, mode, op0, mode);
> >
> > and
> >
> > 1018      /* Apply De Morgan's laws to reduce number of patterns for machines
> > 1019         with negating logical insns (and-not, nand, etc.).  If result has
> > 1020         only one NOT, put it first, since that is how the patterns are
> > 1021         coded.  */
> > 1022      if (GET_CODE (op) == IOR || GET_CODE (op) == AND)
> > 1023        {
> > 1024          rtx in1 = XEXP (op, 0), in2 = XEXP (op, 1);
> > 1025          machine_mode op_mode;
> > 1026
> > 1027          op_mode = GET_MODE (in1);
> > 1028          in1 = simplify_gen_unary (NOT, op_mode, in1, op_mode);
> > 1029
> > 1030          op_mode = GET_MODE (in2);
> > 1031          if (op_mode == VOIDmode)
> > 1032            op_mode = mode;
> > 1033          in2 = simplify_gen_unary (NOT, op_mode, in2, op_mode);
> > 1034
> > 1035          if (GET_CODE (in2) == NOT && GET_CODE (in1) != NOT)
> > 1036            std::swap (in1, in2);
> > 1037
> > 1038          return gen_rtx_fmt_ee (GET_CODE (op) == IOR ? AND : IOR,
> > 1039                                 mode, in1, in2);
> > 1040        }
> >
> >
> > Ok, got it, and/ior:not not pattern LGTM then.
>
> Just to avoid misunderstandings - together with your initial
> reply that's then an "okay" to the patch as a whole, right?
Yes.
>
> Thanks, Jan
  

Patch

--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21178,6 +21178,32 @@  ix86_rtx_costs (rtx x, machine_mode mode
       return false;
 
     case IOR:
+      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+	{
+	  /* (ior (not ...) ...) can be a single insn in AVX512.  */
+	  if (GET_CODE (XEXP (x, 0)) == NOT && TARGET_AVX512F
+	      && (GET_MODE_SIZE (mode) == 64
+		  || (TARGET_AVX512VL
+		      && (GET_MODE_SIZE (mode) == 32
+			  || GET_MODE_SIZE (mode) == 16))))
+	    {
+	      rtx right = GET_CODE (XEXP (x, 1)) != NOT
+			  ? XEXP (x, 1) : XEXP (XEXP (x, 1), 0);
+
+	      *total = ix86_vec_cost (mode, cost->sse_op)
+		       + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+				   outer_code, opno, speed)
+		       + rtx_cost (right, mode, outer_code, opno, speed);
+	      return true;
+	    }
+	  *total = ix86_vec_cost (mode, cost->sse_op);
+	}
+      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+	*total = cost->add * 2;
+      else
+	*total = cost->add;
+      return false;
+
     case XOR:
       if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	*total = ix86_vec_cost (mode, cost->sse_op);
@@ -21198,11 +21224,20 @@  ix86_rtx_costs (rtx x, machine_mode mode
 	  /* pandn is a single instruction.  */
 	  if (GET_CODE (XEXP (x, 0)) == NOT)
 	    {
+	      rtx right = XEXP (x, 1);
+
+	      /* (and (not ...) (not ...)) can be a single insn in AVX512.  */
+	      if (GET_CODE (right) == NOT && TARGET_AVX512F
+		  && (GET_MODE_SIZE (mode) == 64
+		      || (TARGET_AVX512VL
+			  && (GET_MODE_SIZE (mode) == 32
+			      || GET_MODE_SIZE (mode) == 16))))
+		right = XEXP (right, 0);
+
 	      *total = ix86_vec_cost (mode, cost->sse_op)
 		       + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
 				   outer_code, opno, speed)
-		       + rtx_cost (XEXP (x, 1), mode,
-				   outer_code, opno, speed);
+		       + rtx_cost (right, mode, outer_code, opno, speed);
 	      return true;
 	    }
 	  else if (GET_CODE (XEXP (x, 1)) == NOT)
@@ -21260,8 +21295,25 @@  ix86_rtx_costs (rtx x, machine_mode mode
 
     case NOT:
       if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-	// vnot is pxor -1.
-	*total = ix86_vec_cost (mode, cost->sse_op) + 1;
+	{
+	  /* (not (xor ...)) can be a single insn in AVX512.  */
+	  if (GET_CODE (XEXP (x, 0)) == XOR && TARGET_AVX512F
+	      && (GET_MODE_SIZE (mode) == 64
+		  || (TARGET_AVX512VL
+		      && (GET_MODE_SIZE (mode) == 32
+			  || GET_MODE_SIZE (mode) == 16))))
+	    {
+	      *total = ix86_vec_cost (mode, cost->sse_op)
+		       + rtx_cost (XEXP (XEXP (x, 0), 0), mode,
+				   outer_code, opno, speed)
+		       + rtx_cost (XEXP (XEXP (x, 0), 1), mode,
+				   outer_code, opno, speed);
+	      return true;
+	    }
+
+	  // vnot is pxor -1.
+	  *total = ix86_vec_cost (mode, cost->sse_op) + 1;
+	}
       else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
 	*total = cost->add * 2;
       else
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17616,6 +17616,98 @@ 
   operands[2] = force_reg (V1TImode, CONSTM1_RTX (V1TImode));
 })
 
+(define_insn "*iornot<mode>3"
+  [(set (match_operand:VI 0 "register_operand" "=v,v,v,v")
+	(ior:VI
+	  (not:VI
+	    (match_operand:VI 1 "bcst_vector_operand" "v,Br,v,m"))
+	  (match_operand:VI 2 "bcst_vector_operand" "vBr,v,m,v")))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+   && (register_operand (operands[1], <MODE>mode)
+       || register_operand (operands[2], <MODE>mode))"
+{
+  if (!register_operand (operands[1], <MODE>mode))
+    {
+      if (TARGET_AVX512VL)
+	return "vpternlog<ternlogsuffix>\t{$0xdd, %1, %2, %0|%0, %2, %1, 0xdd}";
+      return "vpternlog<ternlogsuffix>\t{$0xdd, %g1, %g2, %g0|%g0, %g2, %g1, 0xdd}";
+    }
+  if (TARGET_AVX512VL)
+    return "vpternlog<ternlogsuffix>\t{$0xbb, %2, %1, %0|%0, %1, %2, 0xbb}";
+  return "vpternlog<ternlogsuffix>\t{$0xbb, %g2, %g1, %g0|%g0, %g1, %g2, 0xbb}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set (attr "mode")
+        (if_then_else (match_test "TARGET_AVX512VL")
+		      (const_string "<sseinsnmode>")
+		      (const_string "XI")))
+   (set (attr "enabled")
+	(if_then_else (eq_attr "alternative" "2,3")
+		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+		      (const_string "*")))])
+
+(define_insn "*xnor<mode>3"
+  [(set (match_operand:VI 0 "register_operand" "=v,v")
+	(not:VI
+	  (xor:VI
+	    (match_operand:VI 1 "bcst_vector_operand" "%v,v")
+	    (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+   && (register_operand (operands[1], <MODE>mode)
+       || register_operand (operands[2], <MODE>mode))"
+{
+  if (TARGET_AVX512VL)
+    return "vpternlog<ternlogsuffix>\t{$0x99, %2, %1, %0|%0, %1, %2, 0x99}";
+  else
+    return "vpternlog<ternlogsuffix>\t{$0x99, %g2, %g1, %g0|%g0, %g1, %g2, 0x99}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set (attr "mode")
+        (if_then_else (match_test "TARGET_AVX512VL")
+		      (const_string "<sseinsnmode>")
+		      (const_string "XI")))
+   (set (attr "enabled")
+	(if_then_else (eq_attr "alternative" "1")
+		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+		      (const_string "*")))])
+
+(define_code_iterator andor [and ior])
+(define_code_attr nlogic [(and "nor") (ior "nand")])
+(define_code_attr ternlog_nlogic [(and "0x11") (ior "0x77")])
+
+(define_insn "*<nlogic><mode>3"
+  [(set (match_operand:VI 0 "register_operand" "=v,v")
+	(andor:VI
+	  (not:VI (match_operand:VI 1 "bcst_vector_operand" "%v,v"))
+	  (not:VI (match_operand:VI 2 "bcst_vector_operand" "vBr,m"))))]
+  "(<MODE_SIZE> == 64 || TARGET_AVX512VL
+    || (TARGET_AVX512F && !TARGET_PREFER_AVX256))
+   && (register_operand (operands[1], <MODE>mode)
+       || register_operand (operands[2], <MODE>mode))"
+{
+  if (TARGET_AVX512VL)
+    return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %2, %1, %0|%0, %1, %2, <ternlog_nlogic>}";
+  else
+    return "vpternlog<ternlogsuffix>\t{$<ternlog_nlogic>, %g2, %g1, %g0|%g0, %g1, %g2, <ternlog_nlogic>}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "evex")
+   (set (attr "mode")
+        (if_then_else (match_test "TARGET_AVX512VL")
+		      (const_string "<sseinsnmode>")
+		      (const_string "XI")))
+   (set (attr "enabled")
+	(if_then_else (eq_attr "alternative" "1")
+		      (symbol_ref "<MODE_SIZE> == 64 || TARGET_AVX512VL")
+		      (const_string "*")))])
+
 (define_mode_iterator AVX512ZEXTMASK
   [(DI "TARGET_AVX512BW") (SI "TARGET_AVX512BW") HI])
 
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-1.h
@@ -0,0 +1,13 @@ 
+#include <immintrin.h>
+
+#define PASTER2(x,y)		x##y
+#define PASTER3(x,y,z)		_mm##x##_##y##_##z
+#define OP(vec, op, suffix)	PASTER3 (vec, op, suffix)
+#define DUP(vec, suffix, val)	PASTER3 (vec, set1, suffix) (val)
+
+type
+foo (type x, SCALAR *f)
+{
+  return OP (vec, op, suffix) (x, OP (vec, xor, suffix) (DUP (vec, suffix, *f),
+							 DUP (vec, suffix, ~0)));
+}
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512-binop-not-2.h
@@ -0,0 +1,13 @@ 
+#include <immintrin.h>
+
+#define PASTER2(x,y)		x##y
+#define PASTER3(x,y,z)		_mm##x##_##y##_##z
+#define OP(vec, op, suffix)	PASTER3 (vec, op, suffix)
+#define DUP(vec, suffix, val)	PASTER3 (vec, set1, suffix) (val)
+
+type
+foo (type x, SCALAR *f)
+{
+  return OP (vec, op, suffix) (OP (vec, xor, suffix) (x, DUP (vec, suffix, ~0)),
+			       DUP (vec, suffix, *f));
+}
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-1.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xdd, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
+/* { dg-final { scan-assembler-not "vpbroadcast" } } */
+
+#define type __m512i
+#define vec 512
+#define op or
+#define suffix epi32
+#define SCALAR int
+
+#include "avx512-binop-not-1.h"
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-orn-si-zmm-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -mno-avx512vl -mprefer-vector-width=512 -O2" } */
+/* { dg-final { scan-assembler-times "vpternlogd\[ \\t\]+\\\$0xbb, \\(%(?:eax|rdi|edi)\\)\\\{1to\[1-8\]+\\\}, %zmm\[0-9\]+, %zmm0" 1 } } */
+/* { dg-final { scan-assembler-not "vpbroadcast" } } */
+
+#define type __m512i
+#define vec 512
+#define op or
+#define suffix epi32
+#define SCALAR int
+
+#include "avx512-binop-not-2.h"