[x86] Add define_insn_and_split to support general version of "kxnor".

Message ID 20221011080316.1778261-1-hongtao.liu@intel.com
State Accepted, archived
Headers
Series [x86] Add define_insn_and_split to support general version of "kxnor". |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

liuhongt Oct. 11, 2022, 8:03 a.m. UTC
  For general_reg_operand, it will be split into xor + not.
For mask_reg_operand, it will be split with UNSPEC_MASK_OP just
like what we did for other logic operations.

The patch will optimize xor+not to kxnor when possible.

Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

	* config/i386/i386.md (*notxor<mode>_1): New post_reload
	define_insn_and_split.
	(*notxorqi_1): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr107093.c: New test.
---
 gcc/config/i386/i386.md                  | 71 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107093.c | 38 +++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107093.c
  

Comments

Uros Bizjak Oct. 11, 2022, 9:04 a.m. UTC | #1
On Tue, Oct 11, 2022 at 10:03 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For general_reg_operand, it will be split into xor + not.
> For mask_reg_operand, it will be split with UNSPEC_MASK_OP just
> like what we did for other logic operations.
>
> The patch will optimize xor+not to kxnor when possible.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         * config/i386/i386.md (*notxor<mode>_1): New post_reload
>         define_insn_and_split.
>         (*notxorqi_1): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr107093.c: New test.

OK with a small fix below.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.md                  | 71 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr107093.c | 38 +++++++++++++
>  2 files changed, 109 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107093.c
>
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 1be9b669909..228edba2b40 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -10826,6 +10826,39 @@ (define_insn "*<code><mode>_1"
>     (set_attr "type" "alu, alu, msklog")
>     (set_attr "mode" "<MODE>")])
>
> +(define_insn_and_split "*notxor<mode>_1"
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
> +       (not:SWI248
> +         (xor:SWI248
> +           (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> +           (match_operand:SWI248 2 "<general_operand>" "r<i>,<m>,k"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (XOR, <MODE>mode, operands)"
> +  "#"
> +  "&& reload_completed"
> +  [(parallel
> +    [(set (match_dup 0)
> +         (xor:SWI248 (match_dup 1) (match_dup 2)))
> +     (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 0)
> +       (not:SWI248 (match_dup 1)))]

(not:SWI248 (match_dup 0))

in the above RTX.

> +{
> +  if (MASK_REGNO_P (REGNO (operands[0])))
> +    {
> +      emit_insn (gen_kxnor<mode> (operands[0], operands[1], operands[2]));
> +      DONE;
> +    }
> +}
> +  [(set (attr "isa")
> +       (cond [(eq_attr "alternative" "2")
> +                (if_then_else (eq_attr "mode" "SI,DI")
> +                  (const_string "avx512bw")
> +                  (const_string "avx512f"))
> +             ]
> +             (const_string "*")))
> +   (set_attr "type" "alu, alu, msklog")
> +   (set_attr "mode" "<MODE>")])
> +
>  (define_insn_and_split "*iordi_1_bts"
>    [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
>         (ior:DI
> @@ -10959,6 +10992,44 @@ (define_insn "*<code>qi_1"
>               (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
>            (symbol_ref "true")))])
>
> +(define_insn_and_split "*notxorqi_1"
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
> +       (not:QI
> +         (xor:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> +                 (match_operand:QI 2 "general_operand" "qn,m,rn,k"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (XOR, QImode, operands)"
> +  "#"
> +  "&& reload_completed"
> +  [(parallel
> +    [(set (match_dup 0)
> +         (xor:QI (match_dup 1) (match_dup 2)))
> +     (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 0)
> +       (not:QI (match_dup 0)))]
> +{
> +  if (mask_reg_operand (operands[0], QImode))
> +    {
> +      emit_insn (gen_kxnorqi (operands[0], operands[1], operands[2]));
> +      DONE;
> +    }
> +}
> +  [(set_attr "isa" "*,*,*,avx512f")
> +   (set_attr "type" "alu,alu,alu,msklog")
> +   (set (attr "mode")
> +       (cond [(eq_attr "alternative" "2")
> +                (const_string "SI")
> +               (and (eq_attr "alternative" "3")
> +                    (match_test "!TARGET_AVX512DQ"))
> +                (const_string "HI")
> +              ]
> +              (const_string "QI")))
> +   ;; Potential partial reg stall on alternative 2.
> +   (set (attr "preferred_for_speed")
> +     (cond [(eq_attr "alternative" "2")
> +             (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
> +          (symbol_ref "true")))])
> +
>  ;; Alternative 1 is needed to work around LRA limitation, see PR82524.
>  (define_insn_and_split "*<code><mode>_1_slp"
>    [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,&<r>"))
> diff --git a/gcc/testsuite/gcc.target/i386/pr107093.c b/gcc/testsuite/gcc.target/i386/pr107093.c
> new file mode 100644
> index 00000000000..23e30cbac0f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr107093.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2 -mavx512vl" } */
> +/* { dg-final { scan-assembler-times {(?n)kxnor[bwqd]} 4 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times {(?n)kxnor[bwdq]} 3 { target ia32 } } }  */
> +
> +#include<immintrin.h>
> +
> +__m512i
> +foo (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask32 k1 = _mm512_cmp_epi16_mask (a, b, 1);
> +  __mmask32 k2 = _mm512_cmp_epi16_mask (c, d, 2);
> +  return _mm512_mask_mov_epi16 (a, ~(k1 ^ k2), c);
> +}
> +
> +__m512i
> +foo1 (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask16 k1 = _mm512_cmp_epi32_mask (a, b, 1);
> +  __mmask16 k2 = _mm512_cmp_epi32_mask (c, d, 2);
> +  return _mm512_mask_mov_epi32 (a, ~(k1 ^ k2), c);
> +}
> +
> +__m512i
> +foo2 (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask64 k1 = _mm512_cmp_epi8_mask (a, b, 1);
> +  __mmask64 k2 = _mm512_cmp_epi8_mask (c, d, 2);
> +  return _mm512_mask_mov_epi8 (a, ~(k1 ^ k2), c);
> +}
> +
> +__m512i
> +foo3 (__m512i a, __m512i b, __m512i c, __m512i d)
> +{
> +  __mmask8 k1 = _mm512_cmp_epi64_mask (a, b, 1);
> +  __mmask8 k2 = _mm512_cmp_epi64_mask (c, d, 2);
> +  return _mm512_mask_mov_epi64 (a, ~(k1 ^ k2), c);
> +}
> --
> 2.27.0
>
  
Jakub Jelinek Oct. 11, 2022, 1:58 p.m. UTC | #2
On Tue, Oct 11, 2022 at 04:03:16PM +0800, liuhongt via Gcc-patches wrote:
> gcc/ChangeLog:
> 
> 	* config/i386/i386.md (*notxor<mode>_1): New post_reload
> 	define_insn_and_split.
> 	(*notxorqi_1): Ditto.

> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -10826,6 +10826,39 @@ (define_insn "*<code><mode>_1"
>     (set_attr "type" "alu, alu, msklog")
>     (set_attr "mode" "<MODE>")])
>  
> +(define_insn_and_split "*notxor<mode>_1"
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
> +	(not:SWI248
> +	  (xor:SWI248
> +	    (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> +	    (match_operand:SWI248 2 "<general_operand>" "r<i>,<m>,k"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (XOR, <MODE>mode, operands)"
> +  "#"
> +  "&& reload_completed"
> +  [(parallel
> +    [(set (match_dup 0)
> +	  (xor:SWI248 (match_dup 1) (match_dup 2)))
> +     (clobber (reg:CC FLAGS_REG))])
> +   (set (match_dup 0)
> +	(not:SWI248 (match_dup 1)))]
> +{
> +  if (MASK_REGNO_P (REGNO (operands[0])))

This causes --enable-checking=yes,rtl,extra regression on
gcc.dg/store_merging_13.c test on x86_64-linux:
.../gcc/testsuite/gcc.dg/store_merging_13.c: In function 'f13':
.../gcc/testsuite/gcc.dg/store_merging_13.c:189:1: internal compiler error: RTL check: expected code 'reg', have 'mem' in rhs_regno, at rtl.h:1932
0x7b0c8f rtl_check_failed_code1(rtx_def const*, rtx_code, char const*, int, char const*)
        ../../gcc/rtl.cc:916
0x8e74be rhs_regno
        ../../gcc/rtl.h:1932
0x9785fd rhs_regno
        ./genrtl.h:120
0x9785fd gen_split_260(rtx_insn*, rtx_def**)
        ../../gcc/config/i386/i386.md:10846
0x23596dc split_insns(rtx_def*, rtx_insn*)
        ../../gcc/config/i386/i386.md:16392
0xfccd5a try_split(rtx_def*, rtx_insn*, int)
        ../../gcc/emit-rtl.cc:3799
0x132e9d8 split_insn
        ../../gcc/recog.cc:3384
0x13359d5 split_all_insns()
        ../../gcc/recog.cc:3488
0x1335ae8 execute
        ../../gcc/recog.cc:4412
Please submit a full bug report, with preprocessed source (by using -freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.

Fixed thusly, tested on x86_64-linux, committed to trunk as obvious.

2022-10-11  Jakub Jelinek  <jakub@redhat.com>

	PR target/107185
	* config/i386/i386.md (*notxor<mode>_1): Use MASK_REG_P (x) instead of
	MASK_REGNO_P (REGNO (x)).

--- gcc/config/i386/i386.md.jj	2022-10-11 12:10:42.188891134 +0200
+++ gcc/config/i386/i386.md	2022-10-11 15:47:45.531449089 +0200
@@ -10843,7 +10843,7 @@ (define_insn_and_split "*notxor<mode>_1"
    (set (match_dup 0)
 	(not:SWI248 (match_dup 0)))]
 {
-  if (MASK_REGNO_P (REGNO (operands[0])))
+  if (MASK_REG_P (operands[0]))
     {
       emit_insn (gen_kxnor<mode> (operands[0], operands[1], operands[2]));
       DONE;


	Jakub
  
Li, Pan2 via Gcc-patches Oct. 12, 2022, 12:56 a.m. UTC | #3
> -----Original Message-----
> From: Jakub Jelinek <jakub@redhat.com>
> Sent: Tuesday, October 11, 2022 9:59 PM
> To: Liu, Hongtao <hongtao.liu@intel.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] [x86] Add define_insn_and_split to support general
> version of "kxnor".
> 
> On Tue, Oct 11, 2022 at 04:03:16PM +0800, liuhongt via Gcc-patches wrote:
> > gcc/ChangeLog:
> >
> > 	* config/i386/i386.md (*notxor<mode>_1): New post_reload
> > 	define_insn_and_split.
> > 	(*notxorqi_1): Ditto.
> 
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -10826,6 +10826,39 @@ (define_insn "*<code><mode>_1"
> >     (set_attr "type" "alu, alu, msklog")
> >     (set_attr "mode" "<MODE>")])
> >
> > +(define_insn_and_split "*notxor<mode>_1"
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
> > +	(not:SWI248
> > +	  (xor:SWI248
> > +	    (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> > +	    (match_operand:SWI248 2 "<general_operand>" "r<i>,<m>,k"))))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "ix86_binary_operator_ok (XOR, <MODE>mode, operands)"
> > +  "#"
> > +  "&& reload_completed"
> > +  [(parallel
> > +    [(set (match_dup 0)
> > +	  (xor:SWI248 (match_dup 1) (match_dup 2)))
> > +     (clobber (reg:CC FLAGS_REG))])
> > +   (set (match_dup 0)
> > +	(not:SWI248 (match_dup 1)))]
> > +{
> > +  if (MASK_REGNO_P (REGNO (operands[0])))
> 
> This causes --enable-checking=yes,rtl,extra regression on
> gcc.dg/store_merging_13.c test on x86_64-linux:
> .../gcc/testsuite/gcc.dg/store_merging_13.c: In function 'f13':
> .../gcc/testsuite/gcc.dg/store_merging_13.c:189:1: internal compiler error: RTL
> check: expected code 'reg', have 'mem' in rhs_regno, at rtl.h:1932 0x7b0c8f
> rtl_check_failed_code1(rtx_def const*, rtx_code, char const*, int, char const*)
>         ../../gcc/rtl.cc:916
> 0x8e74be rhs_regno
>         ../../gcc/rtl.h:1932
> 0x9785fd rhs_regno
>         ./genrtl.h:120
> 0x9785fd gen_split_260(rtx_insn*, rtx_def**)
>         ../../gcc/config/i386/i386.md:10846
> 0x23596dc split_insns(rtx_def*, rtx_insn*)
>         ../../gcc/config/i386/i386.md:16392
> 0xfccd5a try_split(rtx_def*, rtx_insn*, int)
>         ../../gcc/emit-rtl.cc:3799
> 0x132e9d8 split_insn
>         ../../gcc/recog.cc:3384
> 0x13359d5 split_all_insns()
>         ../../gcc/recog.cc:3488
> 0x1335ae8 execute
>         ../../gcc/recog.cc:4412
> Please submit a full bug report, with preprocessed source (by using -freport-
> bug).
> Please include the complete backtrace with any bug report.
> See <https://gcc.gnu.org/bugs/> for instructions.
> 
> Fixed thusly, tested on x86_64-linux, committed to trunk as obvious.
Thanks.
> 
> 2022-10-11  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR target/107185
> 	* config/i386/i386.md (*notxor<mode>_1): Use MASK_REG_P (x)
> instead of
> 	MASK_REGNO_P (REGNO (x)).
> 
> --- gcc/config/i386/i386.md.jj	2022-10-11 12:10:42.188891134 +0200
> +++ gcc/config/i386/i386.md	2022-10-11 15:47:45.531449089 +0200
> @@ -10843,7 +10843,7 @@ (define_insn_and_split "*notxor<mode>_1"
>     (set (match_dup 0)
>  	(not:SWI248 (match_dup 0)))]
>  {
> -  if (MASK_REGNO_P (REGNO (operands[0])))
> +  if (MASK_REG_P (operands[0]))
>      {
>        emit_insn (gen_kxnor<mode> (operands[0], operands[1], operands[2]));
>        DONE;
> 
> 
> 	Jakub
  

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1be9b669909..228edba2b40 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -10826,6 +10826,39 @@  (define_insn "*<code><mode>_1"
    (set_attr "type" "alu, alu, msklog")
    (set_attr "mode" "<MODE>")])
 
+(define_insn_and_split "*notxor<mode>_1" ; ~(op1 ^ op2) for SWI248 modes; split after reload
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
+	(not:SWI248
+	  (xor:SWI248
+	    (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
+	    (match_operand:SWI248 2 "<general_operand>" "r<i>,<m>,k"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (XOR, <MODE>mode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel ; default split, step 1: xor into operand 0 (clobbers flags)
+    [(set (match_dup 0)
+	  (xor:SWI248 (match_dup 1) (match_dup 2)))
+     (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0) ; step 2: complement the xor result, which lives in operand 0
+	(not:SWI248 (match_dup 0)))]
+{
+  if (MASK_REG_P (operands[0])) /* Operand 0 may be a MEM ("=rm"), so REGNO must not be taken unchecked.  */
+    {
+      emit_insn (gen_kxnor<mode> (operands[0], operands[1], operands[2])); /* Mask destination: one kxnor replaces xor + not.  */
+      DONE;
+    }
+}
+  [(set (attr "isa") ; mask alternative (2): avx512bw for SI/DI, avx512f otherwise
+	(cond [(eq_attr "alternative" "2")
+		 (if_then_else (eq_attr "mode" "SI,DI")
+		   (const_string "avx512bw")
+		   (const_string "avx512f"))
+	      ]
+	      (const_string "*")))
+   (set_attr "type" "alu, alu, msklog")
+   (set_attr "mode" "<MODE>")])
+
 (define_insn_and_split "*iordi_1_bts"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
 	(ior:DI
@@ -10959,6 +10992,44 @@  (define_insn "*<code>qi_1"
 	      (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
 	   (symbol_ref "true")))])
 
+(define_insn_and_split "*notxorqi_1" ; QImode ~(op1 ^ op2); split after reload
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
+	(not:QI
+	  (xor:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
+		  (match_operand:QI 2 "general_operand" "qn,m,rn,k"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (XOR, QImode, operands)"
+  "#"
+  "&& reload_completed"
+  [(parallel ; default split, step 1: xor into operand 0 (clobbers flags)
+    [(set (match_dup 0)
+	  (xor:QI (match_dup 1) (match_dup 2)))
+     (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 0) ; step 2: complement operand 0 in place
+	(not:QI (match_dup 0)))]
+{
+  if (mask_reg_operand (operands[0], QImode)) /* Mask-register destination.  */
+    {
+      emit_insn (gen_kxnorqi (operands[0], operands[1], operands[2])); /* One kxnor replaces xor + not.  */
+      DONE;
+    }
+}
+  [(set_attr "isa" "*,*,*,avx512f") ; only the mask alternative (3) is ISA-gated
+   (set_attr "type" "alu,alu,alu,msklog")
+   (set (attr "mode") ; alternative 2 is SI; alternative 3 is HI without AVX512DQ; QI otherwise
+	(cond [(eq_attr "alternative" "2")
+		 (const_string "SI")
+		(and (eq_attr "alternative" "3")
+		     (match_test "!TARGET_AVX512DQ"))
+		 (const_string "HI")
+	       ]
+	       (const_string "QI")))
+   ;; Potential partial reg stall on alternative 2.
+   (set (attr "preferred_for_speed")
+     (cond [(eq_attr "alternative" "2")
+	      (symbol_ref "!TARGET_PARTIAL_REG_STALL")]
+	   (symbol_ref "true")))])
+
 ;; Alternative 1 is needed to work around LRA limitation, see PR82524.
 (define_insn_and_split "*<code><mode>_1_slp"
   [(set (strict_low_part (match_operand:SWI12 0 "register_operand" "+<r>,&<r>"))
diff --git a/gcc/testsuite/gcc.target/i386/pr107093.c b/gcc/testsuite/gcc.target/i386/pr107093.c
new file mode 100644
index 00000000000..23e30cbac0f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107093.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2 -mavx512vl" } */
+/* { dg-final { scan-assembler-times {(?n)kxnor[bwqd]} 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {(?n)kxnor[bwdq]} 3 { target ia32 } } }  */
+
+#include<immintrin.h>
+
+__m512i
+foo (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask32 k1 = _mm512_cmp_epi16_mask (a, b, 1);
+  __mmask32 k2 = _mm512_cmp_epi16_mask (c, d, 2);
+  return _mm512_mask_mov_epi16 (a, ~(k1 ^ k2), c); /* xor + not on __mmask32; counted by the kxnor scans.  */
+}
+
+__m512i
+foo1 (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask16 k1 = _mm512_cmp_epi32_mask (a, b, 1);
+  __mmask16 k2 = _mm512_cmp_epi32_mask (c, d, 2);
+  return _mm512_mask_mov_epi32 (a, ~(k1 ^ k2), c); /* xor + not on __mmask16; counted by the kxnor scans.  */
+}
+
+__m512i
+foo2 (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask64 k1 = _mm512_cmp_epi8_mask (a, b, 1);
+  __mmask64 k2 = _mm512_cmp_epi8_mask (c, d, 2);
+  return _mm512_mask_mov_epi8 (a, ~(k1 ^ k2), c); /* xor + not on __mmask64; presumably the case not counted on ia32 -- confirm.  */
+}
+
+__m512i
+foo3 (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  __mmask8 k1 = _mm512_cmp_epi64_mask (a, b, 1);
+  __mmask8 k2 = _mm512_cmp_epi64_mask (c, d, 2);
+  return _mm512_mask_mov_epi64 (a, ~(k1 ^ k2), c); /* xor + not on __mmask8; counted by the kxnor scans.  */
+}