On Thu, Jul 14, 2022 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jul 14, 2022 at 3:22 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > And split it to GPR-version instruction after reload.
> > >
> > > > ?r was introduced under the assumption that we want vector values
> > > > mostly in vector registers. Currently there are no instructions with
> > > > memory or immediate operand, so that made sense at the time. Let's
> > > > keep ?r until logic instructions with mem/imm operands are introduced.
> > > > So, for the patch that adds 64-bit vector logic in GPR, I would advise
> > > > to first introduce only register operands. mem/imm operands should be
> > > Update patch to add ?r to 64-bit bit_op patterns.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > No big impact on SPEC2017 (mostly the same binaries).
> >
> > The problem with your approach is with the combine pass, where combine
> > first tries to recognize the combined instruction without clobber,
> > before re-recognizing instruction with added clobber. So, if a forward
> > propagation happens, the combine will *always* choose the insn variant
> > without GPR.
> Thank you for the explanation, I really did not know this point.
> >
> > So, the solution with VI_16_32 is to always expand with a clobbered
> > version that is split to either SImode or V16QImode. With 64-bit
> > instructions, we have two additional complications. First, we have a
> > native MMX instruction, and we have to split to it after reload, and
> > second, we have a builtin that expects vector insn.
> >
> > To solve the first issue, we should change the mode of
> > "*mmx<code><mode>" to V1DImode and split your new _gpr version with
> > clobber to it for !GENERAL_REG_P operands.
> >
> > The second issue could be solved by emitting V1DImode instructions
> > directly from the expander. Please note there are several expanders
> > that expect non-clobbered logic insn in certain mode to be available,
> > so the situation can become quite annoying...
> Yes. It looks like it would add a lot of code complexity, I'll hold
> the patch for now.
I did some experimenting in the past with the idea of adding GPR
instructions to 64-bit vectors. While there were some opportunities
with 32- and 16-bit operations, mostly due to the fact that these
arguments are passed via integer registers, 64-bit cases never
triggered, because 64-bit vectors are passed via XMM registers. Also,
when mem/imm alternatives were added, many inter-regunit moves were
generated for everything but the most simple testcases involving logic
operations, also considering the limited range of 64-bit immediates.
IMO, the only case it is worth adding is a direct immediate store to
memory, which HJ recently added.
Uros.
@@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
(V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
(V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
+(define_mode_iterator VI_16_32_64
+ [V2QI V4QI V2HI
+ (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+ (V2SI "TARGET_64BIT")])
+
;; V2S* modes
(define_mode_iterator V2FI [V2SF V2SI])
@@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
[(V8QI "b") (V4QI "b") (V2QI "b")
(V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
+;; Mapping of each vector mode to the integer mode of the same size.
+(define_mode_attr mmxinsnmode
+ [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+ (V4HI "DI") (V2HI "SI")
+ (V2SI "DI")
+ (V4HF "DI") (V2HF "SI")
+ (V2SF "DI")])
+
(define_mode_attr mmxdoublemode
[(V8QI "V8HI") (V4HI "V4SI")])
@@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
<MODE>mode);
operands[1] = GEN_INT (val);
- machine_mode mode;
- switch (GET_MODE_SIZE (<MODE>mode))
- {
- case 2:
- mode = HImode;
- break;
- case 4:
- mode = SImode;
- break;
- case 8:
- mode = DImode;
- break;
- default:
- gcc_unreachable ();
- }
- operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+ operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
})
;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2878,6 +2876,31 @@ (define_insn "mmx_andnot<mode>3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
+(define_insn "mmx_andnot<mode>3_gpr"
+ [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v")
+ (and:MMXMODEI
+ (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "r,0,0,x,v"))
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v")))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)"
+ "#"
+ [(set_attr "isa" "bmi,*,sse2_noavx,avx,avx512vl")
+ (set_attr "mmx_isa" "*,native,*,*,*")
+ (set_attr "type" "alu,mmxadd,sselog,sselog,sselog")
+ (set_attr "mode" "DI,DI,TI,TI,TI")])
+
+(define_split
+ [(set (match_operand:MMXMODEI 0 "register_operand")
+ (and:MMXMODEI
+ (not:MMXMODEI (match_operand:MMXMODEI 1 "register_mmxmem_operand"))
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "reload_completed
+ && (TARGET_MMX || TARGET_MMX_WITH_SSE)
+ && !GENERAL_REGNO_P (REGNO (operands[0]))"
+ [(set (match_dup 0)
+ (and:<MODE> (not:<MODE> (match_dup 1)) (match_dup 2)))])
+
(define_insn "*andnot<mode>3"
[(set (match_operand:VI_16_32 0 "register_operand" "=?&r,?r,x,x,v")
(and:VI_16_32
@@ -2892,20 +2915,20 @@ (define_insn "*andnot<mode>3"
(set_attr "mode" "SI,SI,TI,TI,TI")])
(define_split
- [(set (match_operand:VI_16_32 0 "general_reg_operand")
- (and:VI_16_32
- (not:VI_16_32 (match_operand:VI_16_32 1 "general_reg_operand"))
- (match_operand:VI_16_32 2 "general_reg_operand")))
+ [(set (match_operand:VI_16_32_64 0 "general_reg_operand")
+ (and:VI_16_32_64
+ (not:VI_16_32_64 (match_operand:VI_16_32_64 1 "general_reg_operand"))
+ (match_operand:VI_16_32_64 2 "general_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"TARGET_BMI && reload_completed"
[(parallel
[(set (match_dup 0)
- (and:SI (not:SI (match_dup 1)) (match_dup 2)))
+ (and:<mmxinsnmode> (not:<mmxinsnmode> (match_dup 1)) (match_dup 2)))
(clobber (reg:CC FLAGS_REG))])]
{
- operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
- operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
- operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+ operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+ operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+ operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
})
(define_split
@@ -2948,14 +2971,28 @@ (define_expand "mmx_<code><mode>3"
(match_operand:MMXMODEI 1 "register_mmxmem_operand")
(match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
"TARGET_MMX || TARGET_MMX_WITH_SSE"
- "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+ ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+ if (TARGET_64BIT)
+ {
+ ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+ DONE;
+ }
+})
(define_expand "<code><mode>3"
[(set (match_operand:MMXMODEI 0 "register_operand")
(any_logic:MMXMODEI
(match_operand:MMXMODEI 1 "register_operand")
(match_operand:MMXMODEI 2 "register_operand")))]
- "TARGET_MMX_WITH_SSE")
+ "TARGET_MMX_WITH_SSE"
+{
+ if (TARGET_64BIT)
+ {
+ ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+ DONE;
+ }
+})
(define_insn "*mmx_<code><mode>3"
[(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
@@ -2974,6 +3011,32 @@ (define_insn "*mmx_<code><mode>3"
(set_attr "type" "mmxadd,sselog,sselog,sselog")
(set_attr "mode" "DI,TI,TI,TI")])
+(define_insn "*mmx_<code><mode>3_gpr"
+ [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v")
+ (any_logic:MMXMODEI
+ (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,0,x,v")
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v")))
+ (clobber (reg:CC FLAGS_REG))]
+ "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)
+ && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+ "#"
+ [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+ (set_attr "mmx_isa" "*,native,*,*,*")
+ (set_attr "type" "alu,mmxadd,sselog,sselog,sselog")
+ (set_attr "mode" "DI,DI,TI,TI,TI")])
+
+(define_split
+ [(set (match_operand:MMXMODEI 0 "register_operand")
+ (any_logic:MMXMODEI
+ (match_operand:MMXMODEI 1 "register_mmxmem_operand")
+ (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
+ (clobber (reg:CC FLAGS_REG))]
+ "reload_completed && (TARGET_MMX || TARGET_MMX_WITH_SSE)
+ && !GENERAL_REGNO_P (REGNO (operands[0]))"
+ [(set (match_dup 0)
+ (any_logic:<MODE> (match_dup 1)
+ (match_dup 2)))])
+
(define_insn "<code><mode>3"
[(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
(any_logic:VI_16_32
@@ -2987,20 +3050,20 @@ (define_insn "<code><mode>3"
(set_attr "mode" "SI,TI,TI,TI")])
(define_split
- [(set (match_operand:VI_16_32 0 "general_reg_operand")
- (any_logic:VI_16_32
- (match_operand:VI_16_32 1 "general_reg_operand")
- (match_operand:VI_16_32 2 "general_reg_operand")))
+ [(set (match_operand:VI_16_32_64 0 "general_reg_operand")
+ (any_logic:VI_16_32_64
+ (match_operand:VI_16_32_64 1 "general_reg_operand")
+ (match_operand:VI_16_32_64 2 "general_reg_operand")))
(clobber (reg:CC FLAGS_REG))]
"reload_completed"
[(parallel
[(set (match_dup 0)
- (any_logic:SI (match_dup 1) (match_dup 2)))
+ (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
(clobber (reg:CC FLAGS_REG))])]
{
- operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
- operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
- operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+ operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+ operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+ operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
})
(define_split
new file mode 100644
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */
+
+void
+foo (char* a, char* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+ a[2] &= b[2];
+ a[3] &= b[3];
+}
+
+void
+foo1 (char* a, char* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+}
+
+void
+foo2 (char* a, char* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+ a[2] &= b[2];
+ a[3] &= b[3];
+ a[4] &= b[4];
+ a[5] &= b[5];
+ a[6] &= b[6];
+ a[7] &= b[7];
+}
+
+void
+foo3 (char* a, char* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+ a[2] &= 3;
+ a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+}
+
+void
+foo5 (char* a, char* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+ a[2] &= 2;
+ a[3] &= 3;
+ a[4] &= 4;
+ a[5] &= 5;
+ a[6] &= 6;
+ a[7] &= 7;
+}
new file mode 100644
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */
+
+void
+foo (short* a, short* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+ a[2] &= b[2];
+ a[3] &= b[3];
+}
+
+void
+foo1 (short* a, short* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+}
+
+void
+foo3 (short* a, short* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+ a[2] &= 3;
+ a[3] &= 3;
+}
+
+void
+foo4 (short* a, short* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+}
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail { ! ia32 } } } } */
+
+void
+foo1 (int* a, int* __restrict b)
+{
+ a[0] &= b[0];
+ a[1] &= b[1];
+}
+
+void
+foo4 (int* a, int* __restrict b)
+{
+ a[0] &= 1;
+ a[1] &= 2;
+}