Hi Uros,
Many thanks and merry Christmas.
Here's the version as committed, implemented using your
preferred idiom with mode iterators for movss/movsd.
Thanks again.
2022-12-25 Roger Sayle <roger@nextmovesoftware.com>
Uroš Bizjak <ubizjak@gmail.com>
gcc/ChangeLog
* config/i386/i386-builtin.def (__builtin_ia32_movss): Update
CODE_FOR_sse_movss to CODE_FOR_sse_movss_v4sf.
(__builtin_ia32_movsd): Likewise, update CODE_FOR_sse2_movsd to
CODE_FOR_sse2_movsd_v2df.
* config/i386/i386-expand.cc (split_convert_uns_si_sse): Update
gen_sse_movss call to gen_sse_movss_v4sf, and gen_sse2_movsd call
to gen_sse2_movsd_v2df.
(expand_vec_perm_movs): Also allow V4SImode with TARGET_SSE and
V2DImode with TARGET_SSE2.
* config/i386/sse.md
(avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>): Update
gen_sse_movss call to gen_sse_movss_v4sf.
(avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>): Likewise.
(sse_movss_<mode>): Renamed from sse_movss using VI4F_128 mode
iterator to handle both V4SF and V4SI.
(sse2_movsd_<mode>): Likewise, renamed from sse2_movsd using
VI8F_128 mode iterator to handle both V2DF and V2DI.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse-movss-4.c: New test case.
* gcc.target/i386/sse2-movsd-3.c: New test case.
Roger
--
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 23 December 2022 17:18
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: GCC Patches <gcc-patches@gcc.gnu.org>
> Subject: Re: [x86 PATCH] Use movss/movsd to implement V4SI/V2DI VEC_PERM.
>
> On Fri, Dec 23, 2022 at 5:46 PM Roger Sayle <roger@nextmovesoftware.com>
> wrote:
> >
> >
> > This patch tweaks the x86 backend to use the movss and movsd
> > instructions to perform some vector permutations on integer vectors
> > (V4SI and V2DI) in the same way they are used for floating point vectors (V4SF
> and V2DF).
> >
> > As a motivating example, consider:
> >
> > typedef unsigned int v4si __attribute__((vector_size(16))); typedef
> > float v4sf __attribute__((vector_size(16))); v4si foo(v4si x,v4si y) {
> > return (v4si){y[0],x[1],x[2],x[3]}; } v4sf bar(v4sf x,v4sf y) { return
> > (v4sf){y[0],x[1],x[2],x[3]}; }
> >
> > which is currently compiled with -O2 to:
> >
> > foo: movdqa %xmm0, %xmm2
> > shufps $80, %xmm0, %xmm1
> > movdqa %xmm1, %xmm0
> > shufps $232, %xmm2, %xmm0
> > ret
> >
> > bar: movss %xmm1, %xmm0
> > ret
> >
> > with this patch both functions compile to the same form.
> > Likewise for the V2DI case:
> >
> > typedef unsigned long v2di __attribute__((vector_size(16))); typedef
> > double v2df __attribute__((vector_size(16)));
> >
> > v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; } v2df bar(v2df
> > x,v2df y) { return (v2df){y[0],x[1]}; }
> >
> > which is currently generates:
> >
> > foo: shufpd $2, %xmm0, %xmm1
> > movdqa %xmm1, %xmm0
> > ret
> >
> > bar: movsd %xmm1, %xmm0
> > ret
> >
> > There are two possible approaches to adding integer vector forms of
> > the sse_movss and sse2_movsd instructions. One is to use a mode
> > iterator
> > (VI4F_128 or VI8F_128) on the existing define_insn patterns, but this
> > requires renaming the patterns to sse_movss_<mode> which then requires
> > changes to i386-builtins.def and through-out the backend to reflect
> > the new naming of gen_sse_movss_v4sf. The alternate approach (taken
> > here) is to simply clone and specialize the existing patterns. Uros,
> > if you'd prefer the first approach, I'm happy to make/test/commit those
> changes.
>
> I would really prefer the variant with VI4F_128/VI8F_128, these two iterators
> were introduced specifically for this case (see e.g.
> sse_shufps_<mode> and sse2_shufpd_<mode>. The internal name of the
> pattern is fairly irrelevant and a trivial search and replace operation can replace
> the grand total of 6 occurrences ...)
>
> Also, changing sse2_movsd to use VI8F_128 mode iterator would enable more
> alternatives besides movsd, so we give combine pass some more opportunities
> with memory operands.
>
> So, the patch with those two iterators is pre-approved.
>
> Uros.
>
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32},
> > with no new failures. Ok for mainline?
> >
> > 2022-12-23 Roger Sayle <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> > * config/i386/i386-expand.cc (expand_vec_perm_movs): Also allow
> > V4SImode with TARGET_SSE and V2DImode with TARGET_SSE2.
> > * config/i386/sse.md (sse_movss_v4si): New define_insn, a V4SI
> > specialization of sse_movss.
> > (sse2_movsd_v2di): Likewise, a V2DI specialization of sse2_movsd.
> >
> > gcc/testsuite/ChangeLog
> > * gcc.target/i386/sse-movss-4.c: New test case.
> > * gcc.target/i386/sse2-movsd-3.c: New test case.
> >
> >
> > Thanks in advance,
> > Roger
> > --
> >
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index d85b175..0d1fc34 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -679,7 +679,7 @@ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss_v4sf, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
@@ -781,7 +781,7 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86
BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
-BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd_v2df, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a45640f..e144b5e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1774,9 +1774,9 @@ ix86_split_convert_uns_si_sse (rtx operands[])
input = gen_rtx_REG (vecmode, REGNO (input));
emit_move_insn (value, CONST0_RTX (vecmode));
if (vecmode == V4SFmode)
- emit_insn (gen_sse_movss (value, value, input));
+ emit_insn (gen_sse_movss_v4sf (value, value, input));
else
- emit_insn (gen_sse2_movsd (value, value, input));
+ emit_insn (gen_sse2_movsd_v2df (value, value, input));
}
emit_move_insn (large, two31);
@@ -18903,8 +18903,10 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
return false;
if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_SSE && vmode == V4SImode)
&& !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
- && !(TARGET_SSE2 && vmode == V2DFmode))
+ && !(TARGET_SSE2 && vmode == V2DFmode)
+ && !(TARGET_SSE2 && vmode == V2DImode))
return false;
/* Only the first element is changed. */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index de632b2..d50627a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6825,7 +6825,7 @@
if (!MEM_P (operands[3]))
operands[3] = force_reg (V8HFmode, operands[3]);
op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
- emit_insn (gen_sse_movss (dest, op1, op0));
+ emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
DONE;
})
@@ -6855,7 +6855,7 @@
if (!MEM_P (operands[3]))
operands[3] = force_reg (V8HFmode, operands[3]);
op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
- emit_insn (gen_sse_movss (dest, op1, op0));
+ emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
DONE;
})
@@ -10498,11 +10498,11 @@
(set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
-(define_insn "sse_movss"
- [(set (match_operand:V4SF 0 "register_operand" "=x,v")
- (vec_merge:V4SF
- (match_operand:V4SF 2 "register_operand" " x,v")
- (match_operand:V4SF 1 "register_operand" " 0,v")
+(define_insn "sse_movss_<mode>"
+ [(set (match_operand:VI4F_128 0 "register_operand" "=x,v")
+ (vec_merge:VI4F_128
+ (match_operand:VI4F_128 2 "register_operand" " x,v")
+ (match_operand:VI4F_128 1 "register_operand" " 0,v")
(const_int 1)))]
"TARGET_SSE"
"@
@@ -13481,11 +13481,11 @@
[(set (match_dup 0) (match_dup 1))]
"operands[0] = adjust_address (operands[0], DFmode, 0);")
-(define_insn "sse2_movsd"
- [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,v,x,v,m,x,x,v,o")
- (vec_merge:V2DF
- (match_operand:V2DF 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
- (match_operand:V2DF 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
+(define_insn "sse2_movsd_<mode>"
+ [(set (match_operand:VI8F_128 0 "nonimmediate_operand" "=x,v,x,v,m,x,x,v,o")
+ (vec_merge:VI8F_128
+ (match_operand:VI8F_128 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
+ (match_operand:VI8F_128 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
(const_int 1)))]
"TARGET_SSE2"
"@
diff --git a/gcc/testsuite/gcc.target/i386/sse-movss-4.c b/gcc/testsuite/gcc.target/i386/sse-movss-4.c
new file mode 100644
index 0000000..ec3019c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-movss-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
+v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movss\t" 2 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "vpblendw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c
new file mode 100644
index 0000000..fadbe2b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned long long v2di __attribute__((vector_size(16)));
+typedef double v2df __attribute__((vector_size(16)));
+
+v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
+v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movsd\t" 2 } } */
+/* { dg-final { scan-assembler-not "v?shufpd" } } */
+/* { dg-final { scan-assembler-not "movdqa" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "v?punpckldq" } } */
+/* { dg-final { scan-assembler-not "v?movq" } } */
@@ -18903,8 +18903,10 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
return false;
if (!(TARGET_SSE && vmode == V4SFmode)
+ && !(TARGET_SSE && vmode == V4SImode)
&& !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
- && !(TARGET_SSE2 && vmode == V2DFmode))
+ && !(TARGET_SSE2 && vmode == V2DFmode)
+ && !(TARGET_SSE2 && vmode == V2DImode))
return false;
/* Only the first element is changed. */
@@ -10513,6 +10513,21 @@
(set_attr "prefix" "orig,maybe_evex")
(set_attr "mode" "SF")])
+(define_insn "sse_movss_v4si"
+ [(set (match_operand:V4SI 0 "register_operand" "=x,v")
+ (vec_merge:V4SI
+ (match_operand:V4SI 2 "register_operand" " x,v")
+ (match_operand:V4SI 1 "register_operand" " 0,v")
+ (const_int 1)))]
+ "TARGET_SSE"
+ "@
+ movss\t{%2, %0|%0, %2}
+ vmovss\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix" "orig,maybe_evex")
+ (set_attr "mode" "SF")])
+
(define_insn "avx2_vec_dup<mode>"
[(set (match_operand:VF1_128_256 0 "register_operand" "=v")
(vec_duplicate:VF1_128_256
@@ -13523,6 +13538,21 @@
(const_string "orig")))
(set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")])
+(define_insn "sse2_movsd_v2di"
+ [(set (match_operand:V2DI 0 "register_operand" "=x,v")
+ (vec_merge:V2DI
+ (match_operand:V2DI 2 "register_operand" " x,v")
+ (match_operand:V2DI 1 "register_operand" " 0,v")
+ (const_int 1)))]
+ "TARGET_SSE2"
+ "@
+ movsd\t{%2, %0|%0, %2}
+ vmovsd\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "isa" "noavx,avx")
+ (set_attr "type" "ssemov")
+ (set_attr "prefix" "orig,maybe_evex")
+ (set_attr "mode" "DF")])
+
(define_insn "vec_dupv2df<mask_name>"
[(set (match_operand:V2DF 0 "register_operand" "=x,x,v")
(vec_duplicate:V2DF
new file mode 100644
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
+v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movss\t" 2 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "vpblendw" } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned long v2di __attribute__((vector_size(16)));
+typedef double v2df __attribute__((vector_size(16)));
+
+v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
+v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movsd\t" 2 } } */
+/* { dg-final { scan-assembler-not "v?shufpd" } } */
+/* { dg-final { scan-assembler-not "movdqa" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "v?punpckldq" } } */
+/* { dg-final { scan-assembler-not "v?movq" } } */