Optimize vlddqu to vmovdqu for TARGET_AVX

Message ID 20230720073516.2171485-1-hongtao.liu@intel.com
State Accepted
Headers
Series Optimize vlddqu to vmovdqu for TARGET_AVX |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

liuhongt July 20, 2023, 7:35 a.m. UTC
  On Intel processors that support AVX (TARGET_AVX), vmovdqu is as fast
as vlddqu, so UNSPEC_LDDQU can be removed to enable more optimizations.
Can someone confirm this with AMD folks?
If AMD doesn't like this optimization, I'll put it under
micro-architecture tuning.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
If AMD also likes this optimization, OK for trunk?

gcc/ChangeLog:

	* config/i386/sse.md (<sse3>_lddqu<avxsizesuffix>): Change to
	define_expand, expand as simple move when TARGET_AVX
	&& (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
	The original define_insn is renamed to
	..
	(*<sse3>_lddqu<avxsizesuffix>): .. this.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
 gcc/config/i386/sse.md                            | 15 ++++++++++++++-
 .../gcc.target/i386/vlddqu_vinserti128.c          | 11 +++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
  

Comments

Uros Bizjak July 20, 2023, 8:10 a.m. UTC | #1
On Thu, Jul 20, 2023 at 9:35 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For Intel processors, after TARGET_AVX, vmovdqu is optimized as fast
> as vlddqu, UNSPEC_LDDQU can be removed to enable more optimizations.
> Can someone confirm this with AMD folks?
> If AMD doesn't like such optimization, I'll put my optimization under
> micro-architecture tuning.

The instruction is reachable only as __builtin_ia32_lddqu* (aka
_mm_lddqu_si*), so it was chosen by the programmer for a reason. I
think that in this case, the compiler should not be too smart and
change the instruction behind the programmer's back. The caveats are
also explained at length in the ISA manual.

Uros.

> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> If AMD also like such optimization, Ok for trunk?
>
> gcc/ChangeLog:
>
>         * config/i386/sse.md (<sse3>_lddqu<avxsizesuffix>): Change to
>         define_expand, expand as simple move when TARGET_AVX
>         && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
>         The original define_insn is renamed to
>         ..
>         (<sse3>_lddqu<avxsizesuffix>): .. this.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/vlddqu_vinserti128.c: New test.
> ---
>  gcc/config/i386/sse.md                            | 15 ++++++++++++++-
>  .../gcc.target/i386/vlddqu_vinserti128.c          | 11 +++++++++++
>  2 files changed, 25 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 2d81347c7b6..d571a78f4c4 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -1835,7 +1835,20 @@ (define_peephole2
>    [(set (match_dup 4) (match_dup 1))]
>    "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
>
> -(define_insn "<sse3>_lddqu<avxsizesuffix>"
> +(define_expand "<sse3>_lddqu<avxsizesuffix>"
> +  [(set (match_operand:VI1 0 "register_operand")
> +       (unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
> +                   UNSPEC_LDDQU))]
> +  "TARGET_SSE3"
> +{
> +  if (TARGET_AVX && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
> +    {
> +      emit_move_insn (operands[0], operands[1]);
> +      DONE;
> +    }
> +})
> +
> +(define_insn "*<sse3>_lddqu<avxsizesuffix>"
>    [(set (match_operand:VI1 0 "register_operand" "=x")
>         (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
>                     UNSPEC_LDDQU))]
> diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
> new file mode 100644
> index 00000000000..29699a5fa7f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx2 -O2" } */
> +/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
> +/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
> +
> +#include <immintrin.h>
> +__m256i foo(void *data) {
> +    __m128i X1 = _mm_lddqu_si128((__m128i*)data);
> +    __m256i V1 = _mm256_broadcastsi128_si256 (X1);
> +    return V1;
> +}
> --
> 2.39.1.388.g2fc9e9ca3c
>
  
Hongtao Liu July 20, 2023, 11:50 p.m. UTC | #2
On Thu, Jul 20, 2023 at 4:11 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jul 20, 2023 at 9:35 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > For Intel processors, after TARGET_AVX, vmovdqu is optimized as fast
> > as vlddqu, UNSPEC_LDDQU can be removed to enable more optimizations.
> > Can someone confirm this with AMD folks?
> > If AMD doesn't like such optimization, I'll put my optimization under
> > micro-architecture tuning.
>
> The instruction is reachable only as __builtin_ia32_lddqu* (aka
> _mm_lddqu_si*), so it was chosen by the programmer for a reason. I
> think that in this case, the compiler should not be too smart and
> change the instruction behind the programmer's back. The caveats are
> also explained at length in the ISA manual.
fine.
>
> Uros.
>
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > If AMD also like such optimization, Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         * config/i386/sse.md (<sse3>_lddqu<avxsizesuffix>): Change to
> >         define_expand, expand as simple move when TARGET_AVX
> >         && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
> >         The original define_insn is renamed to
> >         ..
> >         (<sse3>_lddqu<avxsizesuffix>): .. this.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/vlddqu_vinserti128.c: New test.
> > ---
> >  gcc/config/i386/sse.md                            | 15 ++++++++++++++-
> >  .../gcc.target/i386/vlddqu_vinserti128.c          | 11 +++++++++++
> >  2 files changed, 25 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 2d81347c7b6..d571a78f4c4 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -1835,7 +1835,20 @@ (define_peephole2
> >    [(set (match_dup 4) (match_dup 1))]
> >    "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
> >
> > -(define_insn "<sse3>_lddqu<avxsizesuffix>"
> > +(define_expand "<sse3>_lddqu<avxsizesuffix>"
> > +  [(set (match_operand:VI1 0 "register_operand")
> > +       (unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
> > +                   UNSPEC_LDDQU))]
> > +  "TARGET_SSE3"
> > +{
> > +  if (TARGET_AVX && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
> > +    {
> > +      emit_move_insn (operands[0], operands[1]);
> > +      DONE;
> > +    }
> > +})
> > +
> > +(define_insn "*<sse3>_lddqu<avxsizesuffix>"
> >    [(set (match_operand:VI1 0 "register_operand" "=x")
> >         (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
> >                     UNSPEC_LDDQU))]
> > diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
> > new file mode 100644
> > index 00000000000..29699a5fa7f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
> > @@ -0,0 +1,11 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx2 -O2" } */
> > +/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
> > +/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
> > +
> > +#include <immintrin.h>
> > +__m256i foo(void *data) {
> > +    __m128i X1 = _mm_lddqu_si128((__m128i*)data);
> > +    __m256i V1 = _mm256_broadcastsi128_si256 (X1);
> > +    return V1;
> > +}
> > --
> > 2.39.1.388.g2fc9e9ca3c
> >
  

Patch

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..d571a78f4c4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1835,7 +1835,20 @@  (define_peephole2
   [(set (match_dup 4) (match_dup 1))]
   "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
 
-(define_insn "<sse3>_lddqu<avxsizesuffix>"
+(define_expand "<sse3>_lddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand")
+	(unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
+		    UNSPEC_LDDQU))]
+  "TARGET_SSE3"
+{
+  if (TARGET_AVX && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "*<sse3>_lddqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "register_operand" "=x")
 	(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
 		    UNSPEC_LDDQU))]
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 00000000000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include <immintrin.h>
+__m256i foo(void *data) {
+    __m128i X1 = _mm_lddqu_si128((__m128i*)data);
+    __m256i V1 = _mm256_broadcastsi128_si256 (X1);
+    return V1;
+}