Optimize vlddqu to vmovdqu for TARGET_AVX
Commit Message
For Intel processors, vmovdqu runs as fast as vlddqu once TARGET_AVX
is enabled, so UNSPEC_LDDQU can be removed to enable more
optimizations.
Can someone confirm this with AMD folks?
If AMD doesn't want this optimization, I'll put it under a
micro-architecture tuning flag.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
If AMD also likes this optimization, OK for trunk?
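A minimal sketch of the kind of optimization this enables (it mirrors
the new test below; the instruction sequences named in the comments
are the expected shapes under -mavx2 -O2, not captured compiler
output):

  #include <immintrin.h>

  __m256i bcast (void *p)
  {
    __m128i x = _mm_lddqu_si128 ((__m128i *) p);
    /* With UNSPEC_LDDQU wrapping the load, combine cannot fold it
       into the broadcast, so a vlddqu plus a vinserti128 is
       expected.  With a plain move the load folds, and a single
       vbroadcasti128 should be emitted instead.  */
    return _mm256_broadcastsi128_si256 (x);
  }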
gcc/ChangeLog:
* config/i386/sse.md (<sse3>_lddqu<avxsizesuffix>): Change to
define_expand, expand as simple move when TARGET_AVX
&& (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
The original define_insn is renamed to
..
(*<sse3>_lddqu<avxsizesuffix>): .. this.
gcc/testsuite/ChangeLog:
* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
gcc/config/i386/sse.md | 15 ++++++++++++++-
.../gcc.target/i386/vlddqu_vinserti128.c | 11 +++++++++++
2 files changed, 25 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
Comments
On Thu, Jul 20, 2023 at 9:35 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For Intel processors, vmovdqu runs as fast as vlddqu once TARGET_AVX
> is enabled, so UNSPEC_LDDQU can be removed to enable more
> optimizations.
> Can someone confirm this with AMD folks?
> If AMD doesn't want this optimization, I'll put it under a
> micro-architecture tuning flag.
The instruction is reachable only as __builtin_ia32_lddqu* (aka
_mm_lddqu_si*), so it was chosen by the programmer for a reason. I
think that in this case, the compiler should not be too smart and
change the instruction behind the programmer's back. The caveats are
also explained at length in the ISA manual.
Uros.
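For context, the ISA manual's caveat is, roughly, that LDDQU may read
up to 32 bytes (two aligned 16-byte loads) around the requested
address, so it should not be used on uncached or side-effect memory
such as memory-mapped I/O, whereas VMOVDQU reads exactly 16 bytes.  A
hedged sketch (not from the patch) of a use where the intrinsic
choice is deliberate:

  #include <immintrin.h>

  /* The lddqu intrinsic is chosen on purpose here, e.g. to avoid
     cache-line-split load penalties on the older cores LDDQU was
     designed for; silently emitting vmovdqu would override that
     explicit choice.  */
  __m128i
  load16_unaligned (const void *p)
  {
    return _mm_lddqu_si128 ((const __m128i *) p);
  }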
On Thu, Jul 20, 2023 at 4:11 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jul 20, 2023 at 9:35 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > For Intel processors, vmovdqu runs as fast as vlddqu once
> > TARGET_AVX is enabled, so UNSPEC_LDDQU can be removed to enable
> > more optimizations.
> > Can someone confirm this with AMD folks?
> > If AMD doesn't want this optimization, I'll put it under a
> > micro-architecture tuning flag.
>
> The instruction is reachable only as __builtin_ia32_lddqu* (aka
> _mm_lddqu_si*), so it was chosen by the programmer for a reason. I
> think that in this case, the compiler should not be too smart and
> change the instruction behind the programmer's back. The caveats are
> also explained at length in the ISA manual.
fine.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..d571a78f4c4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1835,7 +1835,20 @@ (define_peephole2
   [(set (match_dup 4) (match_dup 1))]
   "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
 
-(define_insn "<sse3>_lddqu<avxsizesuffix>"
+(define_expand "<sse3>_lddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand")
+	(unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
+		    UNSPEC_LDDQU))]
+  "TARGET_SSE3"
+{
+  if (TARGET_AVX && (<MODE_SIZE> == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
+    {
+      emit_move_insn (operands[0], operands[1]);
+      DONE;
+    }
+})
+
+(define_insn "*<sse3>_lddqu<avxsizesuffix>"
   [(set (match_operand:VI1 0 "register_operand" "=x")
 	(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
 		    UNSPEC_LDDQU))]
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 00000000000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include <immintrin.h>
+__m256i foo(void *data) {
+  __m128i X1 = _mm_lddqu_si128((__m128i*)data);
+  __m256i V1 = _mm256_broadcastsi128_si256 (X1);
+  return V1;
+}
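For reference, the test's expectation can be reproduced by hand with
a patched compiler, e.g.:

  gcc -O2 -mavx2 -S vlddqu_vinserti128.c

The resulting assembly should contain exactly one vbroadcasti128 and
no vlddqu on an xmm operand, matching the two scan-assembler
directives.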