[x86] Improved TImode (128-bit) integer constants on x86_64.
Checks
Commit Message
This patch fixes two issues with the handling of 128-bit TImode integer
constants in the x86_64 backend. The main issue is that GCC always
tries to load 128-bit integer constants via broadcasts to vector SSE
registers, even if the result is required in general registers. This
is seen in the two closely related functions below:
__int128 m;
#define CONST (((__int128)0x0123456789abcdefULL<<64) |
0x0123456789abcdefULL)
void foo() { m &= CONST; }
void bar() { m = CONST; }
When compiled with -O2 -mavx, we currently generate:
foo: movabsq $81985529216486895, %rax
vmovq %rax, %xmm0
vpunpcklqdq %xmm0, %xmm0, %xmm0
vmovq %xmm0, %rax
vpextrq $1, %xmm0, %rdx
andq %rax, m(%rip)
andq %rdx, m+8(%rip)
ret
bar: movabsq $81985529216486895, %rax
vmovq %rax, %xmm1
vpunpcklqdq %xmm1, %xmm1, %xmm0
vpextrq $1, %xmm0, %rdx
vmovq %xmm0, m(%rip)
movq %rdx, m+8(%rip)
ret
With this patch we defer the decision to use vector broadcast for
TImode until we know we need actually want a SSE register result,
by moving the call to ix86_convert_const_wide_int_to_broadcast from
the RTL expansion pass, to the scalar-to-vector (STV) pass. With
this change (and a minor tweak described below) we now generate:
foo: movabsq $81985529216486895, %rax
andq %rax, m(%rip)
andq %rax, m+8(%rip)
ret
bar: movabsq $81985529216486895, %rax
vmovq %rax, %xmm0
vpunpcklqdq %xmm0, %xmm0, %xmm0
vmovdqa %xmm0, m(%rip)
ret
showing that we now correctly use vector mode broadcasts (only)
where appropriate.
The one minor tweak mentioned above is to enable the un-cprop hi/lo
optimization, that I originally contributed back in September 2004
https://gcc.gnu.org/pipermail/gcc-patches/2004-September/148756.html
even when not optimizing for size. Without this (and currently with
just -O2) the function foo above generates:
foo: movabsq $81985529216486895, %rax
movabsq $81985529216486895, %rdx
andq %rax, m(%rip)
andq %rdx, m+8(%rip)
ret
I'm not sure why (back in 2004) I thought that avoiding the implicit
"movq %rax, %rdx" instead of a second load was faster, perhaps avoiding
a dependency to allow better scheduling, but nowadays "movq %rax, %rdx"
is either eliminated by GCC's hardreg cprop pass, or special cased by
modern hardware, making the first foo preferrable, not only shorter but
also faster.
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
and with/without -march=cascadelake with no new failures.
Ok for mainline?
2023-12-18 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-expand.cc
(ix86_convert_const_wide_int_to_broadcast): Remove static.
(ix86_expand_move): Don't attempt to convert wide constants
to SSE using ix86_convert_const_wide_int_to_broadcast here.
(ix86_split_long_move): Always un-cprop multi-word constants.
* config/i386/i386-expand.h
(ix86_convert_const_wide_int_to_broadcast): Prototype here.
* config/i386/i386-features.cc: Include i386-expand.h.
(timode_scalar_chain::convert_insn): When converting TImode to
v1TImode, try ix86_convert_const_wide_int_to_broadcast.
gcc/testsuite/ChangeLog
* gcc.target/i386/movti-2.c: New test case.
* gcc.target/i386/movti-3.c: Likewise.
Thanks in advance,
Roger
--
@@ -289,7 +289,7 @@ ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
-static rtx
+rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
/* Don't use integer vector broadcast if we can't move from GPR to SSE
@@ -541,14 +541,6 @@ ix86_expand_move (machine_mode mode, rtx operands[])
return;
}
}
- else if (CONST_WIDE_INT_P (op1)
- && GET_MODE_SIZE (mode) >= 16)
- {
- rtx tmp = ix86_convert_const_wide_int_to_broadcast
- (GET_MODE (op0), op1);
- if (tmp != nullptr)
- op1 = tmp;
- }
}
}
@@ -6323,18 +6315,15 @@ ix86_split_long_move (rtx operands[])
}
}
- /* If optimizing for size, attempt to locally unCSE nonzero constants. */
- if (optimize_insn_for_size_p ())
- {
- for (j = 0; j < nparts - 1; j++)
- if (CONST_INT_P (operands[6 + j])
- && operands[6 + j] != const0_rtx
- && REG_P (operands[2 + j]))
- for (i = j; i < nparts - 1; i++)
- if (CONST_INT_P (operands[7 + i])
- && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
- operands[7 + i] = operands[2 + j];
- }
+ /* Attempt to locally unCSE nonzero constants. */
+ for (j = 0; j < nparts - 1; j++)
+ if (CONST_INT_P (operands[6 + j])
+ && operands[6 + j] != const0_rtx
+ && REG_P (operands[2 + j]))
+ for (i = j; i < nparts - 1; i++)
+ if (CONST_INT_P (operands[7 + i])
+ && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
+ operands[7 + i] = operands[2 + j];
for (i = 0; i < nparts; i++)
emit_move_insn (operands[2 + i], operands[6 + i]);
@@ -57,5 +57,6 @@ bool ix86_notrack_prefixed_insn_p (rtx_insn *);
machine_mode ix86_split_reduction (machine_mode mode);
void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0,
rtx op1, rtx *quot_p, rtx *rem_p);
+rtx ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op);
#endif /* GCC_I386_EXPAND_H */
@@ -90,6 +90,7 @@ along with GCC; see the file COPYING3. If not see
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
+#include "i386-expand.h"
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
"savms64",
@@ -1853,14 +1854,25 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
{
/* Since there are no instructions to store 128-bit constant,
temporary register usage is required. */
+ bool use_move;
start_sequence ();
- src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
- src = validize_mem (force_const_mem (V1TImode, src));
+ tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
+ if (tmp)
+ {
+ src = lowpart_subreg (V1TImode, tmp, TImode);
+ use_move = true;
+ }
+ else
+ {
+ src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
+ src = validize_mem (force_const_mem (V1TImode, src));
+ use_move = MEM_P (dst);
+ }
rtx_insn *seq = get_insns ();
end_sequence ();
if (seq)
emit_insn_before (seq, insn);
- if (MEM_P (dst))
+ if (use_move)
{
tmp = gen_reg_rtx (V1TImode);
emit_insn_before (gen_rtx_SET (tmp, src), insn);
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+__int128 m;
+
+void foo()
+{
+ m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler-times "movabsq" 1 } } */
+/* { dg-final { scan-assembler-not "vmovq" } } */
+/* { dg-final { scan-assembler-not "vpunpcklqdq" } } */
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+
+__int128 m;
+
+void bar()
+{
+ m = ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vpextrq" } } */