i386: Avoid paradoxical subreg dests in vector zero_extend
Checks
Commit Message
For the V2HI -> V2SI zero extension in:
typedef unsigned short v2hi __attribute__((vector_size(4)));
typedef unsigned int v2si __attribute__((vector_size(8)));
v2si f (v2hi x) { return (v2si) {x[0], x[1]}; }
ix86_expand_sse_extend would generate:
(set (reg:V2HI 102)
(const_vector:V2HI [(const_int 0 [0])
(const_int 0 [0])]))
(set (subreg:V8HI (reg:V2HI 101) 0)
(vec_select:V8HI
(vec_concat:V16HI (subreg:V8HI (reg/v:V2HI 99 [ x ]) 0)
(subreg:V8HI (reg:V2HI 102) 0))
(parallel [(const_int 0 [0])
(const_int 8 [0x8])
(const_int 1 [0x1])
(const_int 9 [0x9])
(const_int 2 [0x2])
(const_int 10 [0xa])
(const_int 3 [0x3])
(const_int 11 [0xb])])))
(set (reg:V2SI 100)
(subreg:V2SI (reg:V2HI 101) 0))
(expr_list:REG_EQUAL (zero_extend:V2SI (reg/v:V2HI 99 [ x ])))
But using (subreg:V2SI (reg:V2HI 101) 0) as the destination of
the vec_select means that only the low 4 bytes of the destination
are stored. Only the lower half of reg 100 is well-defined.
Things tend to happen to work if the register allocator ties reg 101
to reg 100. But it caused problems with the upcoming late-combine pass
because we propagated the set of reg 100 into its uses.
Tested on x86_64-linux-gnu. OK to install?
Richard
gcc/
* config/i386/i386-expand.cc (ix86_split_mmx_punpck): Allow the
destination to be wider than the sources. Take the mode from the
first source.
(ix86_expand_sse_extend): Pass the destination directly to
ix86_split_mmx_punpck, rather than using a fresh register that
is half the size.
---
gcc/config/i386/i386-expand.cc | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
Comments
On Tue, Oct 24, 2023 at 12:08 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> For the V2HI -> V2SI zero extension in:
>
> typedef unsigned short v2hi __attribute__((vector_size(4)));
> typedef unsigned int v2si __attribute__((vector_size(8)));
> v2si f (v2hi x) { return (v2si) {x[0], x[1]}; }
>
> ix86_expand_sse_extend would generate:
>
> (set (reg:V2HI 102)
> (const_vector:V2HI [(const_int 0 [0])
> (const_int 0 [0])]))
> (set (subreg:V8HI (reg:V2HI 101) 0)
> (vec_select:V8HI
> (vec_concat:V16HI (subreg:V8HI (reg/v:V2HI 99 [ x ]) 0)
> (subreg:V8HI (reg:V2HI 102) 0))
> (parallel [(const_int 0 [0])
> (const_int 8 [0x8])
> (const_int 1 [0x1])
> (const_int 9 [0x9])
> (const_int 2 [0x2])
> (const_int 10 [0xa])
> (const_int 3 [0x3])
> (const_int 11 [0xb])])))
> (set (reg:V2SI 100)
> (subreg:V2SI (reg:V2HI 101) 0))
> (expr_list:REG_EQUAL (zero_extend:V2SI (reg/v:V2HI 99 [ x ])))
>
> But using (subreg:V2SI (reg:V2HI 101) 0) as the destination of
> the vec_select means that only the low 4 bytes of the destination
> are stored. Only the lower half of reg 100 is well-defined.
>
> Things tend to happen to work if the register allocator ties reg 101
> to reg 100. But it caused problems with the upcoming late-combine pass
> because we propagated the set of reg 100 into its uses.
>
> Tested on x86_64-linux-gnu. OK to install?
>
> Richard
>
>
> gcc/
> * config/i386/i386-expand.cc (ix86_split_mmx_punpck): Allow the
> destination to be wider than the sources. Take the mode from the
> first source.
> (ix86_expand_sse_extend): Pass the destination directly to
> ix86_split_mmx_punpck, rather than using a fresh register that
> is half the size.
OK.
Thanks,
Uros.
> ---
> gcc/config/i386/i386-expand.cc | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..2361ff77af3 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1110,7 +1110,9 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
> ix86_move_vector_high_sse_to_mmx (op0);
> }
>
> -/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
> +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
> + for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
> + OPERANDS[0]. */
>
> void
> ix86_split_mmx_punpck (rtx operands[], bool high_p)
> @@ -1118,7 +1120,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
> rtx op0 = operands[0];
> rtx op1 = operands[1];
> rtx op2 = operands[2];
> - machine_mode mode = GET_MODE (op0);
> + machine_mode mode = GET_MODE (op1);
> rtx mask;
> /* The corresponding SSE mode. */
> machine_mode sse_mode, double_sse_mode;
> @@ -5660,7 +5662,7 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
> gcc_unreachable ();
> }
>
> - ops[0] = gen_reg_rtx (imode);
> + ops[0] = dest;
>
> ops[1] = force_reg (imode, src);
>
> @@ -5671,7 +5673,6 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
> ops[1], pc_rtx, pc_rtx);
>
> ix86_split_mmx_punpck (ops, false);
> - emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
> }
>
> /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
> --
> 2.25.1
>
@@ -1110,7 +1110,9 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
ix86_move_vector_high_sse_to_mmx (op0);
}
-/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. */
+/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
+ for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
+ OPERANDS[0]. */
void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
@@ -1118,7 +1120,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
rtx op0 = operands[0];
rtx op1 = operands[1];
rtx op2 = operands[2];
- machine_mode mode = GET_MODE (op0);
+ machine_mode mode = GET_MODE (op1);
rtx mask;
/* The corresponding SSE mode. */
machine_mode sse_mode, double_sse_mode;
@@ -5660,7 +5662,7 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
gcc_unreachable ();
}
- ops[0] = gen_reg_rtx (imode);
+ ops[0] = dest;
ops[1] = force_reg (imode, src);
@@ -5671,7 +5673,6 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
ops[1], pc_rtx, pc_rtx);
ix86_split_mmx_punpck (ops, false);
- emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
}
/* Unpack SRC into the next wider integer vector type. UNSIGNED_P is