i386: Avoid paradoxical subreg dests in vector zero_extend

Message ID mpt4jigqqne.fsf@arm.com
State Unresolved
Headers
Series i386: Avoid paradoxical subreg dests in vector zero_extend |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Richard Sandiford Oct. 24, 2023, 10:08 a.m. UTC
  For the V2HI -> V2SI zero extension in:

  typedef unsigned short v2hi __attribute__((vector_size(4)));
  typedef unsigned int v2si __attribute__((vector_size(8)));
  v2si f (v2hi x) { return (v2si) {x[0], x[1]}; }

ix86_expand_sse_extend would generate:

   (set (reg:V2HI 102)
        (const_vector:V2HI [(const_int 0 [0])
			    (const_int 0 [0])]))
   (set (subreg:V8HI (reg:V2HI 101) 0)
        (vec_select:V8HI
          (vec_concat:V16HI (subreg:V8HI (reg/v:V2HI 99 [ x ]) 0)
                            (subreg:V8HI (reg:V2HI 102) 0))
          (parallel [(const_int 0 [0])
                     (const_int 8 [0x8])
                     (const_int 1 [0x1])
                     (const_int 9 [0x9])
                     (const_int 2 [0x2])
                     (const_int 10 [0xa])
                     (const_int 3 [0x3])
                     (const_int 11 [0xb])])))
  (set (reg:V2SI 100)
       (subreg:V2SI (reg:V2HI 101) 0))
    (expr_list:REG_EQUAL (zero_extend:V2SI (reg/v:V2HI 99 [ x ])))

But using (subreg:V2SI (reg:V2HI 101) 0) as the destination of
the vec_select means that only the low 4 bytes of the destination
are stored.  Only the lower half of reg 100 is well-defined.

Things tend to happen to work if the register allocator ties reg 101
to reg 100.  But it caused problems with the upcoming late-combine pass
because we propagated the set of reg 100 into its uses.

Tested on x86_64-linux-gnu.  OK to install?

Richard


gcc/
	* config/i386/i386-expand.cc (ix86_split_mmx_punpck): Allow the
	destination to be wider than the sources.  Take the mode from the
	first source.
	(ix86_expand_sse_extend): Pass the destination directly to
	ix86_split_mmx_punpck, rather than using a fresh register that
	is half the size.
---
 gcc/config/i386/i386-expand.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
  

Comments

Uros Bizjak Oct. 24, 2023, 10:19 a.m. UTC | #1
On Tue, Oct 24, 2023 at 12:08 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> For the V2HI -> V2SI zero extension in:
>
>   typedef unsigned short v2hi __attribute__((vector_size(4)));
>   typedef unsigned int v2si __attribute__((vector_size(8)));
>   v2si f (v2hi x) { return (v2si) {x[0], x[1]}; }
>
> ix86_expand_sse_extend would generate:
>
>    (set (reg:V2HI 102)
>         (const_vector:V2HI [(const_int 0 [0])
>                             (const_int 0 [0])]))
>    (set (subreg:V8HI (reg:V2HI 101) 0)
>         (vec_select:V8HI
>           (vec_concat:V16HI (subreg:V8HI (reg/v:V2HI 99 [ x ]) 0)
>                             (subreg:V8HI (reg:V2HI 102) 0))
>           (parallel [(const_int 0 [0])
>                      (const_int 8 [0x8])
>                      (const_int 1 [0x1])
>                      (const_int 9 [0x9])
>                      (const_int 2 [0x2])
>                      (const_int 10 [0xa])
>                      (const_int 3 [0x3])
>                      (const_int 11 [0xb])])))
>   (set (reg:V2SI 100)
>        (subreg:V2SI (reg:V2HI 101) 0))
>     (expr_list:REG_EQUAL (zero_extend:V2SI (reg/v:V2HI 99 [ x ])))
>
> But using (subreg:V2SI (reg:V2HI 101) 0) as the destination of
> the vec_select means that only the low 4 bytes of the destination
> are stored.  Only the lower half of reg 100 is well-defined.
>
> Things tend to happen to work if the register allocator ties reg 101
> to reg 100.  But it caused problems with the upcoming late-combine pass
> because we propagated the set of reg 100 into its uses.
>
> Tested on x86_64-linux-gnu.  OK to install?
>
> Richard
>
>
> gcc/
>         * config/i386/i386-expand.cc (ix86_split_mmx_punpck): Allow the
>         destination to be wider than the sources.  Take the mode from the
>         first source.
>         (ix86_expand_sse_extend): Pass the destination directly to
>         ix86_split_mmx_punpck, rather than using a fresh register that
>         is half the size.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc | 9 +++++----
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 1eae9d7c78c..2361ff77af3 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1110,7 +1110,9 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
>    ix86_move_vector_high_sse_to_mmx (op0);
>  }
>
> -/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
> +/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
> +   for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
> +   OPERANDS[0].  */
>
>  void
>  ix86_split_mmx_punpck (rtx operands[], bool high_p)
> @@ -1118,7 +1120,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
>    rtx op0 = operands[0];
>    rtx op1 = operands[1];
>    rtx op2 = operands[2];
> -  machine_mode mode = GET_MODE (op0);
> +  machine_mode mode = GET_MODE (op1);
>    rtx mask;
>    /* The corresponding SSE mode.  */
>    machine_mode sse_mode, double_sse_mode;
> @@ -5660,7 +5662,7 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
>        gcc_unreachable ();
>      }
>
> -  ops[0] = gen_reg_rtx (imode);
> +  ops[0] = dest;
>
>    ops[1] = force_reg (imode, src);
>
> @@ -5671,7 +5673,6 @@ ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
>                                   ops[1], pc_rtx, pc_rtx);
>
>    ix86_split_mmx_punpck (ops, false);
> -  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
>  }
>
>  /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
> --
> 2.25.1
>
  

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1eae9d7c78c..2361ff77af3 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1110,7 +1110,9 @@  ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   ix86_move_vector_high_sse_to_mmx (op0);
 }
 
-/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  */
+/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  This is also used
+   for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
+   OPERANDS[0].  */
 
 void
 ix86_split_mmx_punpck (rtx operands[], bool high_p)
@@ -1118,7 +1120,7 @@  ix86_split_mmx_punpck (rtx operands[], bool high_p)
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
-  machine_mode mode = GET_MODE (op0);
+  machine_mode mode = GET_MODE (op1);
   rtx mask;
   /* The corresponding SSE mode.  */
   machine_mode sse_mode, double_sse_mode;
@@ -5660,7 +5662,7 @@  ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
       gcc_unreachable ();
     }
 
-  ops[0] = gen_reg_rtx (imode);
+  ops[0] = dest;
 
   ops[1] = force_reg (imode, src);
 
@@ -5671,7 +5673,6 @@  ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
 				  ops[1], pc_rtx, pc_rtx);
 
   ix86_split_mmx_punpck (ops, false);
-  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), ops[0], imode));
 }
 
 /* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is