[x86] Split SUBREGs of SSE vector registers into vec_select insns.
Checks
Commit Message
This patch is the final piece in the series to improve the ABI issues
affecting PR 88873. The previous patches tackled inserting DFmode
values into V2DFmode registers, by introducing insvti_{low,high}part
patterns. This patch improves the extraction of DFmode values from
v2DFmode registers via TImode intermediates.
I'd initially thought this would require new extvti_{low,high}part
patterns to be defined, but all that's required is to recognize that
the SUBREG idioms produced by combine are equivalent to (forms of)
vec_select patterns. The target-independent middle-end can't be sure
that the appropriate vec_select instruction exists on the target,
hence doesn't canonicalize a SUBREG of a vector mode as a vec_select,
but the backend can provide a define_split stating where and when
this is useful, for example, considering whether the operand is in
memory, or whether !TARGET_SSE_MATH and the destination is i387.
For pr88873.c, gcc -O2 -march=cascadelake currently generates:
foo: vpunpcklqdq %xmm3, %xmm2, %xmm7
vpunpcklqdq %xmm1, %xmm0, %xmm6
vpunpcklqdq %xmm5, %xmm4, %xmm2
vmovdqa %xmm7, -24(%rsp)
vmovdqa %xmm6, %xmm1
movq -16(%rsp), %rax
vpinsrq $1, %rax, %xmm7, %xmm4
vmovapd %xmm4, %xmm6
vfmadd132pd %xmm1, %xmm2, %xmm6
vmovapd %xmm6, -24(%rsp)
vmovsd -16(%rsp), %xmm1
vmovsd -24(%rsp), %xmm0
ret
with this patch, we now generate:
foo: vpunpcklqdq %xmm1, %xmm0, %xmm6
vpunpcklqdq %xmm3, %xmm2, %xmm7
vpunpcklqdq %xmm5, %xmm4, %xmm2
vmovdqa %xmm6, %xmm1
vfmadd132pd %xmm7, %xmm2, %xmm1
vmovsd %xmm1, %xmm1, %xmm0
vunpckhpd %xmm1, %xmm1, %xmm1
ret
The improvement is even more dramatic when compared to the original
29 instructions shown in comment #8. GCC 13, for example, required
12 transfers to/from memory.
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline?
2023-08-03 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/sse.md (define_split): Convert highpart:DF extract
from V2DFmode register into a sse2_storehpd instruction.
(define_split): Likewise, convert lowpart:DF extract from V2DF
register into a sse2_storelpd instruction.
gcc/testsuite/ChangeLog
* gcc.target/i386/pr88873.c: Tweak to check for improved code.
Thanks in advance,
Roger
--
Comments
On Thu, Aug 3, 2023 at 9:10 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch is the final piece in the series to improve the ABI issues
> affecting PR 88873. The previous patches tackled inserting DFmode
> values into V2DFmode registers, by introducing insvti_{low,high}part
> patterns. This patch improves the extraction of DFmode values from
> v2DFmode registers via TImode intermediates.
>
> I'd initially thought this would require new extvti_{low,high}part
> patterns to be defined, but all that's required is to recognize that
> the SUBREG idioms produced by combine are equivalent to (forms of)
> vec_select patterns. The target-independent middle-end can't be sure
> that the appropriate vec_select instruction exists on the target,
> hence doesn't canonicalize a SUBREG of a vector mode as a vec_select,
> but the backend can provide a define_split stating where and when
> this is useful, for example, considering whether the operand is in
> memory, or whether !TARGET_SSE_MATH and the destination is i387.
>
> For pr88873.c, gcc -O2 -march=cascadelake currently generates:
>
> foo: vpunpcklqdq %xmm3, %xmm2, %xmm7
> vpunpcklqdq %xmm1, %xmm0, %xmm6
> vpunpcklqdq %xmm5, %xmm4, %xmm2
> vmovdqa %xmm7, -24(%rsp)
> vmovdqa %xmm6, %xmm1
> movq -16(%rsp), %rax
> vpinsrq $1, %rax, %xmm7, %xmm4
> vmovapd %xmm4, %xmm6
> vfmadd132pd %xmm1, %xmm2, %xmm6
> vmovapd %xmm6, -24(%rsp)
> vmovsd -16(%rsp), %xmm1
> vmovsd -24(%rsp), %xmm0
> ret
>
> with this patch, we now generate:
>
> foo: vpunpcklqdq %xmm1, %xmm0, %xmm6
> vpunpcklqdq %xmm3, %xmm2, %xmm7
> vpunpcklqdq %xmm5, %xmm4, %xmm2
> vmovdqa %xmm6, %xmm1
> vfmadd132pd %xmm7, %xmm2, %xmm1
> vmovsd %xmm1, %xmm1, %xmm0
> vunpckhpd %xmm1, %xmm1, %xmm1
> ret
>
> The improvement is even more dramatic when compared to the original
> 29 instructions shown in comment #8. GCC 13, for example, required
> 12 transfers to/from memory.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures. Ok for mainline?
>
>
> 2023-08-03 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/sse.md (define_split): Convert highpart:DF extract
> from V2DFmode register into a sse2_storehpd instruction.
> (define_split): Likewise, convert lowpart:DF extract from V2DF
> register into a sse2_storelpd instruction.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/pr88873.c: Tweak to check for improved code.
OK.
Thanks,
Uros.
>
>
> Thanks in advance,
> Roger
> --
>
@@ -13554,6 +13554,14 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
+;; Convert highpart SUBREG in sse2_storehpd or *vec_extractv2df_1_sse.
+(define_split
+ [(set (match_operand:DF 0 "register_operand")
+ (subreg:DF (match_operand:V2DF 1 "register_operand") 8))]
+ "TARGET_SSE"
+ [(set (match_dup 0)
+ (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))])
+
;; Avoid combining registers from different units in a single alternative,
;; see comment above inline_secondary_memory_needed function in i386.cc
(define_insn "sse2_storelpd"
@@ -13599,6 +13607,14 @@
[(set_attr "type" "ssemov")
(set_attr "mode" "V2SF,V4SF,V2SF")])
+;; Convert lowpart SUBREG into sse2_storelpd or *vec_extractv2df_0_sse.
+(define_split
+ [(set (match_operand:DF 0 "register_operand")
+ (subreg:DF (match_operand:V2DF 1 "register_operand") 0))]
+ "TARGET_SSE"
+ [(set (match_dup 0)
+ (vec_select:DF (match_dup 1) (parallel [(const_int 0)])))])
+
(define_expand "sse2_loadhpd_exp"
[(set (match_operand:V2DF 0 "nonimmediate_operand")
(vec_concat:V2DF
@@ -9,3 +9,5 @@ s_t foo (s_t a, s_t b, s_t c)
}
/* { dg-final { scan-assembler-times "vpunpcklqdq" 3 } } */
+/* { dg-final { scan-assembler "vunpckhpd" } } */
+/* { dg-final { scan-assembler-not "rsp" } } */