[10/11] aarch64: Generalise TFmode load/store pair patterns

Message ID ZS7ziVpbMVpY2aU6@arm.com
State Accepted
Headers
Series aarch64: Add new load/store pair fusion pass |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Alex Coplan Oct. 17, 2023, 8:50 p.m. UTC
  This patch generalises the TFmode load/store pair patterns to TImode and
TDmode.  This brings them in line with the DXmode patterns, and uses the
same technique with separate mode iterators (TX and TX2) to allow for
distinct modes in each arm of the load/store pair.

For example, in combination with the post-RA load/store pair fusion pass
in the following patch, this improves the codegen for the following
varargs testcase involving TImode stores:

void g(void *);
int foo(int x, ...)
{
    __builtin_va_list ap;
    __builtin_va_start (ap, x);
    g(&ap);
    __builtin_va_end (ap);
}

from:

foo:
.LFB0:
	stp	x29, x30, [sp, -240]!
.LCFI0:
	mov	w9, -56
	mov	w8, -128
	mov	x29, sp
	add	x10, sp, 176
	stp	x1, x2, [sp, 184]
	add	x1, sp, 240
	add	x0, sp, 16
	stp	x1, x1, [sp, 16]
	str	x10, [sp, 32]
	stp	w9, w8, [sp, 40]
	str	q0, [sp, 48]
	str	q1, [sp, 64]
	str	q2, [sp, 80]
	str	q3, [sp, 96]
	str	q4, [sp, 112]
	str	q5, [sp, 128]
	str	q6, [sp, 144]
	str	q7, [sp, 160]
	stp	x3, x4, [sp, 200]
	stp	x5, x6, [sp, 216]
	str	x7, [sp, 232]
	bl	g
	ldp	x29, x30, [sp], 240
.LCFI1:
	ret

to:

foo:
.LFB0:
	stp	x29, x30, [sp, -240]!
.LCFI0:
	mov	w9, -56
	mov	w8, -128
	mov	x29, sp
	add	x10, sp, 176
	stp	x1, x2, [sp, 1bd4971b7c71e70a637a1dq84]
	add	x1, sp, 240
	add	x0, sp, 16
	stp	x1, x1, [sp, 16]
	str	x10, [sp, 32]
	stp	w9, w8, [sp, 40]
	stp	q0, q1, [sp, 48]
	stp	q2, q3, [sp, 80]
	stp	q4, q5, [sp, 112]
	stp	q6, q7, [sp, 144]
	stp	x3, x4, [sp, 200]
	stp	x5, x6, [sp, 216]
	str	x7, [sp, 232]
	bl	g
	ldp	x29, x30, [sp], 240
.LCFI1:
	ret

Note that this patch isn't needed if we only use the mode
canonicalization approach in the new ldp fusion pass (since we
canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
better performance with mode canonicalization disabled (see
--param=aarch64-ldp-canonicalize-modes in the following patch).

Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?

gcc/ChangeLog:

	* config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
	(load_pair_dw_<TX:mode><TX2:mode>): ... this.
	(store_pair_dw_tftf): Rename to ...
	(store_pair_dw_<TX:mode><TX2:mode>): ... this.
	* config/aarch64/iterators.md (TX2): New.
---
 gcc/config/aarch64/aarch64.md   | 22 +++++++++++-----------
 gcc/config/aarch64/iterators.md |  3 +++
 2 files changed, 14 insertions(+), 11 deletions(-)
  

Comments

Richard Sandiford Oct. 18, 2023, 6:16 p.m. UTC | #1
Alex Coplan <alex.coplan@arm.com> writes:
> This patch generalises the TFmode load/store pair patterns to TImode and
> TDmode.  This brings them in line with the DXmode patterns, and uses the
> same technique with separate mode iterators (TX and TX2) to allow for
> distinct modes in each arm of the load/store pair.
>
> For example, in combination with the post-RA load/store pair fusion pass
> in the following patch, this improves the codegen for the following
> varargs testcase involving TImode stores:
>
> void g(void *);
> int foo(int x, ...)
> {
>     __builtin_va_list ap;
>     __builtin_va_start (ap, x);
>     g(&ap);
>     __builtin_va_end (ap);
> }
>
> from:
>
> foo:
> .LFB0:
> 	stp	x29, x30, [sp, -240]!
> .LCFI0:
> 	mov	w9, -56
> 	mov	w8, -128
> 	mov	x29, sp
> 	add	x10, sp, 176
> 	stp	x1, x2, [sp, 184]
> 	add	x1, sp, 240
> 	add	x0, sp, 16
> 	stp	x1, x1, [sp, 16]
> 	str	x10, [sp, 32]
> 	stp	w9, w8, [sp, 40]
> 	str	q0, [sp, 48]
> 	str	q1, [sp, 64]
> 	str	q2, [sp, 80]
> 	str	q3, [sp, 96]
> 	str	q4, [sp, 112]
> 	str	q5, [sp, 128]
> 	str	q6, [sp, 144]
> 	str	q7, [sp, 160]
> 	stp	x3, x4, [sp, 200]
> 	stp	x5, x6, [sp, 216]
> 	str	x7, [sp, 232]
> 	bl	g
> 	ldp	x29, x30, [sp], 240
> .LCFI1:
> 	ret
>
> to:
>
> foo:
> .LFB0:
> 	stp	x29, x30, [sp, -240]!
> .LCFI0:
> 	mov	w9, -56
> 	mov	w8, -128
> 	mov	x29, sp
> 	add	x10, sp, 176
> 	stp	x1, x2, [sp, 1bd4971b7c71e70a637a1dq84]
> 	add	x1, sp, 240
> 	add	x0, sp, 16
> 	stp	x1, x1, [sp, 16]
> 	str	x10, [sp, 32]
> 	stp	w9, w8, [sp, 40]
> 	stp	q0, q1, [sp, 48]
> 	stp	q2, q3, [sp, 80]
> 	stp	q4, q5, [sp, 112]
> 	stp	q6, q7, [sp, 144]
> 	stp	x3, x4, [sp, 200]
> 	stp	x5, x6, [sp, 216]
> 	str	x7, [sp, 232]
> 	bl	g
> 	ldp	x29, x30, [sp], 240
> .LCFI1:
> 	ret
>
> Note that this patch isn't needed if we only use the mode
> canonicalization approach in the new ldp fusion pass (since we
> canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
> better performance with mode canonicalization disabled (see
> --param=aarch64-ldp-canonicalize-modes in the following patch).
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
> 	(load_pair_dw_<TX:mode><TX2:mode>): ... this.
> 	(store_pair_dw_tftf): Rename to ...
> 	(store_pair_dw_<TX:mode><TX2:mode>): ... this.
> 	* config/aarch64/iterators.md (TX2): New.

OK, thanks.  It would be nice to investigate & fix the reasons for
the regressions with canonicalised modes, but I agree that this patch
is a strict improvement, since it fixes a hole in the current scheme.

Richard

> ---
>  gcc/config/aarch64/aarch64.md   | 22 +++++++++++-----------
>  gcc/config/aarch64/iterators.md |  3 +++
>  2 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 32c7adc8928..e6af09c2e8b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1757,16 +1757,16 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
>    }
>  )
>  
> -(define_insn "load_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "register_operand" "=w")
> -	(match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
> -   (set (match_operand:TF 2 "register_operand" "=w")
> -	(match_operand:TF 3 "memory_operand" "m"))]
> +(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
> +  [(set (match_operand:TX 0 "register_operand" "=w")
> +	(match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
> +   (set (match_operand:TX2 2 "register_operand" "=w")
> +	(match_operand:TX2 3 "memory_operand" "m"))]
>     "TARGET_SIMD
>      && rtx_equal_p (XEXP (operands[3], 0),
>  		    plus_constant (Pmode,
>  				   XEXP (operands[1], 0),
> -				   GET_MODE_SIZE (TFmode)))"
> +				   GET_MODE_SIZE (<TX:MODE>mode)))"
>    "ldp\\t%q0, %q2, %z1"
>    [(set_attr "type" "neon_ldp_q")
>     (set_attr "fp" "yes")]
> @@ -1805,11 +1805,11 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
>    }
>  )
>  
> -(define_insn "store_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
> -	(match_operand:TF 1 "register_operand" "w"))
> -   (set (match_operand:TF 2 "memory_operand" "=m")
> -	(match_operand:TF 3 "register_operand" "w"))]
> +(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
> +  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
> +	(match_operand:TX 1 "register_operand" "w"))
> +   (set (match_operand:TX2 2 "memory_operand" "=m")
> +	(match_operand:TX2 3 "register_operand" "w"))]
>     "TARGET_SIMD &&
>      rtx_equal_p (XEXP (operands[2], 0),
>  		 plus_constant (Pmode,
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 2451d8c2cd8..f9e2210095e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -319,6 +319,9 @@ (define_mode_iterator VS [V2SI V4SI])
>  
>  (define_mode_iterator TX [TI TF TD])
>  
> +;; Duplicate of the above
> +(define_mode_iterator TX2 [TI TF TD])
> +
>  (define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
>  
>  ;; Advanced SIMD opaque structure modes.
  

Patch

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 32c7adc8928..e6af09c2e8b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1757,16 +1757,16 @@  (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
   }
 )
 
-(define_insn "load_pair_dw_tftf"
-  [(set (match_operand:TF 0 "register_operand" "=w")
-	(match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:TF 2 "register_operand" "=w")
-	(match_operand:TF 3 "memory_operand" "m"))]
+(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
+  [(set (match_operand:TX 0 "register_operand" "=w")
+	(match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
+   (set (match_operand:TX2 2 "register_operand" "=w")
+	(match_operand:TX2 3 "memory_operand" "m"))]
    "TARGET_SIMD
     && rtx_equal_p (XEXP (operands[3], 0),
 		    plus_constant (Pmode,
 				   XEXP (operands[1], 0),
-				   GET_MODE_SIZE (TFmode)))"
+				   GET_MODE_SIZE (<TX:MODE>mode)))"
   "ldp\\t%q0, %q2, %z1"
   [(set_attr "type" "neon_ldp_q")
    (set_attr "fp" "yes")]
@@ -1805,11 +1805,11 @@  (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
   }
 )
 
-(define_insn "store_pair_dw_tftf"
-  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
-	(match_operand:TF 1 "register_operand" "w"))
-   (set (match_operand:TF 2 "memory_operand" "=m")
-	(match_operand:TF 3 "register_operand" "w"))]
+(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
+  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
+	(match_operand:TX 1 "register_operand" "w"))
+   (set (match_operand:TX2 2 "memory_operand" "=m")
+	(match_operand:TX2 3 "register_operand" "w"))]
    "TARGET_SIMD &&
     rtx_equal_p (XEXP (operands[2], 0),
 		 plus_constant (Pmode,
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 2451d8c2cd8..f9e2210095e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -319,6 +319,9 @@  (define_mode_iterator VS [V2SI V4SI])
 
 (define_mode_iterator TX [TI TF TD])
 
+;; Duplicate of the above
+(define_mode_iterator TX2 [TI TF TD])
+
 (define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
 
 ;; Advanced SIMD opaque structure modes.