[v4] xtensa: Eliminate the use of callee-saved register that saves and restores only once
Checks
Commit Message
In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).
=====
In the case of the CALL0 ABI, values that must be retained before and
after function calls are placed in the callee-saved registers (A12
through A15) and referenced later. However, it is often the case that
the save and the reference are each only once and a simple register-
register move (the frame pointer is needed to recover the stack pointer
and must be excluded).
e.g. in the following example, if there are no other occurrences of
register A14:
;; before
; prologue {
...
s32i.n a14, sp, 16
...
; } prologue
...
mov.n a14, a6
...
call0 foo
...
mov.n a8, a14
...
; epilogue {
...
l32i.n a14, sp, 16
...
; } epilogue
It can be possible like this:
;; after
; prologue {
...
(deleted)
...
; } prologue
...
s32i.n a6, sp, 16
...
call0 foo
...
l32i.n a8, sp, 16
...
; epilogue {
...
(deleted)
...
; } epilogue
This patch introduces a new peephole2 pattern that implements the above.
gcc/ChangeLog:
* config/xtensa/xtensa.md: New peephole2 pattern that eliminates
the use of callee-saved register that saves and restores only once
for other register, by using its stack slot directly.
---
gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
1 file changed, 62 insertions(+)
Comments
Hi Suwa-san,
On Wed, Jan 18, 2023 at 7:50 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).
>
> =====
> In the case of the CALL0 ABI, values that must be retained before and
> after function calls are placed in the callee-saved registers (A12
> through A15) and referenced later. However, it is often the case that
> the save and the reference are each only once and a simple register-
> register move (the frame pointer is needed to recover the stack pointer
> and must be excluded).
>
> e.g. in the following example, if there are no other occurrences of
> register A14:
>
> ;; before
> ; prologue {
> ...
> s32i.n a14, sp, 16
> ...
> ; } prologue
> ...
> mov.n a14, a6
> ...
> call0 foo
> ...
> mov.n a8, a14
> ...
> ; epilogue {
> ...
> l32i.n a14, sp, 16
> ...
> ; } epilogue
>
> It can be possible like this:
>
> ;; after
> ; prologue {
> ...
> (deleted)
> ...
> ; } prologue
> ...
> s32i.n a6, sp, 16
> ...
> call0 foo
> ...
> l32i.n a8, sp, 16
> ...
> ; epilogue {
> ...
> (deleted)
> ...
> ; } epilogue
>
> This patch introduces a new peephole2 pattern that implements the above.
>
> gcc/ChangeLog:
>
> * config/xtensa/xtensa.md: New peephole2 pattern that eliminates
> the use of callee-saved register that saves and restores only once
> for other register, by using its stack slot directly.
> ---
> gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
> 1 file changed, 62 insertions(+)
There are still issues with this change in the libgomp:
FAIL: libgomp.c/examples-4/target-1.c execution test
FAIL: libgomp.c/examples-4/target-2.c execution test
They come from the following function:
code produced before the change:
.literal_position
.literal .LC8, init@PLT
.literal .LC9, 400000
.literal .LC10, 100000
.literal .LC11, -800000
.literal .LC12, 800000
.align 4
.global vec_mult_ref
.type vec_mult_ref, @function
vec_mult_ref:
l32r a9, .LC11
addi sp, sp, -16
l32r a10, .LC9
s32i.n a12, sp, 8
s32i.n a13, sp, 4
s32i.n a0, sp, 12
add.n sp, sp, a9
add.n a12, sp, a10
l32r a9, .LC8
mov.n a13, a2
mov.n a3, sp
mov.n a2, a12
callx0 a9
l32r a7, .LC10
mov.n a10, a12
mov.n a11, sp
mov.n a2, a13
loop a7, .L17_LEND
.L17:
l32i.n a9, a10, 0
l32i.n a6, a11, 0
addi.n a10, a10, 4
mull a9, a9, a6
addi.n a11, a11, 4
s32i.n a9, a2, 0
addi.n a2, a2, 4
.L17_LEND:
l32r a9, .LC12
add.n sp, sp, a9
l32i.n a0, sp, 12
l32i.n a12, sp, 8
l32i.n a13, sp, 4
addi sp, sp, 16
ret.n
--------------------
with the change:
.literal_position
.literal .LC8, init@PLT
.literal .LC9, 400000
.literal .LC10, 100000
.literal .LC11, -800000
.literal .LC12, 800000
.align 4
.global vec_mult_ref
.type vec_mult_ref, @function
vec_mult_ref:
l32r a9, .LC11
l32r a10, .LC9
addi sp, sp, -16
s32i.n a12, sp, 8
s32i.n a0, sp, 12
add.n sp, sp, a9
add.n a12, sp, a10
l32r a9, .LC8
s32i.n a2, sp, 4
mov.n a3, sp
mov.n a2, a12
callx0 a9
l32r a7, .LC10
l32i.n a2, sp, 4
mov.n a10, a12
mov.n a11, sp
loop a7, .L17_LEND
.L17:
l32i.n a9, a10, 0
l32i.n a6, a11, 0
addi.n a10, a10, 4
mull a9, a9, a6
addi.n a11, a11, 4
s32i.n a9, a2, 0
addi.n a2, a2, 4
.L17_LEND:
l32r a9, .LC12
add.n sp, sp, a9
l32i.n a0, sp, 12
l32i.n a12, sp, 8
addi sp, sp, 16
ret.n
the stack pointer is modified after saving callee-saved registers,
but the stack offset where a2 is stored and reloaded does not take
this into an account.
After having this many attempts and getting to the issues that are
really hard to detect I wonder if the target backend is the right place
for this optimization?
On 2023/01/21 0:14, Max Filippov wrote:
> Hi Suwa-san,
Hi!
>
> On Wed, Jan 18, 2023 at 7:50 PM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
>>
>> In the previous patch, if insn is JUMP_INSN or CALL_INSN, it bypasses the reg check (possibly FAIL).
>>
>> =====
>> In the case of the CALL0 ABI, values that must be retained before and
>> after function calls are placed in the callee-saved registers (A12
>> through A15) and referenced later. However, it is often the case that
>> the save and the reference are each only once and a simple register-
>> register move (the frame pointer is needed to recover the stack pointer
>> and must be excluded).
>>
>> e.g. in the following example, if there are no other occurrences of
>> register A14:
>>
>> ;; before
>> ; prologue {
>> ...
>> s32i.n a14, sp, 16
>> ...
>> ; } prologue
>> ...
>> mov.n a14, a6
>> ...
>> call0 foo
>> ...
>> mov.n a8, a14
>> ...
>> ; epilogue {
>> ...
>> l32i.n a14, sp, 16
>> ...
>> ; } epilogue
>>
>> It can be possible like this:
>>
>> ;; after
>> ; prologue {
>> ...
>> (deleted)
>> ...
>> ; } prologue
>> ...
>> s32i.n a6, sp, 16
>> ...
>> call0 foo
>> ...
>> l32i.n a8, sp, 16
>> ...
>> ; epilogue {
>> ...
>> (deleted)
>> ...
>> ; } epilogue
>>
>> This patch introduces a new peephole2 pattern that implements the above.
>>
>> gcc/ChangeLog:
>>
>> * config/xtensa/xtensa.md: New peephole2 pattern that eliminates
>> the use of callee-saved register that saves and restores only once
>> for other register, by using its stack slot directly.
>> ---
>> gcc/config/xtensa/xtensa.md | 62 +++++++++++++++++++++++++++++++++++++
>> 1 file changed, 62 insertions(+)
>
> There are still issues with this change in the libgomp:
>
> FAIL: libgomp.c/examples-4/target-1.c execution test
> FAIL: libgomp.c/examples-4/target-2.c execution test
>
> They come from the following function:
>
> code produced before the change:
> .literal_position
> .literal .LC8, init@PLT
> .literal .LC9, 400000
> .literal .LC10, 100000
> .literal .LC11, -800000
> .literal .LC12, 800000
> .align 4
> .global vec_mult_ref
> .type vec_mult_ref, @function
> vec_mult_ref:
> l32r a9, .LC11
> addi sp, sp, -16
> l32r a10, .LC9
> s32i.n a12, sp, 8
> s32i.n a13, sp, 4
> s32i.n a0, sp, 12
> add.n sp, sp, a9
> add.n a12, sp, a10
> l32r a9, .LC8
> mov.n a13, a2
> mov.n a3, sp
> mov.n a2, a12
> callx0 a9
> l32r a7, .LC10
> mov.n a10, a12
> mov.n a11, sp
> mov.n a2, a13
> loop a7, .L17_LEND
> .L17:
> l32i.n a9, a10, 0
> l32i.n a6, a11, 0
> addi.n a10, a10, 4
> mull a9, a9, a6
> addi.n a11, a11, 4
> s32i.n a9, a2, 0
> addi.n a2, a2, 4
> .L17_LEND:
> l32r a9, .LC12
> add.n sp, sp, a9
> l32i.n a0, sp, 12
> l32i.n a12, sp, 8
> l32i.n a13, sp, 4
> addi sp, sp, 16
> ret.n
>
> --------------------
>
> with the change:
> .literal_position
> .literal .LC8, init@PLT
> .literal .LC9, 400000
> .literal .LC10, 100000
> .literal .LC11, -800000
> .literal .LC12, 800000
> .align 4
> .global vec_mult_ref
> .type vec_mult_ref, @function
> vec_mult_ref:
> l32r a9, .LC11
> l32r a10, .LC9
> addi sp, sp, -16
> s32i.n a12, sp, 8
> s32i.n a0, sp, 12
> add.n sp, sp, a9
> add.n a12, sp, a10
> l32r a9, .LC8
> s32i.n a2, sp, 4
> mov.n a3, sp
> mov.n a2, a12
> callx0 a9
> l32r a7, .LC10
> l32i.n a2, sp, 4
> mov.n a10, a12
> mov.n a11, sp
> loop a7, .L17_LEND
> .L17:
> l32i.n a9, a10, 0
> l32i.n a6, a11, 0
> addi.n a10, a10, 4
> mull a9, a9, a6
> addi.n a11, a11, 4
> s32i.n a9, a2, 0
> addi.n a2, a2, 4
> .L17_LEND:
> l32r a9, .LC12
> add.n sp, sp, a9
> l32i.n a0, sp, 12
> l32i.n a12, sp, 8
> addi sp, sp, 16
> ret.n
>
> the stack pointer is modified after saving callee-saved registers,
> but the stack offset where a2 is stored and reloaded does not take
> this into an account.
>
> After having this many attempts and getting to the issues that are
> really hard to detect I wonder if the target backend is the right place
> for this optimization?
>
I guess they are not hard to detect but just issues I didn't anticipate (and I just need a little more work).
And where else should it be done? What about implementing a target-specific pass just for one-point optimization?
On Fri, Jan 20, 2023 at 8:39 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
> On 2023/01/21 0:14, Max Filippov wrote:
> > After having this many attempts and getting to the issues that are
> > really hard to detect I wonder if the target backend is the right place
> > for this optimization?
> >
> I guess they are not hard to detect
I mean, on the testing side. check-gcc testsuite passed without new
regressions with this change, linux kernel smoke test passed, I was
almost convinced that it's ok to commit.
> but just issues I didn't anticipate (and I just need a little more work).
Looking at other peephole2 patterns I see that their code transformations
are much more compact and they don't need to track additional properties
of unrelated instructions.
> And where else should it be done? What about implementing a
> target-specific pass just for one-point optimization?
I don't even understand what's target-specific in this optimization?
It looks very generic to me.
On 2023/01/23 0:45, Max Filippov wrote:
> On Fri, Jan 20, 2023 at 8:39 PM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
>> On 2023/01/21 0:14, Max Filippov wrote:
>>> After having this many attempts and getting to the issues that are
>>> really hard to detect I wonder if the target backend is the right place
>>> for this optimization?
>>>
>> I guess they are not hard to detect
>
> I mean, on the testing side. check-gcc testsuite passed without new
> regressions with this change, linux kernel smoke test passed, I was
> almost convinced that it's ok to commit.
>
>> but just issues I didn't anticipate (and I just need a little more work).
>
> Looking at other peephole2 patterns I see that their code transformations
> are much more compact and they don't need to track additional properties
> of unrelated instructions.
>
>> And where else should it be done? What about implementing a
>> target-specific pass just for one-point optimization?
>
> I don't even understand what's target-specific in this optimization?
> It looks very generic to me.
>
Ah, I seem to have misunderstood what you meant, sorry.
Now, what this patch is trying to do depends on whether register moves can be converted to stack pointer indirect loads/stores with offsets, and whether there is any benefit in doing so, but they are not target dependent. Is it?
If we want the target-independent part to do something like this, we will need a mechanism (macro, hook, etc.) to write appropriate information in the machine description and pass it on.
For example, offset ranges for register indirect loads and stores, or whether the ABI requires that callee-saved registers always be associated with stack slots, or even the need for stack frame construction...
I totally agree that the peephole2 pattern is not the best implementation location.
@@ -3029,3 +3029,65 @@ FALLTHRU:;
operands[1] = GEN_INT (imm0);
operands[2] = GEN_INT (imm1);
})
+
+(define_peephole2
+ [(set (match_operand:SI 0 "register_operand")
+ (match_operand:SI 1 "reload_operand"))]
+ "!TARGET_WINDOWED_ABI && df
+ && epilogue_contains (insn)
+ && ! call_used_or_fixed_reg_p (REGNO (operands[0]))
+ && (!frame_pointer_needed
+ || REGNO (operands[0]) != HARD_FRAME_POINTER_REGNUM)"
+ [(const_int 0)]
+{
+ rtx reg = operands[0], pattern;
+ rtx_insn *insnP = NULL, *insnS = NULL, *insnR = NULL;
+ df_ref ref;
+ rtx_insn *insn;
+ for (ref = DF_REG_DEF_CHAIN (REGNO (reg));
+ ref; ref = DF_REF_NEXT_REG (ref))
+ if (DF_REF_CLASS (ref) != DF_REF_REGULAR
+ || DEBUG_INSN_P (insn = DF_REF_INSN (ref)))
+ continue;
+ else if (insn == curr_insn)
+ continue;
+ else if (GET_CODE (pattern = PATTERN (insn)) == SET
+ && rtx_equal_p (SET_DEST (pattern), reg)
+ && REG_P (SET_SRC (pattern)))
+ {
+ if (insnS)
+ FAIL;
+ insnS = insn;
+ continue;
+ }
+ else
+ FAIL;
+ for (ref = DF_REG_USE_CHAIN (REGNO (reg));
+ ref; ref = DF_REF_NEXT_REG (ref))
+ if (DF_REF_CLASS (ref) != DF_REF_REGULAR
+ || DEBUG_INSN_P (insn = DF_REF_INSN (ref)))
+ continue;
+ else if (prologue_contains (insn))
+ {
+ insnP = insn;
+ continue;
+ }
+ else if (GET_CODE (pattern = PATTERN (insn)) == SET
+ && rtx_equal_p (SET_SRC (pattern), reg)
+ && REG_P (SET_DEST (pattern)))
+ {
+ if (insnR)
+ FAIL;
+ insnR = insn;
+ continue;
+ }
+ else
+ FAIL;
+ if (!insnP || !insnS || !insnR)
+ FAIL;
+ SET_DEST (PATTERN (insnS)) = copy_rtx (operands[1]);
+ df_insn_rescan (insnS);
+ SET_SRC (PATTERN (insnR)) = copy_rtx (operands[1]);
+ df_insn_rescan (insnR);
+ set_insn_deleted (insnP);
+})