LoongArch: Replace -mexplicit-relocs=auto simple-used address peephole2 with combine
Checks
Commit Message
The problem with peephole2 is it uses a naive sliding-window algorithm
and misses many cases. For example:
float a[10000];
float t() { return a[0] + a[8000]; }
is compiled to:
la.local $r13,a
la.local $r12,a+32768
fld.s $f1,$r13,0
fld.s $f0,$r12,-768
fadd.s $f0,$f1,$f0
by trunk. But as we've explained in r14-4851, the following would be
better with -mexplicit-relocs=auto:
pcalau12i $r13,%pc_hi20(a)
pcalau12i $r12,%pc_hi20(a+32000)
fld.s $f1,$r13,%pc_lo12(a)
fld.s $f0,$r12,%pc_lo12(a+32000)
fadd.s $f0,$f1,$f0
However the sliding-window algorithm just won't detect the pcalau12i/fld
pair to be optimized. Using a define_insn_and_split in the combine pass
will work around the issue.
gcc/ChangeLog:
* config/loongarch/loongarch.md:
(simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
define_insn_and_split.
(simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
(simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
(simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>):
Likewise.
(simple_store<ST_ANY:mode><P:mode>): Likewise.
(simple_store_off<ST_ANY:mode><P:mode>): Likewise.
(define_peephole2): Remove la.local/[f]ld peepholes.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
New test.
---
Bootstrapped & regtested on loongarch64-linux-gnu. Ok for trunk?
gcc/config/loongarch/loongarch.md | 165 +++++++++---------
...explicit-relocs-auto-single-load-store-2.c | 11 ++
2 files changed, 98 insertions(+), 78 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
Comments
Ping :).
On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote:
> The problem with peephole2 is it uses a naive sliding-window algorithm
> and misses many cases. For example:
>
> float a[10000];
> float t() { return a[0] + a[8000]; }
>
> is compiled to:
>
> la.local $r13,a
> la.local $r12,a+32768
> fld.s $f1,$r13,0
> fld.s $f0,$r12,-768
> fadd.s $f0,$f1,$f0
>
> by trunk. But as we've explained in r14-4851, the following would be
> better with -mexplicit-relocs=auto:
>
> pcalau12i $r13,%pc_hi20(a)
> pcalau12i $r12,%pc_hi20(a+32000)
> fld.s $f1,$r13,%pc_lo12(a)
> fld.s $f0,$r12,%pc_lo12(a+32000)
> fadd.s $f0,$f1,$f0
>
> However the sliding-window algorithm just won't detect the pcalau12i/fld
> pair to be optimized. Use a define_insn_and_split in combine pass will
> work around the issue.
>
> gcc/ChangeLog:
>
> * config/loongarch/loongarch.md:
> (simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
> define_insn_and_split.
> (simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
> (simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
> (simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>):
> Likewise.
> (simple_store<ST_ANY:mode><P:mode>): Likewise.
> (simple_store_off<ST_ANY:mode><P:mode>): Likewise.
> (define_peephole2): Remove la.local/[f]ld peepholes.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
> New test.
> ---
>
> Bootstrapped & regtested on loongarch64-linux-gnu. Ok for trunk?
>
> gcc/config/loongarch/loongarch.md | 165 +++++++++---------
> ...explicit-relocs-auto-single-load-store-2.c | 11 ++
> 2 files changed, 98 insertions(+), 78 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>
> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
> index 7b26d15aa4e..4009de408fb 100644
> --- a/gcc/config/loongarch/loongarch.md
> +++ b/gcc/config/loongarch/loongarch.md
> @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w"
> ;;
> ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
> ;; 3 instructions).
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
> - (mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0]) \
> - || REGNO (operands[0]) == REGNO (operands[2]))"
> - [(set (match_dup 2)
> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
> +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
> + (mem:LD_AT_LEAST_32_BIT
> + (match_operand:P 1 "symbolic_pcrel_operand" "")))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (match_dup 0)
> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
> {
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + operands[2] = gen_reg_rtx (Pmode);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
> })
>
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
> - (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
> - (match_operand 3 "const_int_operand"))))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0]) \
> - || REGNO (operands[0]) == REGNO (operands[2]))"
> - [(set (match_dup 2)
> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
> +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
> + (mem:LD_AT_LEAST_32_BIT
> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
> + (match_operand 2 "const_int_operand" ""))))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (match_dup 0)
> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
> {
> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + HOST_WIDE_INT offset = INTVAL (operands[2]);
> + operands[2] = gen_reg_rtx (Pmode);
> + operands[1] = plus_constant (Pmode, operands[1], offset);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
> })
>
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (match_operand:GPR 2 "register_operand")
> - (any_extend:GPR (mem:SUBDI (match_dup 0))))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0]) \
> - || REGNO (operands[0]) == REGNO (operands[2]))"
> - [(set (match_dup 2)
> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
> - (match_dup 1)))))]
> +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
> + [(set (match_operand:GPR 0 "register_operand" "=r")
> + (any_extend:GPR
> + (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (match_dup 0)
> + (any_extend:GPR
> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
> {
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + operands[2] = gen_reg_rtx (Pmode);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
> })
>
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (match_operand:GPR 2 "register_operand")
> +(define_insn_and_split
> + "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
> + [(set (match_operand:GPR 0 "register_operand" "=r")
> + (any_extend:GPR
> + (mem:SUBDI
> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
> + (match_operand 2 "const_int_operand" "")))))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (match_dup 0)
> (any_extend:GPR
> - (mem:SUBDI (plus (match_dup 0)
> - (match_operand 3 "const_int_operand")))))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0]) \
> - || REGNO (operands[0]) == REGNO (operands[2]))"
> - [(set (match_dup 2)
> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
> - (match_dup 1)))))]
> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
> {
> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + HOST_WIDE_INT offset = INTVAL (operands[2]);
> + operands[2] = gen_reg_rtx (Pmode);
> + operands[1] = plus_constant (Pmode, operands[1], offset);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
> })
>
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (mem:ST_ANY (match_dup 0))
> - (match_operand:ST_ANY 2 "register_operand"))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0])) \
> - && REGNO (operands[0]) != REGNO (operands[2])"
> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
> +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
> + [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
> + (match_operand:ST_ANY 1 "register_operand" "r,f"))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
> {
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + operands[2] = gen_reg_rtx (Pmode);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
> })
>
> -(define_peephole2
> - [(set (match_operand:P 0 "register_operand")
> - (match_operand:P 1 "symbolic_pcrel_operand"))
> - (set (mem:ST_ANY (plus (match_dup 0)
> - (match_operand 3 "const_int_operand")))
> - (match_operand:ST_ANY 2 "register_operand"))]
> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
> - && (peep2_reg_dead_p (2, operands[0])) \
> - && REGNO (operands[0]) != REGNO (operands[2])"
> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
> +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
> + [(set (mem:ST_ANY
> + (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
> + (match_operand 1 "const_int_operand" "")))
> + (match_operand:ST_ANY 2 "register_operand" "r,f"))]
> + "loongarch_pre_reload_split () \
> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
> + "#"
> + ""
> + [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
> {
> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
> + HOST_WIDE_INT offset = INTVAL (operands[1]);
> + operands[1] = gen_reg_rtx (Pmode);
> + operands[0] = plus_constant (Pmode, operands[0], offset);
> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
> })
>
> ;; Synchronization instructions.
> diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
> new file mode 100644
> index 00000000000..42cb966d1e0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
> +
> +float a[8001];
> +float
> +t (void)
> +{
> + return a[0] + a[8000];
> +}
> +
> +/* { dg-final { scan-assembler-not "la.local" } } */
Sorry, I've been busy with something else these two days. I don't think
there's anything wrong with the code,
but I need to test the spec.:-)
在 2023/12/21 下午7:56, Xi Ruoyao 写道:
> Ping :).
>
> On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote:
>> The problem with peephole2 is it uses a naive sliding-window algorithm
>> and misses many cases. For example:
>>
>> float a[10000];
>> float t() { return a[0] + a[8000]; }
>>
>> is compiled to:
>>
>> la.local $r13,a
>> la.local $r12,a+32768
>> fld.s $f1,$r13,0
>> fld.s $f0,$r12,-768
>> fadd.s $f0,$f1,$f0
>>
>> by trunk. But as we've explained in r14-4851, the following would be
>> better with -mexplicit-relocs=auto:
>>
>> pcalau12i $r13,%pc_hi20(a)
>> pcalau12i $r12,%pc_hi20(a+32000)
>> fld.s $f1,$r13,%pc_lo12(a)
>> fld.s $f0,$r12,%pc_lo12(a+32000)
>> fadd.s $f0,$f1,$f0
>>
>> However the sliding-window algorithm just won't detect the pcalau12i/fld
>> pair to be optimized. Use a define_insn_and_split in combine pass will
>> work around the issue.
>>
>> gcc/ChangeLog:
>>
>> * config/loongarch/loongarch.md:
>> (simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New
>> define_insn_and_split.
>> (simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise.
>> (simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise.
>> (simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>):
>> Likewise.
>> (simple_store<ST_ANY:mode><P:mode>): Likewise.
>> (simple_store_off<ST_ANY:mode><P:mode>): Likewise.
>> (define_peephole2): Remove la.local/[f]ld peepholes.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c:
>> New test.
>> ---
>>
>> Bootstrapped & regtested on loongarch64-linux-gnu. Ok for trunk?
>>
>> gcc/config/loongarch/loongarch.md | 165 +++++++++---------
>> ...explicit-relocs-auto-single-load-store-2.c | 11 ++
>> 2 files changed, 98 insertions(+), 78 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>>
>> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
>> index 7b26d15aa4e..4009de408fb 100644
>> --- a/gcc/config/loongarch/loongarch.md
>> +++ b/gcc/config/loongarch/loongarch.md
>> @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w"
>> ;;
>> ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
>> ;; 3 instructions).
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
>> - (mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0]) \
>> - || REGNO (operands[0]) == REGNO (operands[2]))"
>> - [(set (match_dup 2)
>> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
>> +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
>> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
>> + (mem:LD_AT_LEAST_32_BIT
>> + (match_operand:P 1 "symbolic_pcrel_operand" "")))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (match_dup 0)
>> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>> {
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + operands[2] = gen_reg_rtx (Pmode);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>> })
>>
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
>> - (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
>> - (match_operand 3 "const_int_operand"))))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0]) \
>> - || REGNO (operands[0]) == REGNO (operands[2]))"
>> - [(set (match_dup 2)
>> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
>> +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
>> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
>> + (mem:LD_AT_LEAST_32_BIT
>> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
>> + (match_operand 2 "const_int_operand" ""))))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (match_dup 0)
>> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
>> {
>> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + HOST_WIDE_INT offset = INTVAL (operands[2]);
>> + operands[2] = gen_reg_rtx (Pmode);
>> + operands[1] = plus_constant (Pmode, operands[1], offset);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>> })
>>
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (match_operand:GPR 2 "register_operand")
>> - (any_extend:GPR (mem:SUBDI (match_dup 0))))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0]) \
>> - || REGNO (operands[0]) == REGNO (operands[2]))"
>> - [(set (match_dup 2)
>> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
>> - (match_dup 1)))))]
>> +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
>> + [(set (match_operand:GPR 0 "register_operand" "=r")
>> + (any_extend:GPR
>> + (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (match_dup 0)
>> + (any_extend:GPR
>> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>> {
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + operands[2] = gen_reg_rtx (Pmode);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>> })
>>
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (match_operand:GPR 2 "register_operand")
>> +(define_insn_and_split
>> + "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
>> + [(set (match_operand:GPR 0 "register_operand" "=r")
>> + (any_extend:GPR
>> + (mem:SUBDI
>> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
>> + (match_operand 2 "const_int_operand" "")))))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (match_dup 0)
>> (any_extend:GPR
>> - (mem:SUBDI (plus (match_dup 0)
>> - (match_operand 3 "const_int_operand")))))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0]) \
>> - || REGNO (operands[0]) == REGNO (operands[2]))"
>> - [(set (match_dup 2)
>> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
>> - (match_dup 1)))))]
>> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
>> {
>> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + HOST_WIDE_INT offset = INTVAL (operands[2]);
>> + operands[2] = gen_reg_rtx (Pmode);
>> + operands[1] = plus_constant (Pmode, operands[1], offset);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
>> })
>>
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (mem:ST_ANY (match_dup 0))
>> - (match_operand:ST_ANY 2 "register_operand"))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0])) \
>> - && REGNO (operands[0]) != REGNO (operands[2])"
>> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
>> +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
>> + [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
>> + (match_operand:ST_ANY 1 "register_operand" "r,f"))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
>> {
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + operands[2] = gen_reg_rtx (Pmode);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
>> })
>>
>> -(define_peephole2
>> - [(set (match_operand:P 0 "register_operand")
>> - (match_operand:P 1 "symbolic_pcrel_operand"))
>> - (set (mem:ST_ANY (plus (match_dup 0)
>> - (match_operand 3 "const_int_operand")))
>> - (match_operand:ST_ANY 2 "register_operand"))]
>> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
>> - && (peep2_reg_dead_p (2, operands[0])) \
>> - && REGNO (operands[0]) != REGNO (operands[2])"
>> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
>> +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
>> + [(set (mem:ST_ANY
>> + (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
>> + (match_operand 1 "const_int_operand" "")))
>> + (match_operand:ST_ANY 2 "register_operand" "r,f"))]
>> + "loongarch_pre_reload_split () \
>> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
>> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
>> + "#"
>> + ""
>> + [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
>> {
>> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
>> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
>> + HOST_WIDE_INT offset = INTVAL (operands[1]);
>> + operands[1] = gen_reg_rtx (Pmode);
>> + operands[0] = plus_constant (Pmode, operands[0], offset);
>> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
>> })
>>
>> ;; Synchronization instructions.
>> diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>> new file mode 100644
>> index 00000000000..42cb966d1e0
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c
>> @@ -0,0 +1,11 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
>> +
>> +float a[8001];
>> +float
>> +t (void)
>> +{
>> + return a[0] + a[8000];
>> +}
>> +
>> +/* { dg-final { scan-assembler-not "la.local" } } */
在 2023/12/21 下午8:00, chenglulu 写道:
> Sorry, I've been busy with something else these two days. I don't
> think there's anything wrong with the code,
>
> but I need to test the spec.:-)
Hi, Ruoyao:
After applying this patch, spec2006 464.h264 ref will have a 6.4%
performance drop. So I'm going to retest it.
>
> 在 2023/12/21 下午7:56, Xi Ruoyao 写道:
>> Ping :).
>>
On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>
> 在 2023/12/21 下午8:00, chenglulu 写道:
> > Sorry, I've been busy with something else these two days. I don't
> > think there's anything wrong with the code,
> >
> > but I need to test the spec.:-)
>
> Hi, Ruoyao:
>
> After applying this patch, spec2006 464.h264 ref will have a 6.4%
> performance drop. So I'm going to retest it.
I think 6.4% is large enough not to be a random error.
Is there an example showing the code regression?
And I'm wondering if keeping the peephole besides the new
define_insn_and_split produces a better result instead of solely relying
on define_insn_and_split?
在 2023/12/22 下午3:09, Xi Ruoyao 写道:
> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>> Sorry, I've been busy with something else these two days. I don't
>>> think there's anything wrong with the code,
>>>
>>> but I need to test the spec.:-)
>> Hi, Ruoyao:
>>
>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>> performance drop. So I'm going to retest it.
> I think 6.4% is large enough not to be a random error.
>
> Is there an example showing the code regression?
>
> And I'm wondering if keeping the peephole besides the new
> define_insn_and_split produces a better result instead of solely relying
> on define_insn_and_split?
>
I haven't debugged this yet, I'm retesting, if there is still such a big
performance gap,
I think I need to see the reason.
在 2023/12/22 下午3:21, chenglulu 写道:
>
> 在 2023/12/22 下午3:09, Xi Ruoyao 写道:
>> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>>> Sorry, I've been busy with something else these two days. I don't
>>>> think there's anything wrong with the code,
>>>>
>>>> but I need to test the spec.:-)
>>> Hi, Ruoyao:
>>>
>>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>>> performance drop. So I'm going to retest it.
>> I think 6.4% is large enough not to be a random error.
>>
>> Is there an example showing the code regression?
>>
>> And I'm wondering if keeping the peephole besides the new
>> define_insn_and_split produces a better result instead of solely relying
>> on define_insn_and_split?
>>
> I haven't debugged this yet, I'm retesting, if there is still such a
> big performance gap,
>
> I think I need to see the reason.
>
The performance drop has nothing to do with this patch. I found that the
h264 performance compiled
by r14-6787 compared to r14-6421 dropped by 6.4%.
在 2023/12/23 上午10:26, chenglulu 写道:
>
> 在 2023/12/22 下午3:21, chenglulu 写道:
>>
>> 在 2023/12/22 下午3:09, Xi Ruoyao 写道:
>>> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote:
>>>> 在 2023/12/21 下午8:00, chenglulu 写道:
>>>>> Sorry, I've been busy with something else these two days. I don't
>>>>> think there's anything wrong with the code,
>>>>>
>>>>> but I need to test the spec.:-)
>>>> Hi, Ruoyao:
>>>>
>>>> After applying this patch, spec2006 464.h264 ref will have a 6.4%
>>>> performance drop. So I'm going to retest it.
>>> I think 6.4% is large enough not to be a random error.
>>>
>>> Is there an example showing the code regression?
>>>
>>> And I'm wondering if keeping the peephole besides the new
>>> define_insn_and_split produces a better result instead of solely
>>> relying
>>> on define_insn_and_split?
>>>
>> I haven't debugged this yet, I'm retesting, if there is still such a
>> big performance gap,
>>
>> I think I need to see the reason.
>>
> The performance drop has nothing to do with this patch. I found that
> the h264 performance compiled
>
> by r14-6787 compared to r14-6421 dropped by 6.4%.
>
>
But there is a problem. My regression test run has the following two
failures (based on r14-6787):
+FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
+FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > The performance drop has nothing to do with this patch. I found that the h264 performance compiled
> > by r14-6787 compared to r14-6421 dropped by 6.4%.
Then I guess we should create a bug report...
> But there is a problem. My regression test has the following two fail items.(based on r14-6787)
> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
Strange. I didn't see them on r14-6650 (with or without the patch).
On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled
> > > by r14-6787 compared to r14-6421 dropped by 6.4%.
>
> Then I guess we should create a bug report...
>
> > But there is a problem. My regression test has the following two fail items.(based on r14-6787)
>
> > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
I guess this is https://gcc.gnu.org/PR28123.
> > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
I'll take a look at this. Maybe it will show up with Binutils trunk (I
just realized I tested this patch with Binutils 2.41, and it's not
sufficient to really test the change).
> Strange. I didn't see them on r14-6650 (with or without the patch).
On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
> On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled
> > > > by r14-6787 compared to r14-6421 dropped by 6.4%.
> >
> > Then I guess we should create a bug report...
> >
> > > But there is a problem. My regression test has the following two fail items.(based on r14-6787)
> >
> > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
>
> I guess this is https://gcc.gnu.org/PR28123.
>
> > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
>
> I'll take a look on this. Maybe it will show up with Binutils trunk (I
> just realized I tested this patch with Binutils 2.41, and it's not
> sufficient to really test the change).
I cannot reproduce the issue on a Gentoo dev machine with Binutils
2.41.50.20231218 and the patch on top of r14-6819. And in my manual
testing (for ruling out the difference caused by default PIE and SSP)
the test also passes:
xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git-
repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os
-fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector
-fno-pie && grep -c mem/v pr86617.c.348r.final
6
Could you recheck with latest GCC master?
在 2023/12/24 下午8:59, Xi Ruoyao 写道:
> On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
>> On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
>>> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
>>>>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled
>>>>> by r14-6787 compared to r14-6421 dropped by 6.4%.
>>> Then I guess we should create a bug report...
>>>
>>>> But there is a problem. My regression test has the following two fail items.(based on r14-6787)
>>>> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
>> I guess this is https://gcc.gnu.org/PR28123.
>>
>>>> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
>> I'll take a look on this. Maybe it will show up with Binutils trunk (I
>> just realized I tested this patch with Binutils 2.41, and it's not
>> sufficient to really test the change).
> I cannot reproduce the issue on a Gentoo dev machine with Binutils
> 2.41.50.20231218 and the patch on top of r14-6819. And in my manual
> testing (for ruling out the difference caused by default PIE and SSP)
> the test also passes:
>
> xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
> build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git-
> repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os
> -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector
> -fno-pie && grep -c mem/v pr86617.c.348r.final
> 6
>
> Could you recheck with latest GCC master?
Ok, I'll test again with the latest code.
>
On Mon, 2023-12-25 at 10:08 +0800, chenglulu wrote:
>
> 在 2023/12/24 下午8:59, Xi Ruoyao 写道:
> > On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote:
> > > On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote:
> > > > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
> > > > > > The performance drop has nothing to do with this patch. I
> > > > > > found that the h264 performance compiled
> > > > > > by r14-6787 compared to r14-6421 dropped by 6.4%.
> > > > Then I guess we should create a bug report...
> > > >
> > > > > But there is a problem. My regression test has the following
> > > > > two fail items.(based on r14-6787)
> > > > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
> > > I guess this is https://gcc.gnu.org/PR28123.
> > >
> > > > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> > > I'll take a look on this. Maybe it will show up with Binutils
> > > trunk (I
> > > just realized I tested this patch with Binutils 2.41, and it's not
> > > sufficient to really test the change).
> > I cannot reproduce the issue on a Gentoo dev machine with Binutils
> > 2.41.50.20231218 and the patch on top of r14-6819. And in my manual
> > testing (for ruling out the difference caused by default PIE and
> > SSP)
> > the test also passes:
> >
> > xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-
> > build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/
> > /home/xry111/git-
> > repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output
> > -Os
> > -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-
> > protector
> > -fno-pie && grep -c mem/v pr86617.c.348r.final
> > 6
> >
> > Could you recheck with latest GCC master?
> Ok, I'll test again with the latest code.
Per https://gcc.gnu.org/pipermail/gcc-patches/2023-December/641407.html
I need to add "&& true" into the split condition. I'll test it and send
V2.
在 2023/12/23 下午6:44, Xi Ruoyao 写道:
> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote:
>>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled
>>> by r14-6787 compared to r14-6421 dropped by 6.4%.
> Then I guess we should create a bug report...
The code h264 score in r14-6818 is the same as that of r14-6421.
>
>> But there is a problem. My regression test has the following two fail items.(based on r14-6787)
>> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors)
>> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
> Strange. I didn't see them on r14-6650 (with or without the patch).
>
+FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
In r14-6818 the issue persists. I kind of chased the code and found that the problem is like this:
volatile unsigned char u8;
void test (void)
{
u8 = u8 + u8;
u8 = u8 - u8;
}
$./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output -Os -fdump-rtl-final -ffat-lto-objects
test.c.301r.outof_cfglayout
(insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ])
(zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi}
(nil))
test.c.302r.split1
(insn 27 6 28 2 (set (reg:DI 98)
(unspec:DI [
(symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1
(nil))
(insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ])
(zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98)
(symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0 S1 A8]))) "volatile.c":5:11 -1
(nil))
The volatile property of the mem here is gone, so the test fails.
On Wed, 2023-12-27 at 11:59 +0800, chenglulu wrote:
> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
>
> In r14-6818 the issue persists. I kind of chased the code and found that the problem is like this:
> volatile unsigned char u8;
>
> void test (void)
> {
> u8 = u8 + u8;
> u8 = u8 - u8;
> }
>
> $./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output -Os -fdump-rtl-final -ffat-lto-objects
>
> test.c.301r.outof_cfglayout
>
> (insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ])
> (zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi}
> (nil))
>
> test.c.302r.split1
>
> (insn 27 6 28 2 (set (reg:DI 98)
> (unspec:DI [
> (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])
> ] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1
> (nil))
> (insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ])
> (zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98)
> (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0 S1 A8]))) "volatile.c":5:11 -1
> (nil))
>
> The volatile property of the mem here is gone, so the test fails.
Phew. I guess I couldn't reproduce it because I have Jeff's ext-dce
patch in my local repo, which removed the zero_extend...
I'll rework this patch.
@@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w"
;;
;; And if the pseudo op cannot be relaxed, we'll get a worse result (with
;; 3 instructions).
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
- (mem:LD_AT_LEAST_32_BIT (match_dup 0)))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0]) \
- || REGNO (operands[0]) == REGNO (operands[2]))"
- [(set (match_dup 2)
- (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
+(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>"
+ [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
+ (mem:LD_AT_LEAST_32_BIT
+ (match_operand:P 1 "symbolic_pcrel_operand" "")))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (match_dup 0)
+ (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
{
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ operands[2] = gen_reg_rtx (Pmode);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
})
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand")
- (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0)
- (match_operand 3 "const_int_operand"))))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0]) \
- || REGNO (operands[0]) == REGNO (operands[2]))"
- [(set (match_dup 2)
- (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))]
+(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>"
+ [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f")
+ (mem:LD_AT_LEAST_32_BIT
+ (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
+ (match_operand 2 "const_int_operand" ""))))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (match_dup 0)
+ (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))]
{
- operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ HOST_WIDE_INT offset = INTVAL (operands[2]);
+ operands[2] = gen_reg_rtx (Pmode);
+ operands[1] = plus_constant (Pmode, operands[1], offset);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
})
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (match_operand:GPR 2 "register_operand")
- (any_extend:GPR (mem:SUBDI (match_dup 0))))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0]) \
- || REGNO (operands[0]) == REGNO (operands[2]))"
- [(set (match_dup 2)
- (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
- (match_dup 1)))))]
+(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
+ [(set (match_operand:GPR 0 "register_operand" "=r")
+ (any_extend:GPR
+ (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (match_dup 0)
+ (any_extend:GPR
+ (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
{
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ operands[2] = gen_reg_rtx (Pmode);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
})
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (match_operand:GPR 2 "register_operand")
+(define_insn_and_split
+ "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>"
+ [(set (match_operand:GPR 0 "register_operand" "=r")
+ (any_extend:GPR
+ (mem:SUBDI
+ (plus (match_operand:P 1 "symbolic_pcrel_operand" "")
+ (match_operand 2 "const_int_operand" "")))))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (match_dup 0)
(any_extend:GPR
- (mem:SUBDI (plus (match_dup 0)
- (match_operand 3 "const_int_operand")))))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0]) \
- || REGNO (operands[0]) == REGNO (operands[2]))"
- [(set (match_dup 2)
- (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0)
- (match_dup 1)))))]
+ (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))]
{
- operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ HOST_WIDE_INT offset = INTVAL (operands[2]);
+ operands[2] = gen_reg_rtx (Pmode);
+ operands[1] = plus_constant (Pmode, operands[1], offset);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1]));
})
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (mem:ST_ANY (match_dup 0))
- (match_operand:ST_ANY 2 "register_operand"))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0])) \
- && REGNO (operands[0]) != REGNO (operands[2])"
- [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
+(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>"
+ [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand"))
+ (match_operand:ST_ANY 1 "register_operand" "r,f"))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))]
{
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ operands[2] = gen_reg_rtx (Pmode);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0]));
})
-(define_peephole2
- [(set (match_operand:P 0 "register_operand")
- (match_operand:P 1 "symbolic_pcrel_operand"))
- (set (mem:ST_ANY (plus (match_dup 0)
- (match_operand 3 "const_int_operand")))
- (match_operand:ST_ANY 2 "register_operand"))]
- "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
- && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \
- && (peep2_reg_dead_p (2, operands[0])) \
- && REGNO (operands[0]) != REGNO (operands[2])"
- [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))]
+(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>"
+ [(set (mem:ST_ANY
+ (plus (match_operand:P 0 "symbolic_pcrel_operand" "")
+ (match_operand 1 "const_int_operand" "")))
+ (match_operand:ST_ANY 2 "register_operand" "r,f"))]
+ "loongarch_pre_reload_split () \
+ && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \
+ && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)"
+ "#"
+ ""
+ [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))]
{
- operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3]));
- emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1]));
+ HOST_WIDE_INT offset = INTVAL (operands[1]);
+ operands[1] = gen_reg_rtx (Pmode);
+ operands[0] = plus_constant (Pmode, operands[0], offset);
+ emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0]));
})
;; Synchronization instructions.
new file mode 100644
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */
+
+float a[8001];
+float
+t (void)
+{
+ return a[0] + a[8000];
+}
+
+/* { dg-final { scan-assembler-not "la.local" } } */