[V2,0/7] ira/lra: Support subreg coalesce

Message ID 20231112095858.3669003-1-lehua.ding@rivai.ai
Headers
Series ira/lra: Support subreg coalesce |

Message

Lehua Ding Nov. 12, 2023, 9:58 a.m. UTC
  Hi,

These patchs try to support subreg coalesce feature in
register allocation passes (ira and lra).

Let's consider a RISC-V program (https://godbolt.org/z/ec51d91aT):

```
#include <riscv_vector.h>

void
foo (int32_t *in, int32_t *out, size_t m)
{
  vint32m2_t result = __riscv_vle32_v_i32m2 (in, 32);
  vint32m1_t v0 = __riscv_vget_v_i32m2_i32m1 (result, 0);
  vint32m1_t v1 = __riscv_vget_v_i32m2_i32m1 (result, 1);
  for (size_t i = 0; i < m; i++)
    {
      v0 = __riscv_vadd_vv_i32m1(v0, v0, 4);
      v1 = __riscv_vmul_vv_i32m1(v1, v1, 4);
    }
  *(vint32m1_t*)(out+4*0) = v0;
  *(vint32m1_t*)(out+4*1) = v1;
}
```

Before these patchs:

```
foo:
	li	a5,32
	vsetvli	zero,a5,e32,m2,ta,ma
	vle32.v	v4,0(a0)
	vmv1r.v	v2,v4
	vmv1r.v	v1,v5
	beq	a2,zero,.L2
	li	a5,0
	vsetivli	zero,4,e32,m1,ta,ma
.L3:
	addi	a5,a5,1
	vadd.vv	v2,v2,v2
	vmul.vv	v1,v1,v1
	bne	a2,a5,.L3
.L2:
	vs1r.v	v2,0(a1)
	addi	a1,a1,16
	vs1r.v	v1,0(a1)
	ret
```

After these patchs:

```
foo:
	li	a5,32
	vsetvli	zero,a5,e32,m2,ta,ma
	vle32.v	v2,0(a0)
	beq	a2,zero,.L2
	li	a5,0
	vsetivli	zero,4,e32,m1,ta,ma
.L3:
	addi	a5,a5,1
	vadd.vv	v2,v2,v2
	vmul.vv	v3,v3,v3
	bne	a2,a5,.L3
.L2:
	vs1r.v	v2,0(a1)
	addi	a1,a1,16
	vs1r.v	v3,0(a1)
	ret
```

As you can see, the two redundant vmv1r.v instructions were removed.
The reason for the two redundant vmv1r.v instructions is because
the current ira pass is being conservative in calculating the live
range of pseduo registers that occupy multil hardregs. As in the
following two RTL instructions. Where r134 occupies two physical
registers and r135 and r136 occupy one physical register.
At insn 12 point, ira considers the entire r134 pseudo register
to be live, so r135 is in conflict with r134, as shown in the ira
dump info. Then when the physical registers are allocated, r135 and
r134 are allocated first because they are inside the loop body and
have higher priority. This makes it difficult to assign r136 to
overlap with r134, i.e., to assign r136 to hr100, thus eliminating
the need for the vmv1r.v instruction. Thus two vmv1r.v instructions
appear.

If we refine the live information of r134 to the case of each subreg,
we can remove this conflict. We can then create copies of the set
with subreg reference, thus increasing the priority of the r134 allocation,
which allow registers with bigger alignment requirements to prioritize
the allocation of physical registers. In RVV, pseudo registers occupying
two physical registers need to be time-2 aligned.

```
(insn 11 10 12 2 (set (reg/v:RVVM1SI 135 [ v0 ])
        (subreg:RVVM1SI (reg/v:RVVM2SI 134 [ result ]) 0)) "/app/example.c":7:19 998 {*movrvvm1si_whole}
     (nil))
(insn 12 11 13 2 (set (reg/v:RVVM1SI 136 [ v1 ])
        (subreg:RVVM1SI (reg/v:RVVM2SI 134 [ result ]) [16, 16])) "/app/example.c":8:19 998 {*movrvvm1si_whole}
     (expr_list:REG_DEAD (reg/v:RVVM2SI 134 [ result ])
        (nil)))
```

ira dump:

;; a1(r136,l0) conflicts: a3(r135,l0)
;;     total conflict hard regs:
;;     conflict hard regs:
;; a3(r135,l0) conflicts: a1(r136,l0) a6(r134,l0)
;;     total conflict hard regs:
;;     conflict hard regs:
;; a6(r134,l0) conflicts: a3(r135,l0)
;;     total conflict hard regs:
;;     conflict hard regs:
;;
;; ...
      Popping a1(r135,l0)  --         assign reg 97
      Popping a3(r136,l0)  --         assign reg 98
      Popping a4(r137,l0)  --         assign reg 15
      Popping a5(r140,l0)  --         assign reg 12
      Popping a10(r145,l0)  --         assign reg 12
      Popping a2(r139,l0)  --         assign reg 11
      Popping a9(r144,l0)  --         assign reg 11
      Popping a0(r142,l0)  --         assign reg 11
      Popping a6(r134,l0)  --         assign reg 100
      Popping a7(r143,l0)  --         assign reg 10
      Popping a8(r141,l0)  --         assign reg 15

The AArch64 SVE has the same problem. Consider the following
code (https://godbolt.org/z/MYrK7Ghaj):

```
#include <arm_sve.h>

int bar (svbool_t pg, int64_t* base, int n, int64_t *in1, int64_t *in2, int64_t*out)
{
  svint64x4_t result = svld4_s64 (pg, base);
  svint64_t v0 = svget4_s64(result, 0);
  svint64_t v1 = svget4_s64(result, 1);
  svint64_t v2 = svget4_s64(result, 2);
  svint64_t v3 = svget4_s64(result, 3);

  for (int i = 0; i < n; i += 1)
    {
        svint64_t v18 = svld1_s64(pg, in1);
        svint64_t v19 = svld1_s64(pg, in2);
        v0 = svmad_s64_z(pg, v0, v18, v19);
        v1 = svmad_s64_z(pg, v1, v18, v19);
        v2 = svmad_s64_z(pg, v2, v18, v19);
        v3 = svmad_s64_z(pg, v3, v18, v19);
    }
  svst1_s64(pg, out+0,v0);
  svst1_s64(pg, out+1,v1);
  svst1_s64(pg, out+2,v2);
  svst1_s64(pg, out+3,v3);
}
```

Before these patchs:

```
bar:
	ld4d	{z4.d - z7.d}, p0/z, [x0]
	mov	z26.d, z4.d
	mov	z27.d, z5.d
	mov	z28.d, z6.d
	mov	z29.d, z7.d
	cmp	w1, 0
	...
```

After these patchs:

```
bar:
	ld4d	{z28.d - z31.d}, p0/z, [x0]
	cmp	w1, 0
	...
```

Lehua Ding (7):
  df: Add DF_LIVE_SUBREG problem
  ira: Switch to live_subreg data
  ira: Support subreg live range track
  ira: Support subreg copy
  ira: Add all nregs >= 2 pseudos to tracke subreg list
  lra: Switch to live_subreg data flow
  lra: Support subreg live range track and conflict detect

 gcc/Makefile.in          |   1 +
 gcc/df-problems.cc       | 889 ++++++++++++++++++++++++++++++++++++++-
 gcc/df.h                 |  67 +++
 gcc/hard-reg-set.h       |  33 ++
 gcc/ira-build.cc         | 456 ++++++++++++++++----
 gcc/ira-color.cc         | 851 ++++++++++++++++++++++++++-----------
 gcc/ira-conflicts.cc     | 221 +++++++---
 gcc/ira-emit.cc          |  24 +-
 gcc/ira-int.h            |  67 ++-
 gcc/ira-lives.cc         | 507 ++++++++++++++++------
 gcc/ira.cc               |  73 ++--
 gcc/lra-assigns.cc       | 111 ++++-
 gcc/lra-coalesce.cc      |  20 +-
 gcc/lra-constraints.cc   | 111 +++--
 gcc/lra-int.h            |  33 ++
 gcc/lra-lives.cc         | 660 ++++++++++++++++++++++++-----
 gcc/lra-remat.cc         |  13 +-
 gcc/lra-spills.cc        |  22 +-
 gcc/lra.cc               | 139 +++++-
 gcc/regs.h               |   7 +
 gcc/subreg-live-range.cc | 628 +++++++++++++++++++++++++++
 gcc/subreg-live-range.h  | 333 +++++++++++++++
 gcc/timevar.def          |   1 +
 23 files changed, 4490 insertions(+), 777 deletions(-)
 create mode 100644 gcc/subreg-live-range.cc
 create mode 100644 gcc/subreg-live-range.h
  

Comments

Lehua Ding Nov. 12, 2023, 12:08 p.m. UTC | #1
These patches found a new bug and I resend a v3 version, I'm sorry about 
this.

V3: https://gcc.gnu.org/pipermail/gcc-patches/2023-November/636178.html

On 2023/11/12 17:58, Lehua Ding wrote:
> Hi,
> 
> These patchs try to support subreg coalesce feature in
> register allocation passes (ira and lra).
> 
> Let's consider a RISC-V program (https://godbolt.org/z/ec51d91aT):
> 
> ```
> #include <riscv_vector.h>
> 
> void
> foo (int32_t *in, int32_t *out, size_t m)
> {
>    vint32m2_t result = __riscv_vle32_v_i32m2 (in, 32);
>    vint32m1_t v0 = __riscv_vget_v_i32m2_i32m1 (result, 0);
>    vint32m1_t v1 = __riscv_vget_v_i32m2_i32m1 (result, 1);
>    for (size_t i = 0; i < m; i++)
>      {
>        v0 = __riscv_vadd_vv_i32m1(v0, v0, 4);
>        v1 = __riscv_vmul_vv_i32m1(v1, v1, 4);
>      }
>    *(vint32m1_t*)(out+4*0) = v0;
>    *(vint32m1_t*)(out+4*1) = v1;
> }
> ```
> 
> Before these patchs:
> 
> ```
> foo:
> 	li	a5,32
> 	vsetvli	zero,a5,e32,m2,ta,ma
> 	vle32.v	v4,0(a0)
> 	vmv1r.v	v2,v4
> 	vmv1r.v	v1,v5
> 	beq	a2,zero,.L2
> 	li	a5,0
> 	vsetivli	zero,4,e32,m1,ta,ma
> .L3:
> 	addi	a5,a5,1
> 	vadd.vv	v2,v2,v2
> 	vmul.vv	v1,v1,v1
> 	bne	a2,a5,.L3
> .L2:
> 	vs1r.v	v2,0(a1)
> 	addi	a1,a1,16
> 	vs1r.v	v1,0(a1)
> 	ret
> ```
> 
> After these patchs:
> 
> ```
> foo:
> 	li	a5,32
> 	vsetvli	zero,a5,e32,m2,ta,ma
> 	vle32.v	v2,0(a0)
> 	beq	a2,zero,.L2
> 	li	a5,0
> 	vsetivli	zero,4,e32,m1,ta,ma
> .L3:
> 	addi	a5,a5,1
> 	vadd.vv	v2,v2,v2
> 	vmul.vv	v3,v3,v3
> 	bne	a2,a5,.L3
> .L2:
> 	vs1r.v	v2,0(a1)
> 	addi	a1,a1,16
> 	vs1r.v	v3,0(a1)
> 	ret
> ```
> 
> As you can see, the two redundant vmv1r.v instructions were removed.
> The reason for the two redundant vmv1r.v instructions is because
> the current ira pass is being conservative in calculating the live
> range of pseduo registers that occupy multil hardregs. As in the
> following two RTL instructions. Where r134 occupies two physical
> registers and r135 and r136 occupy one physical register.
> At insn 12 point, ira considers the entire r134 pseudo register
> to be live, so r135 is in conflict with r134, as shown in the ira
> dump info. Then when the physical registers are allocated, r135 and
> r134 are allocated first because they are inside the loop body and
> have higher priority. This makes it difficult to assign r136 to
> overlap with r134, i.e., to assign r136 to hr100, thus eliminating
> the need for the vmv1r.v instruction. Thus two vmv1r.v instructions
> appear.
> 
> If we refine the live information of r134 to the case of each subreg,
> we can remove this conflict. We can then create copies of the set
> with subreg reference, thus increasing the priority of the r134 allocation,
> which allow registers with bigger alignment requirements to prioritize
> the allocation of physical registers. In RVV, pseudo registers occupying
> two physical registers need to be time-2 aligned.
> 
> ```
> (insn 11 10 12 2 (set (reg/v:RVVM1SI 135 [ v0 ])
>          (subreg:RVVM1SI (reg/v:RVVM2SI 134 [ result ]) 0)) "/app/example.c":7:19 998 {*movrvvm1si_whole}
>       (nil))
> (insn 12 11 13 2 (set (reg/v:RVVM1SI 136 [ v1 ])
>          (subreg:RVVM1SI (reg/v:RVVM2SI 134 [ result ]) [16, 16])) "/app/example.c":8:19 998 {*movrvvm1si_whole}
>       (expr_list:REG_DEAD (reg/v:RVVM2SI 134 [ result ])
>          (nil)))
> ```
> 
> ira dump:
> 
> ;; a1(r136,l0) conflicts: a3(r135,l0)
> ;;     total conflict hard regs:
> ;;     conflict hard regs:
> ;; a3(r135,l0) conflicts: a1(r136,l0) a6(r134,l0)
> ;;     total conflict hard regs:
> ;;     conflict hard regs:
> ;; a6(r134,l0) conflicts: a3(r135,l0)
> ;;     total conflict hard regs:
> ;;     conflict hard regs:
> ;;
> ;; ...
>        Popping a1(r135,l0)  --         assign reg 97
>        Popping a3(r136,l0)  --         assign reg 98
>        Popping a4(r137,l0)  --         assign reg 15
>        Popping a5(r140,l0)  --         assign reg 12
>        Popping a10(r145,l0)  --         assign reg 12
>        Popping a2(r139,l0)  --         assign reg 11
>        Popping a9(r144,l0)  --         assign reg 11
>        Popping a0(r142,l0)  --         assign reg 11
>        Popping a6(r134,l0)  --         assign reg 100
>        Popping a7(r143,l0)  --         assign reg 10
>        Popping a8(r141,l0)  --         assign reg 15
> 
> The AArch64 SVE has the same problem. Consider the following
> code (https://godbolt.org/z/MYrK7Ghaj):
> 
> ```
> #include <arm_sve.h>
> 
> int bar (svbool_t pg, int64_t* base, int n, int64_t *in1, int64_t *in2, int64_t*out)
> {
>    svint64x4_t result = svld4_s64 (pg, base);
>    svint64_t v0 = svget4_s64(result, 0);
>    svint64_t v1 = svget4_s64(result, 1);
>    svint64_t v2 = svget4_s64(result, 2);
>    svint64_t v3 = svget4_s64(result, 3);
> 
>    for (int i = 0; i < n; i += 1)
>      {
>          svint64_t v18 = svld1_s64(pg, in1);
>          svint64_t v19 = svld1_s64(pg, in2);
>          v0 = svmad_s64_z(pg, v0, v18, v19);
>          v1 = svmad_s64_z(pg, v1, v18, v19);
>          v2 = svmad_s64_z(pg, v2, v18, v19);
>          v3 = svmad_s64_z(pg, v3, v18, v19);
>      }
>    svst1_s64(pg, out+0,v0);
>    svst1_s64(pg, out+1,v1);
>    svst1_s64(pg, out+2,v2);
>    svst1_s64(pg, out+3,v3);
> }
> ```
> 
> Before these patchs:
> 
> ```
> bar:
> 	ld4d	{z4.d - z7.d}, p0/z, [x0]
> 	mov	z26.d, z4.d
> 	mov	z27.d, z5.d
> 	mov	z28.d, z6.d
> 	mov	z29.d, z7.d
> 	cmp	w1, 0
> 	...
> ```
> 
> After these patchs:
> 
> ```
> bar:
> 	ld4d	{z28.d - z31.d}, p0/z, [x0]
> 	cmp	w1, 0
> 	...
> ```
> 
> Lehua Ding (7):
>    df: Add DF_LIVE_SUBREG problem
>    ira: Switch to live_subreg data
>    ira: Support subreg live range track
>    ira: Support subreg copy
>    ira: Add all nregs >= 2 pseudos to tracke subreg list
>    lra: Switch to live_subreg data flow
>    lra: Support subreg live range track and conflict detect
> 
>   gcc/Makefile.in          |   1 +
>   gcc/df-problems.cc       | 889 ++++++++++++++++++++++++++++++++++++++-
>   gcc/df.h                 |  67 +++
>   gcc/hard-reg-set.h       |  33 ++
>   gcc/ira-build.cc         | 456 ++++++++++++++++----
>   gcc/ira-color.cc         | 851 ++++++++++++++++++++++++++-----------
>   gcc/ira-conflicts.cc     | 221 +++++++---
>   gcc/ira-emit.cc          |  24 +-
>   gcc/ira-int.h            |  67 ++-
>   gcc/ira-lives.cc         | 507 ++++++++++++++++------
>   gcc/ira.cc               |  73 ++--
>   gcc/lra-assigns.cc       | 111 ++++-
>   gcc/lra-coalesce.cc      |  20 +-
>   gcc/lra-constraints.cc   | 111 +++--
>   gcc/lra-int.h            |  33 ++
>   gcc/lra-lives.cc         | 660 ++++++++++++++++++++++++-----
>   gcc/lra-remat.cc         |  13 +-
>   gcc/lra-spills.cc        |  22 +-
>   gcc/lra.cc               | 139 +++++-
>   gcc/regs.h               |   7 +
>   gcc/subreg-live-range.cc | 628 +++++++++++++++++++++++++++
>   gcc/subreg-live-range.h  | 333 +++++++++++++++
>   gcc/timevar.def          |   1 +
>   23 files changed, 4490 insertions(+), 777 deletions(-)
>   create mode 100644 gcc/subreg-live-range.cc
>   create mode 100644 gcc/subreg-live-range.h
>