RISC-V: Fix bug of tuple move splitter [PR112561]
Commit Message
Fix segmentation fault on tuple move:
bbl loader
z 0000000000000000 ra 00000000000102ac sp 0000003ffffffaf0 gp 000000000001c0b8
tp 0000000000000000 t0 00000000000104a0 t1 000000000000000f t2 0000000000000000
s0 0000000000000000 s1 0000000000000000 a0 0000003ffffffb30 a1 0000003ffffffb58
a2 0000000000000000 a3 0000000000000000 a4 0000000000000000 a5 000000000001c340
a6 0000000000000004 a7 0000000000000004 s2 0000000000000000 s3 0000000000000000
s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
pc 00000000000101aa va/inst 0000000000000004 sr 8000000200006620
User store segfault @ 0x0000000000000004
PR target/112561
gcc/ChangeLog:
* config/riscv/riscv-v.cc (emit_vlmax_insn_lra): Add VLS optimization.
(expand_tuple_move): Fix bug.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/pr112561.c: New test.
---
gcc/config/riscv/riscv-v.cc | 21 ++++++++++++++++---
.../gcc.target/riscv/rvv/autovec/pr112561.c | 16 ++++++++++++++
2 files changed, 34 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr112561.c
Comments
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 6a2009ffb05..08bbb657a06 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -374,10 +374,24 @@ void
> emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
> {
> gcc_assert (!can_create_pseudo_p ());
> + machine_mode mode = GET_MODE (ops[0]);
>
> - insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
> - e.set_vl (vl);
> - e.emit_insn ((enum insn_code) icode, ops);
> + if (imm_avl_p (mode))
> + {
> + /* Even though VL is a real hardreg already allocated since
> + it is post-RA now, we still benefit from emitting
> + vsetivli zero, imm instead of vsetvli VL, zero, which
> + lets us be more flexible in post-RA instruction scheduling. */
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
> + e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
> + e.emit_insn ((enum insn_code) icode, ops);
> + }
> + else
> + {
> + insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
> + e.set_vl (vl);
> + e.emit_insn ((enum insn_code) icode, ops);
> + }
It's a separate optimization, which should not be included in this patch.
> }
>
> /* Emit an RVV insn with a predefined vector length. Contrary to
> @@ -2148,6 +2162,7 @@ expand_tuple_move (rtx *ops)
> offset = ops[2];
> }
>
> + emit_vlmax_vsetvl (subpart_mode, ops[4]);
I didn't get why we need a vlmax vsetvl here?
We use code_for_pred_mov if subpart_mode is a fractional LMUL mode
and will use a whole register load/store if not fractional LMUL.
So we don't need an explicit vsetvl for the above 2 cases, in my understanding?
I know I must be missing something; do you mind giving a few more explanations?
> if (MEM_P (ops[1]))
> {
> /* Load operations. */
The bug happens here:
https://godbolt.org/z/TbqKv3cWr
Wrong code:
vlseg8e16.v v1,(a5)
vsetvli a7,zero,e16,mf2,ta,ma
vse16.v v1,0(a7)
"a7" register is the issue.
Before split2, in our tuple move pattern:
(define_insn_and_split "*mov<VT:mode>_<P:mode>"
[(set (match_operand:VT 0 "reg_or_mem_operand" "=vr,vr, m")
(match_operand:VT 1 "reg_or_mem_operand" " vr, m,vr"))
(clobber (match_scratch:P 2 "=X,&r,&r"))
(clobber (match_scratch:P 3 "=X,&r,&r"))
(clobber (match_scratch:P 4 "=X,&r,&r"))]
We clobber scalar registers since we may need to emit vsetvli VL,zero.
We don't emit an explicit pattern that sets the clobbered registers.
So we end up with the following RTL:
(insn 133 42 153 2 (set (reg:DI 6 t1 [247])
(reg:DI 17 a7 [251])) "/app/example.c":8:14 206 {*movdi_64bit}
(nil))
(insn 153 133 154 2 (set (reg:DI 16 a6 [231])
(reg:DI 6 t1 [247])) "/app/example.c":8:14 206 {*movdi_64bit}
(nil))
(insn 154 153 155 2 (set (mem:RVVMF2HI (reg:DI 16 a6 [231]) [0 S8 A16])
(if_then_else:RVVMF2HI (unspec:RVVMF32BI [
(const_vector:RVVMF32BI [
(const_int 1 [0x1]) repeated x4
])
(reg:DI 17 a7 [232])
(const_int 2 [0x2]) repeated x2
(const_int 1 [0x1])
(reg:SI 66 vl)
(reg:SI 67 vtype)
] UNSPEC_VPREDICATE)
(reg:RVVMF2HI 97 v1 [orig:162 vect_array.21 ] [162])
(unspec:RVVMF2HI [
(reg:SI 0 zero)
] UNSPEC_VUNDEF))) "/app/example.c":8:14 1711 {*pred_movrvvmf2hi}
(nil))
You can see the memory address is "a6", not "a7".
However, we have the following patterns before:
(insn 133 42 153 2 (set (reg:DI 6 t1 [247])
(reg:DI 17 a7 [251])) "/app/example.c":8:14 206 {*movdi_64bit}
(nil))
(insn 153 133 154 2 (set (reg:DI 16 a6 [231])
(reg:DI 6 t1 [247])) "/app/example.c":8:14 206 {*movdi_64bit}
(nil))
A later pass considers that "a6" can be replaced by "a7".
Then the memory address is changed into "a7", which is wrong.
So we should emit a vsetvl to let GCC know that the AVL "a7" it uses is a different value.
Then the bug is fixed.
But you remind me of one thing: for whole register modes, we don't need this.
So the code should be adjusted:
if (fractional_p)
  emit_vlmax_vsetvl (subpart_mode, ops[4]);
juzhe.zhong@rivai.ai
From: Kito Cheng
Date: 2023-11-17 16:49
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Fix bug of tuple move splitter[PR112561]
I didn't take a closer look at the IRA/LRA dumps yet, but my feeling is
that it may be caused by the earlyclobber modifier not working as expected.
Let me take a closer look tomorrow.
On 11/17/23 07:18, Kito Cheng wrote:
> I didn't take a closer look at the IRA/LRA dumps yet, but my feeling
> is that it may be caused by the earlyclobber modifier not working as expected.
>
> Let me take a closer look tomorrow.
Remember that constraints aren't checked until register allocation. So
the combiner, splitters, etc don't know about "earlyclobber". It's a
relatively common goof.
Not sure if that's playing a role here, but I've seen it happen several
times in the past.
Jeff
Oh, okay, I found IRA/LRA both did the right job. It's just that
we don't use that clobber register correctly (only a use, no def), so
cprop_hardreg thinks it can do the substitution and screws things up. Ju-Zhe has
explained and fixed it the right way; I just didn't get the point
yesterday :P
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-options "-O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax -mcmodel=medlow" } */
+
+int printf(char *, ...);
+int a, b, c, e;
+short d[7][7] = {};
+int main() {
+ short f;
+ c = 0;
+ for (; c <= 6; c++) {
+ e |= d[c][c] & 1;
+ b &= f & 3;
+ }
+ printf("%d\n", a);
+ return 0;
+}