RISC-V: Optimize codegen of VLA SLP
Commit Message
Recently, I figured out a better approach to the codegen for VLA stepped vectors.
Here is a detailed description:
Case 1:
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 37] + 1;
a[i * 8 + 1] = b[i * 8 + 37] + 2;
a[i * 8 + 2] = b[i * 8 + 37] + 3;
a[i * 8 + 3] = b[i * 8 + 37] + 4;
a[i * 8 + 4] = b[i * 8 + 37] + 5;
a[i * 8 + 5] = b[i * 8 + 37] + 6;
a[i * 8 + 6] = b[i * 8 + 37] + 7;
a[i * 8 + 7] = b[i * 8 + 37] + 8;
}
}
We need to generate the stepped vector:
NPATTERNS = 8.
{ 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 }
Before this patch:
vid.v v4 ;; {0,1,2,3,4,5,6,7,...}
vsrl.vi v4,v4,3 ;; {0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,...}
li a3,8 ;; {8}
vmul.vx v4,v4,a3 ;; {0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,...}
After this patch:
vid.v v4 ;; {0,1,2,3,4,5,6,7,...}
vand.vi v4,v4,-8(-NPATTERNS) ;; {0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,...}
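(Not part of the patch -- just a small self-contained C sketch of why the single
AND is enough: for a power-of-two NPATTERNS, rounding an index down to a multiple
of NPATTERNS, which is what the old vsrl + vmul pair computed, is exactly masking
with -NPATTERNS.)

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const int64_t npatterns = 8;	/* Always a power of two in this SLP case.  */
  for (int64_t ix = 0; ix < 64; ix++)
    /* Old sequence: (ix >> log2 (npatterns)) * npatterns, i.e. ix rounded
       down to a multiple of npatterns.
       New sequence: ix & -npatterns.  Both agree for power-of-two npatterns.  */
    assert ((ix >> 3) * npatterns == (ix & -npatterns));
  return 0;
}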
Case 2:
void
f (uint8_t *restrict a, uint8_t *restrict b)
{
for (int i = 0; i < 100; ++i)
{
a[i * 8] = b[i * 8 + 3] + 1;
a[i * 8 + 1] = b[i * 8 + 2] + 2;
a[i * 8 + 2] = b[i * 8 + 1] + 3;
a[i * 8 + 3] = b[i * 8 + 0] + 4;
a[i * 8 + 4] = b[i * 8 + 7] + 5;
a[i * 8 + 5] = b[i * 8 + 6] + 6;
a[i * 8 + 6] = b[i * 8 + 5] + 7;
a[i * 8 + 7] = b[i * 8 + 4] + 8;
}
}
We need to generate the stepped vector:
NPATTERNS = 4.
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, ... }
Before this patch:
li a6,134221824
slli a6,a6,5
addi a6,a6,3 ;; 64-bit: 0x0003000200010000
vmv.v.x v6,a6 ;; {3, 2, 1, 0, ... }
vid.v v4 ;; {0, 1, 2, 3, 4, 5, 6, 7, ... }
vsrl.vi v4,v4,2 ;; {0, 0, 0, 0, 1, 1, 1, 1, ... }
li a3,4 ;; {4}
vmul.vx v4,v4,a3 ;; {0, 0, 0, 0, 4, 4, 4, 4, ... }
vadd.vv v4,v4,v6 ;; {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, ... }
After this patch:
li a3,-536875008
slli a3,a3,4
addi a3,a3,1
slli a3,a3,16
vmv.v.x v2,a3 ;; {3, 1, -1, -3, ... }
vid.v v4 ;; {0, 1, 2, 3, 4, 5, 6, 7, ... }
vadd.vv v4,v4,v2 ;; {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, ... }
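(Again not part of the patch -- a plain C sketch of the new approach. The
per-element difference between the target sequence and vid repeats every
NPATTERNS elements, so a single broadcast constant { 3, 1, -1, -3 } added to
vid rebuilds the whole series.)

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* First NPATTERNS elements of the target { 3, 2, 1, 0, 7, 6, 5, 4, ... }.  */
  const int8_t pattern[4] = { 3, 2, 1, 0 };
  const int npatterns = 4;

  /* diff[i] = pattern[i] - i, i.e. { 3, 1, -1, -3 }, the constant that
     vmv.v.x broadcasts.  */
  int8_t diff[4];
  for (int i = 0; i < npatterns; i++)
    diff[i] = pattern[i] - i;

  /* result = vid + diff reproduces the stepped permutation sequence.  */
  for (int ix = 0; ix < 32; ix++)
    {
      int8_t expected = pattern[ix % npatterns] + (ix / npatterns) * npatterns;
      assert ((int8_t) (ix + diff[ix % npatterns]) == expected);
    }
  return 0;
}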
gcc/ChangeLog:
* config/riscv/riscv-v.cc (expand_const_vector): Optimize codegen.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/partial/slp-1.c: Adapt testcase.
* gcc.target/riscv/rvv/autovec/partial/slp-16.c: New test.
* gcc.target/riscv/rvv/autovec/partial/slp_run-16.c: New test.
---
gcc/config/riscv/riscv-v.cc | 78 ++++++++-----------
.../riscv/rvv/autovec/partial/slp-1.c | 2 +
.../riscv/rvv/autovec/partial/slp-16.c | 24 ++++++
.../riscv/rvv/autovec/partial/slp_run-16.c | 66 ++++++++++++++++
4 files changed, 125 insertions(+), 45 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp-16.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/partial/slp_run-16.c
Comments
Hi Juzhe,
> Case 1:
> void
> f (uint8_t *restrict a, uint8_t *restrict b)
> {
> for (int i = 0; i < 100; ++i)
> {
> a[i * 8] = b[i * 8 + 37] + 1;
> a[i * 8 + 1] = b[i * 8 + 37] + 2;
> a[i * 8 + 2] = b[i * 8 + 37] + 3;
> a[i * 8 + 3] = b[i * 8 + 37] + 4;
> a[i * 8 + 4] = b[i * 8 + 37] + 5;
> a[i * 8 + 5] = b[i * 8 + 37] + 6;
> a[i * 8 + 6] = b[i * 8 + 37] + 7;
> a[i * 8 + 7] = b[i * 8 + 37] + 8;
> }
> }
>
> We need to generate the stepped vector:
> NPATTERNS = 8.
> { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 }
>
> Before this patch:
> vid.v v4 ;; {0,1,2,3,4,5,6,7,...}
> vsrl.vi v4,v4,3 ;; {0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,...}
> li a3,8 ;; {8}
> vmul.vx v4,v4,a3 ;; {0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,...}
>
> After this patch:
> vid.v v4 ;; {0,1,2,3,4,5,6,7,...}
> vand.vi v4,v4,-8(-NPATTERNS) ;; {0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,...}
This is a nice improvement. Even though we're in the SLP realm I would
still add an assert that documents that we're indeed operating with
pow2_p (NPATTERNS) and some comment as to why we can use AND.
Sure we're doing exact_log2 et al later anyway, just to make things
clearer.
> Before this patch:
> li a6,134221824
> slli a6,a6,5
> addi a6,a6,3 ;; 64-bit: 0x0003000200010000
> vmv.v.x v6,a6 ;; {3, 2, 1, 0, ... }
> vid.v v4 ;; {0, 1, 2, 3, 4, 5, 6, 7, ... }
> vsrl.vi v4,v4,2 ;; {0, 0, 0, 0, 1, 1, 1, 1, ... }
> li a3,4 ;; {4}
> vmul.vx v4,v4,a3 ;; {0, 0, 0, 0, 4, 4, 4, 4, ... }
> vadd.vv v4,v4,v6 ;; {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, ... }
>
> After this patch:
> li a3,-536875008
> slli a3,a3,4
> addi a3,a3,1
> slli a3,a3,16
> vmv.v.x v2,a3 ;; {3, 1, -1, -3, ... }
> vid.v v4 ;; {0, 1, 2, 3, 4, 5, 6, 7, ... }
> vadd.vv v4,v4,v2 ;; {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, ... }
My immediate idea would have been to fall back to the first
approach, i.e. create the "0x00030002..." constant
> li a6,134221824
> slli a6,a6,5
> addi a6,a6,3 ;; 64-bit: 0x0003000200010000
> vmv.v.x v6,a6 ;; {3, 2, 1, 0, ... }
and then
vid.v v4
vand.vi v4, v4, -4
vadd.vv v4, v4, v6
It's one more vector instruction though so possibly worse from a latency
standpoint.
Rest looks good to me.
Regards
Robin
> This is a nice improvement. Even though we're in the SLP realm I would
> still add an assert that documents that we're indeed operating with
> pow2_p (NPATTERNS) and some comment as to why we can use AND.
> Sure we're doing exact_log2 et al later anyway, just to make things
> clearer.
Actually no assert necessary, just a comment like:
/* As NPATTERNS is always a power of two, we can ..." */
Regards
Robin
Hi, Robin.
>> Actually no assert necessary, just a comment like:
>> /* As NPATTERNS is always a power of two, we can ..." */
Ok.
>> My immediate idea would have been to fall back to the first
>> approach, i.e. create the "0x00030002..." constant
>> and then
>> vid.v v4
>> vand.vi v4, v4, -4
>> vadd.vv v4, v4, v6
>> It's one more vector instruction though so possibly worse from a latency
>> standpoint.
>> Rest looks good to me.
I have tried this solution and discussed it with my colleague in depth; it clearly consumes 1 more vector insn.
You may argue that this patch needs 1 more scalar insn (slli), which is relatively cheaper than a vector insn.
I prefer this patch's solution.
I have addressed the comments and will send V2.
Thanks.
juzhe.zhong@rivai.ai
Hi, Robin. Can you give me more details on the comment?
What about:
/* As NPATTERNS is always a power of two, we can optimize codegen of
VLA const vector according to this feature. */
Is this comment OK? You know I always struggle with writing English comments and descriptions.
Thanks.
juzhe.zhong@rivai.ai
> + /* Step 2: VID AND -NPATTERNS:
> + { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
> + */
Before that, just add something simple like:
We want to create a pattern where value[ix] = floor (ix / NPATTERNS)
* NPATTERNS.  As NPATTERNS is always a power of two we can rewrite this
as ix & -NPATTERNS.
Regards
Robin
Ok. Just sent V2. I will adjust comment and send V3 again :)
juzhe.zhong@rivai.ai
> Ok. Just sent V2. I will adjust comment and send V3 again :)
Sorry, was too slow.
Regards
Robin
@@ -1128,7 +1128,7 @@ expand_const_vector (rtx target, rtx src)
builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
}
builder.finalize ();
-
+
if (CONST_VECTOR_DUPLICATE_P (src))
{
/* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1
@@ -1204,61 +1204,49 @@ expand_const_vector (rtx target, rtx src)
if (builder.single_step_npatterns_p ())
{
/* Describe the case by choosing NPATTERNS = 4 as an example. */
- rtx base, step;
+ insn_code icode;
+
+ /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
+ rtx vid = gen_reg_rtx (builder.mode ());
+ rtx vid_ops[] = {vid};
+ icode = code_for_pred_series (builder.mode ());
+ emit_vlmax_insn (icode, RVV_MISC_OP, vid_ops);
+
if (builder.npatterns_all_equal_p ())
{
/* Generate the variable-length vector following this rule:
{ a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
E.g. { 0, 0, 8, 8, 16, 16, ... } */
- /* Step 1: Generate base = { 0, 0, 0, 0, 0, 0, 0, ... }. */
- base = expand_vector_broadcast (builder.mode (), builder.elt (0));
+ /* Step 2: VID AND -NPATTERNS:
+ { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
+ */
+ rtx imm
+ = gen_int_mode (-builder.npatterns (), builder.inner_mode ());
+ rtx and_ops[] = {target, vid, imm};
+ icode = code_for_pred_scalar (AND, builder.mode ());
+ emit_vlmax_insn (icode, RVV_BINOP, and_ops);
}
else
{
/* Generate the variable-length vector following this rule:
{ a, b, a, b, a + step, b + step, a + step*2, b + step*2, ...}
- E.g. { 0, 6, 0, 6, 8, 14, 8, 14, 16, 22, 16, 22, ... } */
- /* Step 1: Generate base = { 0, 6, 0, 6, ... }. */
- rvv_builder new_builder (builder.mode (), builder.npatterns (),
- 1);
- for (unsigned int i = 0; i < builder.npatterns (); ++i)
- new_builder.quick_push (builder.elt (i));
- rtx new_vec = new_builder.build ();
- base = gen_reg_rtx (builder.mode ());
- emit_move_insn (base, new_vec);
+ E.g. { 3, 2, 1, 0, 7, 6, 5, 4, ... } */
+ /* Step 2: Generate diff = TARGET - VID:
+ { 3-0, 2-1, 1-2, 0-3, 7-4, 6-5, 5-6, 4-7, ... }*/
+ rvv_builder v (builder.mode (), builder.npatterns (), 1);
+ for (unsigned int i = 0; i < v.npatterns (); ++i)
+ {
+ /* Calculate the diff between the target sequence and
+ vid sequence. */
+ HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
+ v.quick_push (gen_int_mode (diff, v.inner_mode ()));
+ }
+ /* Step 3: Generate result = VID + diff. */
+ rtx vec = v.build ();
+ rtx add_ops[] = {target, vid, vec};
+ emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BINOP,
+ add_ops);
}
-
- /* Step 2: Generate step = gen_int_mode (diff, mode). */
- poly_int64 value1 = rtx_to_poly_int64 (builder.elt (0));
- poly_int64 value2
- = rtx_to_poly_int64 (builder.elt (builder.npatterns ()));
- poly_int64 diff = value2 - value1;
- step = gen_int_mode (diff, builder.inner_mode ());
-
- /* Step 3: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */
- rtx vid = gen_reg_rtx (builder.mode ());
- rtx op[] = {vid};
- emit_vlmax_insn (code_for_pred_series (builder.mode ()), RVV_MISC_OP,
- op);
-
- /* Step 4: Generate factor = { 0, 0, 0, 0, 1, 1, 1, 1, ... }. */
- rtx factor = gen_reg_rtx (builder.mode ());
- rtx shift_ops[]
- = {factor, vid,
- gen_int_mode (exact_log2 (builder.npatterns ()), Pmode)};
- emit_vlmax_insn (code_for_pred_scalar (LSHIFTRT, builder.mode ()),
- RVV_BINOP, shift_ops);
-
- /* Step 5: Generate adjusted step = { 0, 0, 0, 0, diff, diff, ... } */
- rtx adjusted_step = gen_reg_rtx (builder.mode ());
- rtx mul_ops[] = {adjusted_step, factor, step};
- emit_vlmax_insn (code_for_pred_scalar (MULT, builder.mode ()),
- RVV_BINOP, mul_ops);
-
- /* Step 6: Generate the final result. */
- rtx add_ops[] = {target, base, adjusted_step};
- emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), RVV_BINOP,
- add_ops);
}
else
/* TODO: We will enable more variable-length vector in the future. */
@@ -20,3 +20,5 @@ f (int8_t *restrict a, int8_t *restrict b, int n)
}
/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
+/* { dg-final { scan-assembler {\tvid\.v} } } */
+/* { dg-final { scan-assembler {\tvand} } } */
new file mode 100644
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d --param riscv-autovec-preference=scalable -fdump-tree-optimized-details" } */
+
+#include <stdint-gcc.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 2] + 2;
+ a[i * 8 + 2] = b[i * 8 + 1] + 3;
+ a[i * 8 + 3] = b[i * 8 + 0] + 4;
+ a[i * 8 + 4] = b[i * 8 + 7] + 5;
+ a[i * 8 + 5] = b[i * 8 + 6] + 6;
+ a[i * 8 + 6] = b[i * 8 + 5] + 7;
+ a[i * 8 + 7] = b[i * 8 + 4] + 8;
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "\.VEC_PERM" 1 "optimized" } } */
+/* { dg-final { scan-assembler {\tvid\.v} } } */
+/* { dg-final { scan-assembler-not {\tvmul} } } */
new file mode 100644
@@ -0,0 +1,66 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param riscv-autovec-preference=scalable" } */
+
+#include "slp-16.c"
+
+#define LIMIT 128
+void __attribute__ ((optimize (0)))
+f_golden (int8_t *restrict a, int8_t *restrict b, int n)
+{
+ for (int i = 0; i < n; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 2] + 2;
+ a[i * 8 + 2] = b[i * 8 + 1] + 3;
+ a[i * 8 + 3] = b[i * 8 + 0] + 4;
+ a[i * 8 + 4] = b[i * 8 + 7] + 5;
+ a[i * 8 + 5] = b[i * 8 + 6] + 6;
+ a[i * 8 + 6] = b[i * 8 + 5] + 7;
+ a[i * 8 + 7] = b[i * 8 + 4] + 8;
+ }
+}
+
+int
+main (void)
+{
+#define RUN(NUM) \
+ int8_t a_##NUM[NUM * 8 + 8] = {0}; \
+ int8_t a_golden_##NUM[NUM * 8 + 8] = {0}; \
+ int8_t b_##NUM[NUM * 8 + 8] = {0}; \
+ for (int i = 0; i < NUM * 8 + 8; i++) \
+ { \
+ if (i % NUM == 0) \
+ b_##NUM[i] = (i + NUM) % LIMIT; \
+ else \
+ b_##NUM[i] = (i - NUM) % (-LIMIT); \
+ } \
+ f (a_##NUM, b_##NUM, NUM); \
+ f_golden (a_golden_##NUM, b_##NUM, NUM); \
+ for (int i = 0; i < NUM * 8 + 8; i++) \
+ { \
+ if (a_##NUM[i] != a_golden_##NUM[i]) \
+ __builtin_abort (); \
+ }
+
+ RUN (3);
+ RUN (5);
+ RUN (15);
+ RUN (16);
+ RUN (17);
+ RUN (31);
+ RUN (32);
+ RUN (33);
+ RUN (63);
+ RUN (64);
+ RUN (65);
+ RUN (127);
+ RUN (128);
+ RUN (129);
+ RUN (239);
+ RUN (359);
+ RUN (498);
+ RUN (799);
+ RUN (977);
+ RUN (5789);
+ return 0;
+}