[V2] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of both VLA && VLS auto-vectorization.
Checks
Commit Message
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
This patch optimizes both RVV VLA && VLS vectorization.
Consider the following case:
void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
for (int i = 0; i < count; ++i)
dst[i] = op1[i] + op2[i];
}
VLA:
Before this patch:
ble a3,zero,.L1
srli a4,a1,2
negw a4,a4
andi a5,a4,3
sext.w a3,a3
beq a5,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a4,a4,2
addw a6,a6,a7
sw a6,0(a0)
beq a4,zero,.L3
lw a7,4(a1)
lw a4,4(a2)
li a6,3
addw a4,a4,a7
sw a4,4(a0)
bne a5,a6,.L3
lw a6,8(a2)
lw a4,8(a1)
addw a4,a4,a6
sw a4,8(a0)
.L3:
subw a3,a3,a5
slli a4,a3,32
csrr a6,vlenb
srli a4,a4,32
srli a6,a6,2
slli a3,a5,2
mv a5,a4
bgtu a4,a6,.L17
.L5:
csrr a6,vlenb
add a1,a1,a3
add a2,a2,a3
add a0,a0,a3
srli a7,a6,2
li a3,0
.L8:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetvli t1,zero,e32,m1,ta,ma
add a3,a3,a7
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
mv a5,a4
bleu a4,a3,.L6
mv a5,a3
.L6:
sub a5,a4,a5
bleu a5,a7,.L7
mv a5,a7
.L7:
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
bne a5,zero,.L8
.L1:
ret
.L17:
mv a5,a6
j .L5
After this patch:
f:
ble a3,zero,.L1
csrr a4,vlenb
srli a4,a4,2
mv a5,a3
bgtu a3,a4,.L9
.L3:
csrr a6,vlenb
li a4,0
srli a7,a6,2
.L6:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetvli t1,zero,e32,m1,ta,ma
add a4,a4,a7
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
mv a5,a3
bleu a3,a4,.L4
mv a5,a4
.L4:
sub a5,a3,a5
bleu a5,a7,.L5
mv a5,a7
.L5:
add a0,a0,a6
add a2,a2,a6
add a1,a1,a6
bne a5,zero,.L6
.L1:
ret
.L9:
mv a5,a4
j .L3
VLS:
Before this patch:
f3:
ble a3,zero,.L1
srli a5,a1,2
negw a5,a5
andi a4,a5,3
sext.w a3,a3
beq a4,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a5,a5,2
addw a6,a6,a7
sw a6,0(a0)
beq a5,zero,.L3
lw a7,4(a1)
lw a5,4(a2)
li a6,3
addw a5,a5,a7
sw a5,4(a0)
bne a4,a6,.L3
lw a6,8(a2)
lw a5,8(a1)
addw a5,a5,a6
sw a5,8(a0)
.L3:
subw a3,a3,a4
slli a6,a4,2
slli a5,a3,32
srli a5,a5,32
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
li a3,4
.L6:
mv a4,a5
bleu a5,a3,.L5
li a4,4
.L5:
vsetvli zero,a4,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a5,a5,a4
vadd.vv v1,v1,v2
vsetvli zero,a4,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a1,a1,16
addi a2,a2,16
addi a0,a0,16
bne a5,zero,.L6
.L1:
ret
After this patch:
f3:
ble a3,zero,.L1
li a4,4
.L4:
mv a5,a3
bleu a3,a4,.L3
li a5,4
.L3:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a3,a3,a5
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a2,a2,16
addi a0,a0,16
addi a1,a1,16
bne a3,zero,.L4
.L1:
ret
gcc/ChangeLog:
* config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
(TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
* gcc.target/riscv/rvv/autovec/align-1.c: New test.
* gcc.target/riscv/rvv/autovec/align-2.c: New test.
---
gcc/config/riscv/riscv.cc | 14 ++++++++++++++
.../gcc.target/riscv/rvv/autovec/align-1.c | 12 ++++++++++++
.../gcc.target/riscv/rvv/autovec/align-2.c | 12 ++++++++++++
.../riscv/rvv/autovec/binop/shift-rv32gcv.c | 10 ++++++----
4 files changed, 44 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
Comments
LGTM
<juzhe.zhong@rivai.ai> 於 2023年5月15日 週一 11:16 寫道:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch optimizes both RVV VLA && VLS vectorization.
>
> Consider this following case:
> void __attribute__((noinline, noclone))
> f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> {
> for (int i = 0; i < count; ++i)
> dst[i] = op1[i] + op2[i];
> }
>
> VLA:
> Before this patch:
> ble a3,zero,.L1
> srli a4,a1,2
> negw a4,a4
> andi a5,a4,3
> sext.w a3,a3
> beq a5,zero,.L3
> lw a7,0(a1)
> lw a6,0(a2)
> andi a4,a4,2
> addw a6,a6,a7
> sw a6,0(a0)
> beq a4,zero,.L3
> lw a7,4(a1)
> lw a4,4(a2)
> li a6,3
> addw a4,a4,a7
> sw a4,4(a0)
> bne a5,a6,.L3
> lw a6,8(a2)
> lw a4,8(a1)
> addw a4,a4,a6
> sw a4,8(a0)
> .L3:
> subw a3,a3,a5
> slli a4,a3,32
> csrr a6,vlenb
> srli a4,a4,32
> srli a6,a6,2
> slli a3,a5,2
> mv a5,a4
> bgtu a4,a6,.L17
> .L5:
> csrr a6,vlenb
> add a1,a1,a3
> add a2,a2,a3
> add a0,a0,a3
> srli a7,a6,2
> li a3,0
> .L8:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v1,0(a1)
> vle32.v v2,0(a2)
> vsetvli t1,zero,e32,m1,ta,ma
> add a3,a3,a7
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> mv a5,a4
> bleu a4,a3,.L6
> mv a5,a3
> .L6:
> sub a5,a4,a5
> bleu a5,a7,.L7
> mv a5,a7
> .L7:
> add a1,a1,a6
> add a2,a2,a6
> add a0,a0,a6
> bne a5,zero,.L8
> .L1:
> ret
> .L17:
> mv a5,a6
> j .L5
>
> After this patch:
> f:
> ble a3,zero,.L1
> csrr a4,vlenb
> srli a4,a4,2
> mv a5,a3
> bgtu a3,a4,.L9
> .L3:
> csrr a6,vlenb
> li a4,0
> srli a7,a6,2
> .L6:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v2,0(a1)
> vle32.v v1,0(a2)
> vsetvli t1,zero,e32,m1,ta,ma
> add a4,a4,a7
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> mv a5,a3
> bleu a3,a4,.L4
> mv a5,a4
> .L4:
> sub a5,a3,a5
> bleu a5,a7,.L5
> mv a5,a7
> .L5:
> add a0,a0,a6
> add a2,a2,a6
> add a1,a1,a6
> bne a5,zero,.L6
> .L1:
> ret
> .L9:
> mv a5,a4
> j .L3
>
> VLS:
> Before this patch:
> f3:
> ble a3,zero,.L1
> srli a5,a1,2
> negw a5,a5
> andi a4,a5,3
> sext.w a3,a3
> beq a4,zero,.L3
> lw a7,0(a1)
> lw a6,0(a2)
> andi a5,a5,2
> addw a6,a6,a7
> sw a6,0(a0)
> beq a5,zero,.L3
> lw a7,4(a1)
> lw a5,4(a2)
> li a6,3
> addw a5,a5,a7
> sw a5,4(a0)
> bne a4,a6,.L3
> lw a6,8(a2)
> lw a5,8(a1)
> addw a5,a5,a6
> sw a5,8(a0)
> .L3:
> subw a3,a3,a4
> slli a6,a4,2
> slli a5,a3,32
> srli a5,a5,32
> add a1,a1,a6
> add a2,a2,a6
> add a0,a0,a6
> li a3,4
> .L6:
> mv a4,a5
> bleu a5,a3,.L5
> li a4,4
> .L5:
> vsetvli zero,a4,e32,m1,ta,ma
> vle32.v v1,0(a1)
> vle32.v v2,0(a2)
> vsetivli zero,4,e32,m1,ta,ma
> sub a5,a5,a4
> vadd.vv v1,v1,v2
> vsetvli zero,a4,e32,m1,ta,ma
> vse32.v v1,0(a0)
> addi a1,a1,16
> addi a2,a2,16
> addi a0,a0,16
> bne a5,zero,.L6
> .L1:
> ret
>
> After this patch:
> f3:
> ble a3,zero,.L1
> li a4,4
> .L4:
> mv a5,a3
> bleu a3,a4,.L3
> li a5,4
> .L3:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v2,0(a1)
> vle32.v v1,0(a2)
> vsetivli zero,4,e32,m1,ta,ma
> sub a3,a3,a5
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> addi a2,a2,16
> addi a0,a0,16
> addi a1,a1,16
> bne a3,zero,.L4
> .L1:
> ret
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc
> (riscv_vectorize_preferred_vector_alignment): New function.
> (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt
> testcase.
> * gcc.target/riscv/rvv/autovec/align-1.c: New test.
> * gcc.target/riscv/rvv/autovec/align-2.c: New test.
>
> ---
> gcc/config/riscv/riscv.cc | 14 ++++++++++++++
> .../gcc.target/riscv/rvv/autovec/align-1.c | 12 ++++++++++++
> .../gcc.target/riscv/rvv/autovec/align-2.c | 12 ++++++++++++
> .../riscv/rvv/autovec/binop/shift-rv32gcv.c | 10 ++++++----
> 4 files changed, 44 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index de578b5b899..a770fdfaa0e 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7499,6 +7499,16 @@ riscv_preferred_simd_mode (scalar_mode mode)
> return word_mode;
> }
>
> +/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
> +
> +static poly_uint64
> +riscv_vectorize_preferred_vector_alignment (const_tree type)
> +{
> + if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
> + return TYPE_ALIGN (TREE_TYPE (type));
> + return TYPE_ALIGN (type);
> +}
> +
> /* Initialize the GCC target structure. */
> #undef TARGET_ASM_ALIGNED_HI_OP
> #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -7771,6 +7781,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
> #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
> #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
>
> +#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
> +#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
> + riscv_vectorize_preferred_vector_alignment
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-riscv.h"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> new file mode 100644
> index 00000000000..14201e1f7e0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param
> riscv-autovec-preference=scalable" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> +{
> + for (int i = 0; i < count; ++i)
> + dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> new file mode 100644
> index 00000000000..812584e9d25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param
> riscv-autovec-preference=fixed-vlmax" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> +{
> + for (int i = 0; i < count; ++i)
> + dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> index da0f79a1cf0..d98100b3276 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> @@ -4,8 +4,10 @@
> #include "shift-template.h"
>
> /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
> - Therefore, expect only 4 vsll.vv instead of 6 for now. */
> + We don't check the assembler number since lacking patterns make
> + auto-vectorization inconsistent in LMUL = 1/2/4/8. */
> +
> +/* { dg-final { scan-assembler {\tvsll\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsra\.vv} } } */
>
> -/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
> -/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
> -/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
> --
> 2.36.1
>
>
>
Thanks. Can you take a look at this patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618398.html
That patch has already gone through 5 rounds of revision, and I have fixed it as you suggested.
juzhe.zhong@rivai.ai
From: Kito Cheng
Date: 2023-05-15 13:01
To: 钟居哲
CC: GCC Patches; Kito Cheng; Palmer Dabbelt; Palmer Dabbelt; Jeff Law; rdapp.gcc
Subject: Re: [PATCH V2] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of both VLA && VLS auto-vectorization.
LGTM
<juzhe.zhong@rivai.ai> 於 2023年5月15日 週一 11:16 寫道:
From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
This patch optimizes both RVV VLA && VLS vectorization.
Consider the following case:
void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
for (int i = 0; i < count; ++i)
dst[i] = op1[i] + op2[i];
}
VLA:
Before this patch:
ble a3,zero,.L1
srli a4,a1,2
negw a4,a4
andi a5,a4,3
sext.w a3,a3
beq a5,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a4,a4,2
addw a6,a6,a7
sw a6,0(a0)
beq a4,zero,.L3
lw a7,4(a1)
lw a4,4(a2)
li a6,3
addw a4,a4,a7
sw a4,4(a0)
bne a5,a6,.L3
lw a6,8(a2)
lw a4,8(a1)
addw a4,a4,a6
sw a4,8(a0)
.L3:
subw a3,a3,a5
slli a4,a3,32
csrr a6,vlenb
srli a4,a4,32
srli a6,a6,2
slli a3,a5,2
mv a5,a4
bgtu a4,a6,.L17
.L5:
csrr a6,vlenb
add a1,a1,a3
add a2,a2,a3
add a0,a0,a3
srli a7,a6,2
li a3,0
.L8:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetvli t1,zero,e32,m1,ta,ma
add a3,a3,a7
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
mv a5,a4
bleu a4,a3,.L6
mv a5,a3
.L6:
sub a5,a4,a5
bleu a5,a7,.L7
mv a5,a7
.L7:
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
bne a5,zero,.L8
.L1:
ret
.L17:
mv a5,a6
j .L5
After this patch:
f:
ble a3,zero,.L1
csrr a4,vlenb
srli a4,a4,2
mv a5,a3
bgtu a3,a4,.L9
.L3:
csrr a6,vlenb
li a4,0
srli a7,a6,2
.L6:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetvli t1,zero,e32,m1,ta,ma
add a4,a4,a7
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
mv a5,a3
bleu a3,a4,.L4
mv a5,a4
.L4:
sub a5,a3,a5
bleu a5,a7,.L5
mv a5,a7
.L5:
add a0,a0,a6
add a2,a2,a6
add a1,a1,a6
bne a5,zero,.L6
.L1:
ret
.L9:
mv a5,a4
j .L3
VLS:
Before this patch:
f3:
ble a3,zero,.L1
srli a5,a1,2
negw a5,a5
andi a4,a5,3
sext.w a3,a3
beq a4,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a5,a5,2
addw a6,a6,a7
sw a6,0(a0)
beq a5,zero,.L3
lw a7,4(a1)
lw a5,4(a2)
li a6,3
addw a5,a5,a7
sw a5,4(a0)
bne a4,a6,.L3
lw a6,8(a2)
lw a5,8(a1)
addw a5,a5,a6
sw a5,8(a0)
.L3:
subw a3,a3,a4
slli a6,a4,2
slli a5,a3,32
srli a5,a5,32
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
li a3,4
.L6:
mv a4,a5
bleu a5,a3,.L5
li a4,4
.L5:
vsetvli zero,a4,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a5,a5,a4
vadd.vv v1,v1,v2
vsetvli zero,a4,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a1,a1,16
addi a2,a2,16
addi a0,a0,16
bne a5,zero,.L6
.L1:
ret
After this patch:
f3:
ble a3,zero,.L1
li a4,4
.L4:
mv a5,a3
bleu a3,a4,.L3
li a5,4
.L3:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a3,a3,a5
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a2,a2,16
addi a0,a0,16
addi a1,a1,16
bne a3,zero,.L4
.L1:
ret
gcc/ChangeLog:
* config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
(TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
* gcc.target/riscv/rvv/autovec/align-1.c: New test.
* gcc.target/riscv/rvv/autovec/align-2.c: New test.
---
gcc/config/riscv/riscv.cc | 14 ++++++++++++++
.../gcc.target/riscv/rvv/autovec/align-1.c | 12 ++++++++++++
.../gcc.target/riscv/rvv/autovec/align-2.c | 12 ++++++++++++
.../riscv/rvv/autovec/binop/shift-rv32gcv.c | 10 ++++++----
4 files changed, 44 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..a770fdfaa0e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7499,6 +7499,16 @@ riscv_preferred_simd_mode (scalar_mode mode)
return word_mode;
}
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+ if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+ return TYPE_ALIGN (TREE_TYPE (type));
+ return TYPE_ALIGN (type);
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7781,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+ riscv_vectorize_preferred_vector_alignment
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
new file mode 100644
index 00000000000..14201e1f7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
new file mode 100644
index 00000000000..812584e9d25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=fixed-vlmax" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
index da0f79a1cf0..d98100b3276 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
@@ -4,8 +4,10 @@
#include "shift-template.h"
/* TODO: For int16_t and uint16_t we need widening/promotion patterns.
- Therefore, expect only 4 vsll.vv instead of 6 for now. */
+ We don't check the assembler number since lacking patterns make
+ auto-vectorization inconsistent in LMUL = 1/2/4/8. */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
--
2.36.1
Committed, thanks Kito.
Pan
-----Original Message-----
From: Gcc-patches <gcc-patches-bounces+pan2.li=intel.com@gcc.gnu.org> On Behalf Of Kito Cheng via Gcc-patches
Sent: Monday, May 15, 2023 1:01 PM
To: 钟居哲 <juzhe.zhong@rivai.ai>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; Kito Cheng <kito.cheng@sifive.com>; Palmer Dabbelt <palmer@dabbelt.com>; Palmer Dabbelt <palmer@rivosinc.com>; Jeff Law <jeffreyalaw@gmail.com>; rdapp.gcc@gmail.com
Subject: Re: [PATCH V2] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of both VLA && VLS auto-vectorization.
LGTM
<juzhe.zhong@rivai.ai> 於 2023年5月15日 週一 11:16 寫道:
> From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
>
> This patch optimizes both RVV VLA && VLS vectorization.
>
> Consider this following case:
> void __attribute__((noinline, noclone))
> f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> {
> for (int i = 0; i < count; ++i)
> dst[i] = op1[i] + op2[i];
> }
>
> VLA:
> Before this patch:
> ble a3,zero,.L1
> srli a4,a1,2
> negw a4,a4
> andi a5,a4,3
> sext.w a3,a3
> beq a5,zero,.L3
> lw a7,0(a1)
> lw a6,0(a2)
> andi a4,a4,2
> addw a6,a6,a7
> sw a6,0(a0)
> beq a4,zero,.L3
> lw a7,4(a1)
> lw a4,4(a2)
> li a6,3
> addw a4,a4,a7
> sw a4,4(a0)
> bne a5,a6,.L3
> lw a6,8(a2)
> lw a4,8(a1)
> addw a4,a4,a6
> sw a4,8(a0)
> .L3:
> subw a3,a3,a5
> slli a4,a3,32
> csrr a6,vlenb
> srli a4,a4,32
> srli a6,a6,2
> slli a3,a5,2
> mv a5,a4
> bgtu a4,a6,.L17
> .L5:
> csrr a6,vlenb
> add a1,a1,a3
> add a2,a2,a3
> add a0,a0,a3
> srli a7,a6,2
> li a3,0
> .L8:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v1,0(a1)
> vle32.v v2,0(a2)
> vsetvli t1,zero,e32,m1,ta,ma
> add a3,a3,a7
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> mv a5,a4
> bleu a4,a3,.L6
> mv a5,a3
> .L6:
> sub a5,a4,a5
> bleu a5,a7,.L7
> mv a5,a7
> .L7:
> add a1,a1,a6
> add a2,a2,a6
> add a0,a0,a6
> bne a5,zero,.L8
> .L1:
> ret
> .L17:
> mv a5,a6
> j .L5
>
> After this patch:
> f:
> ble a3,zero,.L1
> csrr a4,vlenb
> srli a4,a4,2
> mv a5,a3
> bgtu a3,a4,.L9
> .L3:
> csrr a6,vlenb
> li a4,0
> srli a7,a6,2
> .L6:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v2,0(a1)
> vle32.v v1,0(a2)
> vsetvli t1,zero,e32,m1,ta,ma
> add a4,a4,a7
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> mv a5,a3
> bleu a3,a4,.L4
> mv a5,a4
> .L4:
> sub a5,a3,a5
> bleu a5,a7,.L5
> mv a5,a7
> .L5:
> add a0,a0,a6
> add a2,a2,a6
> add a1,a1,a6
> bne a5,zero,.L6
> .L1:
> ret
> .L9:
> mv a5,a4
> j .L3
>
> VLS:
> Before this patch:
> f3:
> ble a3,zero,.L1
> srli a5,a1,2
> negw a5,a5
> andi a4,a5,3
> sext.w a3,a3
> beq a4,zero,.L3
> lw a7,0(a1)
> lw a6,0(a2)
> andi a5,a5,2
> addw a6,a6,a7
> sw a6,0(a0)
> beq a5,zero,.L3
> lw a7,4(a1)
> lw a5,4(a2)
> li a6,3
> addw a5,a5,a7
> sw a5,4(a0)
> bne a4,a6,.L3
> lw a6,8(a2)
> lw a5,8(a1)
> addw a5,a5,a6
> sw a5,8(a0)
> .L3:
> subw a3,a3,a4
> slli a6,a4,2
> slli a5,a3,32
> srli a5,a5,32
> add a1,a1,a6
> add a2,a2,a6
> add a0,a0,a6
> li a3,4
> .L6:
> mv a4,a5
> bleu a5,a3,.L5
> li a4,4
> .L5:
> vsetvli zero,a4,e32,m1,ta,ma
> vle32.v v1,0(a1)
> vle32.v v2,0(a2)
> vsetivli zero,4,e32,m1,ta,ma
> sub a5,a5,a4
> vadd.vv v1,v1,v2
> vsetvli zero,a4,e32,m1,ta,ma
> vse32.v v1,0(a0)
> addi a1,a1,16
> addi a2,a2,16
> addi a0,a0,16
> bne a5,zero,.L6
> .L1:
> ret
>
> After this patch:
> f3:
> ble a3,zero,.L1
> li a4,4
> .L4:
> mv a5,a3
> bleu a3,a4,.L3
> li a5,4
> .L3:
> vsetvli zero,a5,e32,m1,ta,ma
> vle32.v v2,0(a1)
> vle32.v v1,0(a2)
> vsetivli zero,4,e32,m1,ta,ma
> sub a3,a3,a5
> vadd.vv v1,v1,v2
> vsetvli zero,a5,e32,m1,ta,ma
> vse32.v v1,0(a0)
> addi a2,a2,16
> addi a0,a0,16
> addi a1,a1,16
> bne a3,zero,.L4
> .L1:
> ret
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc
> (riscv_vectorize_preferred_vector_alignment): New function.
> (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt
> testcase.
> * gcc.target/riscv/rvv/autovec/align-1.c: New test.
> * gcc.target/riscv/rvv/autovec/align-2.c: New test.
>
> ---
> gcc/config/riscv/riscv.cc | 14 ++++++++++++++
> .../gcc.target/riscv/rvv/autovec/align-1.c | 12 ++++++++++++
> .../gcc.target/riscv/rvv/autovec/align-2.c | 12 ++++++++++++
> .../riscv/rvv/autovec/binop/shift-rv32gcv.c | 10 ++++++----
> 4 files changed, 44 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index de578b5b899..a770fdfaa0e 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7499,6 +7499,16 @@ riscv_preferred_simd_mode (scalar_mode mode)
> return word_mode;
> }
>
> +/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
> +
> +static poly_uint64
> +riscv_vectorize_preferred_vector_alignment (const_tree type)
> +{
> + if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
> + return TYPE_ALIGN (TREE_TYPE (type));
> + return TYPE_ALIGN (type);
> +}
> +
> /* Initialize the GCC target structure. */
> #undef TARGET_ASM_ALIGNED_HI_OP
> #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -7771,6 +7781,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
> #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
> #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
>
> +#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
> +#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
> + riscv_vectorize_preferred_vector_alignment
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-riscv.h"
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> new file mode 100644
> index 00000000000..14201e1f7e0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param
> riscv-autovec-preference=scalable" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> +{
> + for (int i = 0; i < count; ++i)
> + dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> new file mode 100644
> index 00000000000..812584e9d25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param
> riscv-autovec-preference=fixed-vlmax" } */
> +
> +void __attribute__((noinline, noclone))
> +f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int
> count)
> +{
> + for (int i = 0; i < count; ++i)
> + dst[i] = op1[i] + op2[i];
> +}
> +
> +/* { dg-final { scan-assembler-not "lw" } } */
> +/* { dg-final { scan-assembler-not "sw" } } */
> diff --git
> a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> index da0f79a1cf0..d98100b3276 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
> @@ -4,8 +4,10 @@
> #include "shift-template.h"
>
> /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
> - Therefore, expect only 4 vsll.vv instead of 6 for now. */
> + We don't check the assembler number since lacking patterns make
> + auto-vectorization inconsistent in LMUL = 1/2/4/8. */
> +
> +/* { dg-final { scan-assembler {\tvsll\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
> +/* { dg-final { scan-assembler {\tvsra\.vv} } } */
>
> -/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
> -/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
> -/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
> --
> 2.36.1
>
>
>
@@ -7499,6 +7499,16 @@ riscv_preferred_simd_mode (scalar_mode mode)
return word_mode;
}
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+ if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+ return TYPE_ALIGN (TREE_TYPE (type));
+ return TYPE_ALIGN (type);
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7781,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+ riscv_vectorize_preferred_vector_alignment
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-riscv.h"
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=fixed-vlmax" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
@@ -4,8 +4,10 @@
#include "shift-template.h"
/* TODO: For int16_t and uint16_t we need widening/promotion patterns.
- Therefore, expect only 4 vsll.vv instead of 6 for now. */
+ We don't check the assembler number since lacking patterns make
+ auto-vectorization inconsistent in LMUL = 1/2/4/8. */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */