From: Juzhe-Zhong <juzhe.zhong@rivai.ai>
This patch optimizes the codegen of RVV VLS auto-vectorization by avoiding the
scalar alignment-peeling prologue that was previously generated. Consider:
void __attribute__((noinline, noclone))
f3 (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
for (int i = 0; i < count; ++i)
dst[i] = op1[i] + op2[i];
}
Before this patch:
f3:
ble a3,zero,.L1
srli a5,a1,2
negw a5,a5
andi a4,a5,3
sext.w a3,a3
beq a4,zero,.L3
lw a7,0(a1)
lw a6,0(a2)
andi a5,a5,2
addw a6,a6,a7
sw a6,0(a0)
beq a5,zero,.L3
lw a7,4(a1)
lw a5,4(a2)
li a6,3
addw a5,a5,a7
sw a5,4(a0)
bne a4,a6,.L3
lw a6,8(a2)
lw a5,8(a1)
addw a5,a5,a6
sw a5,8(a0)
.L3:
subw a3,a3,a4
slli a6,a4,2
slli a5,a3,32
srli a5,a5,32
add a1,a1,a6
add a2,a2,a6
add a0,a0,a6
li a3,4
.L6:
mv a4,a5
bleu a5,a3,.L5
li a4,4
.L5:
vsetvli zero,a4,e32,m1,ta,ma
vle32.v v1,0(a1)
vle32.v v2,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a5,a5,a4
vadd.vv v1,v1,v2
vsetvli zero,a4,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a1,a1,16
addi a2,a2,16
addi a0,a0,16
bne a5,zero,.L6
.L1:
ret
After this patch:
f3:
ble a3,zero,.L1
li a4,4
.L4:
mv a5,a3
bleu a3,a4,.L3
li a5,4
.L3:
vsetvli zero,a5,e32,m1,ta,ma
vle32.v v2,0(a1)
vle32.v v1,0(a2)
vsetivli zero,4,e32,m1,ta,ma
sub a3,a3,a5
vadd.vv v1,v1,v2
vsetvli zero,a5,e32,m1,ta,ma
vse32.v v1,0(a0)
addi a2,a2,16
addi a0,a0,16
addi a1,a1,16
bne a3,zero,.L4
.L1:
ret
The TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE implementation is taken directly from ARM SVE.
The TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST implementation is the same as in the GCN port,
which vectorizes all cases by default. We will need to support an accurate vector cost model in the future.
gcc/ChangeLog:
* config/riscv/riscv.cc (riscv_simd_vector_alignment_reachable): New function.
(riscv_vectorization_cost): New function.
(TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE): New target hook.
(TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST): New target hook.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/align-2.c: New test.
---
gcc/config/riscv/riscv.cc | 39 +++++++++++++++++++
.../gcc.target/riscv/rvv/autovec/align-2.c | 12 ++++++
2 files changed, 51 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-2.c