RISC-V: Support movmisalign of RVV VLA modes
Checks
Commit Message
Previously, I removed the movmisalign pattern to fix the execution FAILs in this commit:
https://github.com/gcc-mirror/gcc/commit/f7bff24905a6959f85f866390db2fff1d6f95520
I was originally thinking that RVV doesn't allow misaligned accesses at all, so I removed that pattern.
However, after deep investigation && reading RVV ISA again and experiment on SPIKE,
I realized I was wrong.
RVV ISA reference: https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-memory-alignment-constraints
"If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
either the element is transferred successfully or an address misaligned exception is raised on that element."
It's obvious that RVV ISA does allow misaligned vector load/store.
And experiment and confirm on SPIKE:
[jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
bbl loader
z 0000000000000000 ra 0000000000010158 sp 0000003ffffffb40 gp 0000000000012c48
tp 0000000000000000 t0 00000000000110da t1 000000000000000f t2 0000000000000000
s0 0000000000013460 s1 0000000000000000 a0 0000000000012ef5 a1 0000000000012018
a2 0000000000012a71 a3 000000000000000d a4 0000000000000004 a5 0000000000012a71
a6 0000000000012a71 a7 0000000000012018 s2 0000000000000000 s3 0000000000000000
s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
pc 0000000000010258 va/inst 00000000020660a7 sr 8000000200006620
Store/AMO access fault!
[jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --misaligned --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
bbl loader
We can see SPIKE can pass previous *FAILED* execution tests with specifying --misaligned to SPIKE.
So, to honor the RVV ISA SPEC, we should add the movmisalign pattern back based on the investigations I have done, since
it can improve multiple vectorization tests and fix dump FAILs.
This patch fixes these following dump FAILs:
FAIL: gcc.dg/vect/vect-bitfield-read-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-read-2.c scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-read-4.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-read-4.c scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-write-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-write-2.c scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-write-3.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
FAIL: gcc.dg/vect/vect-bitfield-write-3.c scan-tree-dump-not optimized "Invalid sum"
Consider this following case:
struct s {
unsigned i : 31;
char a : 4;
};
#define N 32
#define ELT0 {0x7FFFFFFFUL, 0}
#define ELT1 {0x7FFFFFFFUL, 1}
#define ELT2 {0x7FFFFFFFUL, 2}
#define ELT3 {0x7FFFFFFFUL, 3}
#define RES 48
struct s A[N]
= { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
int __attribute__ ((noipa))
f(struct s *ptr, unsigned n) {
int res = 0;
for (int i = 0; i < n; ++i)
res += ptr[i].a;
return res;
}
-O3 -S -fno-vect-cost-model (default strict-align):
f:
mv a4,a0
beq a1,zero,.L9
addiw a5,a1,-1
li a3,14
vsetivli zero,16,e64,m8,ta,ma
bleu a5,a3,.L3
andi a5,a0,127
bne a5,zero,.L3
srliw a3,a1,4
slli a3,a3,7
li a0,15
slli a0,a0,32
add a3,a3,a4
mv a5,a4
li a2,32
vmv.v.x v16,a0
vsetvli zero,zero,e32,m4,ta,ma
vmv.v.i v4,0
.L4:
vsetvli zero,zero,e64,m8,ta,ma
vle64.v v8,0(a5)
addi a5,a5,128
vand.vv v8,v8,v16
vsetvli zero,zero,e32,m4,ta,ma
vnsrl.wx v8,v8,a2
vadd.vv v4,v4,v8
bne a5,a3,.L4
li a3,0
andi a5,a1,15
vmv.s.x v1,a3
andi a3,a1,-16
vredsum.vs v1,v4,v1
vmv.x.s a0,v1
mv a2,a0
beq a5,zero,.L15
slli a5,a3,3
add a5,a4,a5
lw a0,4(a5)
andi a0,a0,15
addiw a4,a3,1
addw a0,a0,a2
bgeu a4,a1,.L15
lw a2,12(a5)
andi a2,a2,15
addiw a4,a3,2
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,20(a5)
andi a2,a2,15
addiw a4,a3,3
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,28(a5)
andi a2,a2,15
addiw a4,a3,4
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,36(a5)
andi a2,a2,15
addiw a4,a3,5
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,44(a5)
andi a2,a2,15
addiw a4,a3,6
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,52(a5)
andi a2,a2,15
addiw a4,a3,7
addw a0,a2,a0
bgeu a4,a1,.L15
lw a4,60(a5)
andi a4,a4,15
addw a4,a4,a0
addiw a2,a3,8
mv a0,a4
bgeu a2,a1,.L15
lw a0,68(a5)
andi a0,a0,15
addiw a2,a3,9
addw a0,a0,a4
bgeu a2,a1,.L15
lw a2,76(a5)
andi a2,a2,15
addiw a4,a3,10
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,84(a5)
andi a2,a2,15
addiw a4,a3,11
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,92(a5)
andi a2,a2,15
addiw a4,a3,12
addw a0,a2,a0
bgeu a4,a1,.L15
lw a2,100(a5)
andi a2,a2,15
addiw a4,a3,13
addw a0,a2,a0
bgeu a4,a1,.L15
lw a4,108(a5)
andi a4,a4,15
addiw a3,a3,14
addw a0,a4,a0
bgeu a3,a1,.L15
lw a5,116(a5)
andi a5,a5,15
addw a0,a5,a0
ret
.L9:
li a0,0
.L15:
ret
.L3:
mv a5,a4
slli a4,a1,32
srli a1,a4,29
add a1,a5,a1
li a0,0
.L7:
lw a4,4(a5)
andi a4,a4,15
addi a5,a5,8
addw a0,a4,a0
bne a5,a1,.L7
ret
-O3 -S -mno-strict-align -fno-vect-cost-model:
f:
beq a1,zero,.L4
slli a1,a1,32
li a5,15
vsetvli a4,zero,e64,m1,ta,ma
slli a5,a5,32
srli a1,a1,32
li a6,32
vmv.v.x v3,a5
vsetvli zero,zero,e32,mf2,ta,ma
vmv.v.i v2,0
.L3:
vsetvli a5,a1,e64,m1,ta,ma
vle64.v v1,0(a0)
vsetvli a3,zero,e64,m1,ta,ma
slli a2,a5,3
vand.vv v1,v1,v3
sub a1,a1,a5
vsetvli zero,zero,e32,mf2,ta,ma
add a0,a0,a2
vnsrl.wx v1,v1,a6
vsetvli zero,a5,e32,mf2,tu,ma
vadd.vv v2,v2,v1
bne a1,zero,.L3
li a5,0
vsetvli a3,zero,e32,mf2,ta,ma
vmv.s.x v1,a5
vredsum.vs v2,v2,v1
vmv.x.s a0,v2
ret
.L4:
li a0,0
ret
We can see it improves this case codegen a lot.
NOTE: Currently, we use -mno-strict-align to enable the movmisalign pattern, which is NOT accurate, since
according to RVV ISA:
"Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
Ideally, we need a separate compile option to enable vector misalignment independently of scalar memory misalignment (for example, -mno-vector-strict-align).
gcc/ChangeLog:
* config/riscv/vector.md (movmisalign<mode>): Support misalign pattern for RVV VLA modes.
gcc/testsuite/ChangeLog:
* lib/target-supports.exp: Add -mno-strict-align to run vect tests.
---
gcc/config/riscv/vector.md | 13 +++++++++++++
gcc/testsuite/lib/target-supports.exp | 2 ++
2 files changed, 15 insertions(+)
Comments
On Sun, Oct 8, 2023 at 9:22 AM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> Previously, I removed the movmisalign pattern to fix the execution FAILs in this commit:
> https://github.com/gcc-mirror/gcc/commit/f7bff24905a6959f85f866390db2fff1d6f95520
>
> I was thinking that RVV doesn't allow misaligned at the beginning so I removed that pattern.
> However, after deep investigation && reading RVV ISA again and experiment on SPIKE,
> I realized I was wrong.
>
> RVV ISA reference: https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-memory-alignment-constraints
>
> "If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> either the element is transferred successfully or an address misaligned exception is raised on that element."
But you gobble the "or .." into an existing -mstrict-align flag - are
you sure all implementations are
self-consistent with handling non-vector memory instructions and
vector memory instructions here?
At least the above wording doesn't seem to impose such requirement.
> It's obvious that RVV ISA does allow misaligned vector load/store.
>
> And experiment and confirm on SPIKE:
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
> z 0000000000000000 ra 0000000000010158 sp 0000003ffffffb40 gp 0000000000012c48
> tp 0000000000000000 t0 00000000000110da t1 000000000000000f t2 0000000000000000
> s0 0000000000013460 s1 0000000000000000 a0 0000000000012ef5 a1 0000000000012018
> a2 0000000000012a71 a3 000000000000000d a4 0000000000000004 a5 0000000000012a71
> a6 0000000000012a71 a7 0000000000012018 s2 0000000000000000 s3 0000000000000000
> s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
> s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
> t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
> pc 0000000000010258 va/inst 00000000020660a7 sr 8000000200006620
> Store/AMO access fault!
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --misaligned --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
>
> We can see SPIKE can pass previous *FAILED* execution tests with specifying --misaligned to SPIKE.
>
> So, to honor RVV ISA SPEC, we should add movmisalign pattern back base on the investigations I have done since
> it can improve multiple vectorization tests and fix dumple FAILs.
>
> This patch fixes these following dump FAILs:
>
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c scan-tree-dump-not optimized "Invalid sum"
>
> Consider this following case:
>
> struct s {
> unsigned i : 31;
> char a : 4;
> };
>
> #define N 32
> #define ELT0 {0x7FFFFFFFUL, 0}
> #define ELT1 {0x7FFFFFFFUL, 1}
> #define ELT2 {0x7FFFFFFFUL, 2}
> #define ELT3 {0x7FFFFFFFUL, 3}
> #define RES 48
> struct s A[N]
> = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
>
> int __attribute__ ((noipa))
> f(struct s *ptr, unsigned n) {
> int res = 0;
> for (int i = 0; i < n; ++i)
> res += ptr[i].a;
> return res;
> }
>
> -O3 -S -fno-vect-cost-model (default strict-align):
>
> f:
> mv a4,a0
> beq a1,zero,.L9
> addiw a5,a1,-1
> li a3,14
> vsetivli zero,16,e64,m8,ta,ma
> bleu a5,a3,.L3
> andi a5,a0,127
> bne a5,zero,.L3
> srliw a3,a1,4
> slli a3,a3,7
> li a0,15
> slli a0,a0,32
> add a3,a3,a4
> mv a5,a4
> li a2,32
> vmv.v.x v16,a0
> vsetvli zero,zero,e32,m4,ta,ma
> vmv.v.i v4,0
> .L4:
> vsetvli zero,zero,e64,m8,ta,ma
> vle64.v v8,0(a5)
> addi a5,a5,128
> vand.vv v8,v8,v16
> vsetvli zero,zero,e32,m4,ta,ma
> vnsrl.wx v8,v8,a2
> vadd.vv v4,v4,v8
> bne a5,a3,.L4
> li a3,0
> andi a5,a1,15
> vmv.s.x v1,a3
> andi a3,a1,-16
> vredsum.vs v1,v4,v1
> vmv.x.s a0,v1
> mv a2,a0
> beq a5,zero,.L15
> slli a5,a3,3
> add a5,a4,a5
> lw a0,4(a5)
> andi a0,a0,15
> addiw a4,a3,1
> addw a0,a0,a2
> bgeu a4,a1,.L15
> lw a2,12(a5)
> andi a2,a2,15
> addiw a4,a3,2
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,20(a5)
> andi a2,a2,15
> addiw a4,a3,3
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,28(a5)
> andi a2,a2,15
> addiw a4,a3,4
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,36(a5)
> andi a2,a2,15
> addiw a4,a3,5
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,44(a5)
> andi a2,a2,15
> addiw a4,a3,6
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,52(a5)
> andi a2,a2,15
> addiw a4,a3,7
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a4,60(a5)
> andi a4,a4,15
> addw a4,a4,a0
> addiw a2,a3,8
> mv a0,a4
> bgeu a2,a1,.L15
> lw a0,68(a5)
> andi a0,a0,15
> addiw a2,a3,9
> addw a0,a0,a4
> bgeu a2,a1,.L15
> lw a2,76(a5)
> andi a2,a2,15
> addiw a4,a3,10
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,84(a5)
> andi a2,a2,15
> addiw a4,a3,11
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,92(a5)
> andi a2,a2,15
> addiw a4,a3,12
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,100(a5)
> andi a2,a2,15
> addiw a4,a3,13
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a4,108(a5)
> andi a4,a4,15
> addiw a3,a3,14
> addw a0,a4,a0
> bgeu a3,a1,.L15
> lw a5,116(a5)
> andi a5,a5,15
> addw a0,a5,a0
> ret
> .L9:
> li a0,0
> .L15:
> ret
> .L3:
> mv a5,a4
> slli a4,a1,32
> srli a1,a4,29
> add a1,a5,a1
> li a0,0
> .L7:
> lw a4,4(a5)
> andi a4,a4,15
> addi a5,a5,8
> addw a0,a4,a0
> bne a5,a1,.L7
> ret
>
> -O3 -S -mno-strict-align -fno-vect-cost-model:
>
> f:
> beq a1,zero,.L4
> slli a1,a1,32
> li a5,15
> vsetvli a4,zero,e64,m1,ta,ma
> slli a5,a5,32
> srli a1,a1,32
> li a6,32
> vmv.v.x v3,a5
> vsetvli zero,zero,e32,mf2,ta,ma
> vmv.v.i v2,0
> .L3:
> vsetvli a5,a1,e64,m1,ta,ma
> vle64.v v1,0(a0)
> vsetvli a3,zero,e64,m1,ta,ma
> slli a2,a5,3
> vand.vv v1,v1,v3
> sub a1,a1,a5
> vsetvli zero,zero,e32,mf2,ta,ma
> add a0,a0,a2
> vnsrl.wx v1,v1,a6
> vsetvli zero,a5,e32,mf2,tu,ma
> vadd.vv v2,v2,v1
> bne a1,zero,.L3
> li a5,0
> vsetvli a3,zero,e32,mf2,ta,ma
> vmv.s.x v1,a5
> vredsum.vs v2,v2,v1
> vmv.x.s a0,v2
> ret
> .L4:
> li a0,0
> ret
>
> We can see it improves this case codegen a lot.
>
> NOTE: Currently, we use -mno-strict-align to enable movmisalign pattern which is NOT the accurate since
> according to RVV ISA:
>
> "Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
>
> Ideally, we need a separate compile option to enable vector misalign independent on scalar memory misalign (For example, -mno-vector-strict-align).
>
> gcc/ChangeLog:
>
> * config/riscv/vector.md (movmisalign<mode>): Support misalign pattern for RVV VLA modes.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp: Add -mno-strict-align to run vect tests.
>
> ---
> gcc/config/riscv/vector.md | 13 +++++++++++++
> gcc/testsuite/lib/target-supports.exp | 2 ++
> 2 files changed, 15 insertions(+)
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index cf5c0a40257..1bed1e40bd4 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -1326,6 +1326,19 @@
> }
> )
>
> +;; According to RVV ISA:
> +;; If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> +;; either the element is transferred successfully or an address misaligned exception is raised on that element.
> +(define_expand "movmisalign<mode>"
> + [(set (match_operand:V 0 "nonimmediate_operand")
> + (match_operand:V 1 "general_operand"))]
> + "TARGET_VECTOR && !STRICT_ALIGNMENT"
> + {
> + emit_move_insn (operands[0], operands[1]);
> + DONE;
> + }
> +)
> +
> ;; -----------------------------------------------------------------
> ;; ---- Duplicate Operations
> ;; -----------------------------------------------------------------
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index af52c38433d..f0b89cf4722 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -11389,11 +11389,13 @@ proc check_vect_support_and_set_flags { } {
> if [check_effective_target_riscv_vector_hw] {
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> + lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
> set dg-do-what-default run
> } else {
> lappend DEFAULT_VECTCFLAGS "-march=rv64gcv_zvfh" "-mabi=lp64d"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> + lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
> set dg-do-what-default compile
> }
> } else {
> --
> 2.36.3
>
>> But you gobble the "or .." into an existing -mstrict-align flag - are
>> you sure all implementations are
>> self-consistent with handling non-vector memory instructions and
>> vector memory instructions here?
>> At least the above wording doesn't seem to impose such requirement.
RVV ISA:
"Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
Support for misaligned vector memory access is independent of scalar memory access support.
I think this patch (using -mno-strict-align) is not appropriate, which means I need an additional compile option.
juzhe.zhong@rivai.ai
From: Richard Biener
Date: 2023-10-09 16:01
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Support movmisalign of RVV VLA modes
On Sun, Oct 8, 2023 at 9:22 AM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> Previously, I removed the movmisalign pattern to fix the execution FAILs in this commit:
> https://github.com/gcc-mirror/gcc/commit/f7bff24905a6959f85f866390db2fff1d6f95520
>
> I was thinking that RVV doesn't allow misaligned at the beginning so I removed that pattern.
> However, after deep investigation && reading RVV ISA again and experiment on SPIKE,
> I realized I was wrong.
>
> RVV ISA reference: https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-memory-alignment-constraints
>
> "If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> either the element is transferred successfully or an address misaligned exception is raised on that element."
But you gobble the "or .." into an existing -mstrict-align flag - are
you sure all implementations are
self-consistent with handling non-vector memory instructions and
vector memory instructions here?
At least the above wording doesn't seem to impose such requirement.
> It's obvious that RVV ISA does allow misaligned vector load/store.
>
> And experiment and confirm on SPIKE:
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
> z 0000000000000000 ra 0000000000010158 sp 0000003ffffffb40 gp 0000000000012c48
> tp 0000000000000000 t0 00000000000110da t1 000000000000000f t2 0000000000000000
> s0 0000000000013460 s1 0000000000000000 a0 0000000000012ef5 a1 0000000000012018
> a2 0000000000012a71 a3 000000000000000d a4 0000000000000004 a5 0000000000012a71
> a6 0000000000012a71 a7 0000000000012018 s2 0000000000000000 s3 0000000000000000
> s4 0000000000000000 s5 0000000000000000 s6 0000000000000000 s7 0000000000000000
> s8 0000000000000000 s9 0000000000000000 sA 0000000000000000 sB 0000000000000000
> t3 0000000000000000 t4 0000000000000000 t5 0000000000000000 t6 0000000000000000
> pc 0000000000010258 va/inst 00000000020660a7 sr 8000000200006620
> Store/AMO access fault!
>
> [jzzhong@rios-cad122:/work/home/jzzhong/work/toolchain/riscv/gcc/gcc/testsuite/gcc.dg/vect]$~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/bin/spike --misaligned --isa=rv64gcv --varch=vlen:128,elen:64 ~/work/toolchain/riscv/build/dev-rv64gcv_zfh-lp64d-medany-newlib-spike-debug/install/riscv64-unknown-elf/bin/pk64 a.out
> bbl loader
>
> We can see SPIKE can pass previous *FAILED* execution tests with specifying --misaligned to SPIKE.
>
> So, to honor RVV ISA SPEC, we should add movmisalign pattern back base on the investigations I have done since
> it can improve multiple vectorization tests and fix dumple FAILs.
>
> This patch fixes these following dump FAILs:
>
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-read-4.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-2.c scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c -flto -ffat-lto-objects scan-tree-dump-not optimized "Invalid sum"
> FAIL: gcc.dg/vect/vect-bitfield-write-3.c scan-tree-dump-not optimized "Invalid sum"
>
> Consider this following case:
>
> struct s {
> unsigned i : 31;
> char a : 4;
> };
>
> #define N 32
> #define ELT0 {0x7FFFFFFFUL, 0}
> #define ELT1 {0x7FFFFFFFUL, 1}
> #define ELT2 {0x7FFFFFFFUL, 2}
> #define ELT3 {0x7FFFFFFFUL, 3}
> #define RES 48
> struct s A[N]
> = { ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3,
> ELT0, ELT1, ELT2, ELT3, ELT0, ELT1, ELT2, ELT3};
>
> int __attribute__ ((noipa))
> f(struct s *ptr, unsigned n) {
> int res = 0;
> for (int i = 0; i < n; ++i)
> res += ptr[i].a;
> return res;
> }
>
> -O3 -S -fno-vect-cost-model (default strict-align):
>
> f:
> mv a4,a0
> beq a1,zero,.L9
> addiw a5,a1,-1
> li a3,14
> vsetivli zero,16,e64,m8,ta,ma
> bleu a5,a3,.L3
> andi a5,a0,127
> bne a5,zero,.L3
> srliw a3,a1,4
> slli a3,a3,7
> li a0,15
> slli a0,a0,32
> add a3,a3,a4
> mv a5,a4
> li a2,32
> vmv.v.x v16,a0
> vsetvli zero,zero,e32,m4,ta,ma
> vmv.v.i v4,0
> .L4:
> vsetvli zero,zero,e64,m8,ta,ma
> vle64.v v8,0(a5)
> addi a5,a5,128
> vand.vv v8,v8,v16
> vsetvli zero,zero,e32,m4,ta,ma
> vnsrl.wx v8,v8,a2
> vadd.vv v4,v4,v8
> bne a5,a3,.L4
> li a3,0
> andi a5,a1,15
> vmv.s.x v1,a3
> andi a3,a1,-16
> vredsum.vs v1,v4,v1
> vmv.x.s a0,v1
> mv a2,a0
> beq a5,zero,.L15
> slli a5,a3,3
> add a5,a4,a5
> lw a0,4(a5)
> andi a0,a0,15
> addiw a4,a3,1
> addw a0,a0,a2
> bgeu a4,a1,.L15
> lw a2,12(a5)
> andi a2,a2,15
> addiw a4,a3,2
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,20(a5)
> andi a2,a2,15
> addiw a4,a3,3
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,28(a5)
> andi a2,a2,15
> addiw a4,a3,4
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,36(a5)
> andi a2,a2,15
> addiw a4,a3,5
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,44(a5)
> andi a2,a2,15
> addiw a4,a3,6
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,52(a5)
> andi a2,a2,15
> addiw a4,a3,7
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a4,60(a5)
> andi a4,a4,15
> addw a4,a4,a0
> addiw a2,a3,8
> mv a0,a4
> bgeu a2,a1,.L15
> lw a0,68(a5)
> andi a0,a0,15
> addiw a2,a3,9
> addw a0,a0,a4
> bgeu a2,a1,.L15
> lw a2,76(a5)
> andi a2,a2,15
> addiw a4,a3,10
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,84(a5)
> andi a2,a2,15
> addiw a4,a3,11
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,92(a5)
> andi a2,a2,15
> addiw a4,a3,12
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a2,100(a5)
> andi a2,a2,15
> addiw a4,a3,13
> addw a0,a2,a0
> bgeu a4,a1,.L15
> lw a4,108(a5)
> andi a4,a4,15
> addiw a3,a3,14
> addw a0,a4,a0
> bgeu a3,a1,.L15
> lw a5,116(a5)
> andi a5,a5,15
> addw a0,a5,a0
> ret
> .L9:
> li a0,0
> .L15:
> ret
> .L3:
> mv a5,a4
> slli a4,a1,32
> srli a1,a4,29
> add a1,a5,a1
> li a0,0
> .L7:
> lw a4,4(a5)
> andi a4,a4,15
> addi a5,a5,8
> addw a0,a4,a0
> bne a5,a1,.L7
> ret
>
> -O3 -S -mno-strict-align -fno-vect-cost-model:
>
> f:
> beq a1,zero,.L4
> slli a1,a1,32
> li a5,15
> vsetvli a4,zero,e64,m1,ta,ma
> slli a5,a5,32
> srli a1,a1,32
> li a6,32
> vmv.v.x v3,a5
> vsetvli zero,zero,e32,mf2,ta,ma
> vmv.v.i v2,0
> .L3:
> vsetvli a5,a1,e64,m1,ta,ma
> vle64.v v1,0(a0)
> vsetvli a3,zero,e64,m1,ta,ma
> slli a2,a5,3
> vand.vv v1,v1,v3
> sub a1,a1,a5
> vsetvli zero,zero,e32,mf2,ta,ma
> add a0,a0,a2
> vnsrl.wx v1,v1,a6
> vsetvli zero,a5,e32,mf2,tu,ma
> vadd.vv v2,v2,v1
> bne a1,zero,.L3
> li a5,0
> vsetvli a3,zero,e32,mf2,ta,ma
> vmv.s.x v1,a5
> vredsum.vs v2,v2,v1
> vmv.x.s a0,v2
> ret
> .L4:
> li a0,0
> ret
>
> We can see it improves this case codegen a lot.
>
> NOTE: Currently, we use -mno-strict-align to enable movmisalign pattern which is NOT the accurate since
> according to RVV ISA:
>
> "Support for misaligned vector memory accesses is independent of an implementation’s support for misaligned scalar memory accesses."
>
> Ideally, we need a separate compile option to enable vector misalign independent on scalar memory misalign (For example, -mno-vector-strict-align).
>
> gcc/ChangeLog:
>
> * config/riscv/vector.md (movmisalign<mode>): Support misalign pattern for RVV VLA modes.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp: Add -mno-strict-align to run vect tests.
>
> ---
> gcc/config/riscv/vector.md | 13 +++++++++++++
> gcc/testsuite/lib/target-supports.exp | 2 ++
> 2 files changed, 15 insertions(+)
>
> diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
> index cf5c0a40257..1bed1e40bd4 100644
> --- a/gcc/config/riscv/vector.md
> +++ b/gcc/config/riscv/vector.md
> @@ -1326,6 +1326,19 @@
> }
> )
>
> +;; According to RVV ISA:
> +;; If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
> +;; either the element is transferred successfully or an address misaligned exception is raised on that element.
> +(define_expand "movmisalign<mode>"
> + [(set (match_operand:V 0 "nonimmediate_operand")
> + (match_operand:V 1 "general_operand"))]
> + "TARGET_VECTOR && !STRICT_ALIGNMENT"
> + {
> + emit_move_insn (operands[0], operands[1]);
> + DONE;
> + }
> +)
> +
> ;; -----------------------------------------------------------------
> ;; ---- Duplicate Operations
> ;; -----------------------------------------------------------------
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index af52c38433d..f0b89cf4722 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -11389,11 +11389,13 @@ proc check_vect_support_and_set_flags { } {
> if [check_effective_target_riscv_vector_hw] {
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> + lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
> set dg-do-what-default run
> } else {
> lappend DEFAULT_VECTCFLAGS "-march=rv64gcv_zvfh" "-mabi=lp64d"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
> lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
> + lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
> set dg-do-what-default compile
> }
> } else {
> --
> 2.36.3
>
Hi Juzhe,
I think an extra param might be too intrusive. I would expect normal
hardware implementations to support unaligned accesses (but they might
be slow which should be covered by costs) and only rarely have hardware
that doesn't support it and raises exceptions.
Therefore I would suggest to have a separate TARGET attribute like
TARGET_RVV_MISALIGNMENT_SUPPORTED (or so) that enable or disables the
movmisalign pattern. This would be enabled by default for now and when
there is a uarch that wants different behavior it should do something
like
#define TARGET_RVV_MISALIGNMENT_SUPPORTED (uarch != SPECIFIC_UARCH)
Regards
Robin
@@ -1326,6 +1326,19 @@
}
)
+;; According to RVV ISA:
+;; If an element accessed by a vector memory instruction is not naturally aligned to the size of the element,
+;; either the element is transferred successfully or an address misaligned exception is raised on that element.
+(define_expand "movmisalign<mode>"
+ [(set (match_operand:V 0 "nonimmediate_operand")
+ (match_operand:V 1 "general_operand"))]
+ "TARGET_VECTOR && !STRICT_ALIGNMENT"
+ {
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+ }
+)
+
;; -----------------------------------------------------------------
;; ---- Duplicate Operations
;; -----------------------------------------------------------------
@@ -11389,11 +11389,13 @@ proc check_vect_support_and_set_flags { } {
if [check_effective_target_riscv_vector_hw] {
lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
+ lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
set dg-do-what-default run
} else {
lappend DEFAULT_VECTCFLAGS "-march=rv64gcv_zvfh" "-mabi=lp64d"
lappend DEFAULT_VECTCFLAGS "--param" "riscv-autovec-preference=scalable"
lappend DEFAULT_VECTCFLAGS "--param" "riscv-vector-abi"
+ lappend DEFAULT_VECTCFLAGS "-mno-strict-align"
set dg-do-what-default compile
}
} else {