Since our user vsetvl intrinsics are defined as just calculate the VL output
which is the number of the elements to be processed. Such intrinsics do not
have any side effects. We should normalize them when they have same ratio.
E.g __riscv_vsetvl_e8mf8 result is same as __riscv_vsetvl_e64m1.
Normalize them can allow us have better codegen.
Consider this following example:
#include "riscv_vector.h"
void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int cond, int avl) {
size_t vl;
if (cond)
vl = __riscv_vsetvl_e32m1(avl);
else
vl = __riscv_vsetvl_e16mf2(avl);
for (size_t i = 0; i < n; i += 1) {
vint32m1_t a = __riscv_vle32_v_i32m1(in1, vl);
vint32m1_t b = __riscv_vle32_v_i32m1_tu(a, in2, vl);
vint32m1_t c = __riscv_vle32_v_i32m1_tu(b, in3, vl);
__riscv_vse32_v_i32m1(out, c, vl);
}
}
Before this patch:
foo:
beq a5,zero,.L2
vsetvli a6,a6,e32,m1,tu,ma
.L3:
li a5,0
beq a4,zero,.L9
.L4:
vle32.v v1,0(a0)
addi a5,a5,1
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
bne a4,a5,.L4
.L9:
ret
.L2:
vsetvli zero,a6,e32,m1,tu,ma
j .L3
After this patch:
foo:
li a5,0
vsetvli zero,a6,e32,m1,tu,ma
beq a4,zero,.L9
.L4:
vle32.v v1,0(a0)
addi a5,a5,1
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
bne a4,a5,.L4
.L9:
ret
PR target/112092
gcc/ChangeLog:
* config/riscv/riscv-vector-builtins-bases.cc: Normalize the vsetvls.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/vsetvl/pr109743-1.c: Adapt test.
* gcc.target/riscv/rvv/vsetvl/pr109743-3.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvl-11.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvl-15.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvl-22.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvlmax-13.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvlmax-15.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvlmax-5.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvlmax-7.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/vsetvlmax-8.c: Ditto.
* gcc.target/riscv/rvv/vsetvl/pr112092-1.c: New test.
* gcc.target/riscv/rvv/vsetvl/pr112092-2.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc | 24 +++++++++++++-----
.../gcc.target/riscv/rvv/vsetvl/pr109743-1.c | 2 +-
.../gcc.target/riscv/rvv/vsetvl/pr109743-3.c | 3 +--
.../gcc.target/riscv/rvv/vsetvl/pr112092-1.c | 25 +++++++++++++++++++
.../gcc.target/riscv/rvv/vsetvl/pr112092-2.c | 25 +++++++++++++++++++
.../gcc.target/riscv/rvv/vsetvl/vsetvl-11.c | 2 +-
.../gcc.target/riscv/rvv/vsetvl/vsetvl-15.c | 2 +-
.../gcc.target/riscv/rvv/vsetvl/vsetvl-22.c | 2 +-
.../riscv/rvv/vsetvl/vsetvlmax-13.c | 4 +--
.../riscv/rvv/vsetvl/vsetvlmax-15.c | 6 ++---
.../gcc.target/riscv/rvv/vsetvl/vsetvlmax-5.c | 4 +--
.../gcc.target/riscv/rvv/vsetvl/vsetvlmax-7.c | 2 +-
.../gcc.target/riscv/rvv/vsetvl/vsetvlmax-8.c | 4 +--
13 files changed, 83 insertions(+), 22 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112092-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr112092-2.c
On Wed, 8 Nov 2023, Kito Cheng wrote:
> OK, then LGTM, thanks for the explanation :)
Please don't top-post on a GCC mailing list (and preferably in off-list
replies to such mailing list messages unless it's been agreed to somehow
with the participants), as it makes it difficult to make context replies.
Best practice is to reply inline, quoting the relevant original paragraph
(or enough context) referred to above, and with all the other parts of the
message replied to discarded. We may even have it written down somewhere
(though I haven't checked; in the old days it used to be assumed), and I
do hope any sane modern MUA can handle it.
Otherwise the discussion thread quickly grows into an illegible mess.
So this change does indeed fix PR 112092, however we now have an issue
with several other test cases and the new `-mmovcc' option. For example
vsetvl-13.c fails with "-mmovcc -mbranch-cost=8" test options and assembly
produced is like:
vsetvli a6,a6,e8,mf4,ta,ma
snez a5,a5
neg a5,a5
and a6,a5,a6
not a5,a5
andi a5,a5,55
or a5,a6,a5
beq a4,zero,.L10
li a6,0
vsetvli zero,a5,e32,m1,tu,ma
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addi a6,a6,1
bne a4,a6,.L4
.L10:
ret
As far as I can tell code produced is legitimate, and for the record
analogous assembly is produced with `-march=rv32gcv_zicond' too:
vsetvli a6,a6,e8,mf4,ta,ma
czero.eqz a6,a6,a5
li a7,55
czero.nez a5,a7,a5
or a5,a5,a6
beq a4,zero,.L10
li a6,0
vsetvli zero,a5,e32,m1,tu,ma
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addi a6,a6,1
bne a4,a6,.L4
.L10:
ret
-- it's just that you can't see it with regression testing, because the
test case overrides `-march='. Presumably we do want to execute VSETVLI
twice here on the basis that to avoid the second one by means of branches
would be more costly than not to.
Shall we just silence false failures like this with `-mno-movcc' then or
shall we handle the conditional-move case somehow?
For reference plain branched assembly is like:
li a7,55
beq a5,zero,.L13
vsetvli zero,a6,e32,m1,tu,ma
.L2:
beq a4,zero,.L11
li a5,0
.L4:
vle32.v v1,0(a0)
vle32.v v1,0(a1)
vle32.v v1,0(a2)
vse32.v v1,0(a3)
addi a5,a5,1
bne a4,a5,.L4
.L11:
ret
.L13:
vsetvli zero,a7,e32,m1,tu,ma
j .L2
Maciej
@@ -131,19 +131,31 @@ public:
tree type = builtin_types[e.type.index].vector;
machine_mode mode = TYPE_MODE (type);
- machine_mode inner_mode = GET_MODE_INNER (mode);
+ /* Normalize same RATO (SEW/LMUL) into same vsetvl instruction.
+
+ - e8,mf8/e16,mf4/e32,mf2/e64,m1 --> e8mf8
+ - e8,mf4/e16,mf2/e32,m1/e64,m2 --> e8mf4
+ - e8,mf2/e16,m1/e32,m2/e64,m4 --> e8mf2
+ - e8,m1/e16,m2/e32,m4/e64,m8 --> e8m1
+ - e8,m2/e16,m4/e32,m8 --> e8m2
+ - e8,m4/e16,m8 --> e8m4
+ - e8,m8 --> e8m8
+ */
/* SEW. */
- e.add_input_operand (Pmode,
- gen_int_mode (GET_MODE_BITSIZE (inner_mode), Pmode));
+ e.add_input_operand (Pmode, gen_int_mode (8, Pmode));
/* LMUL. */
- e.add_input_operand (Pmode, gen_int_mode (get_vlmul (mode), Pmode));
+ machine_mode e8_mode
+ = get_vector_mode (QImode, GET_MODE_NUNITS (mode)).require ();
+ e.add_input_operand (Pmode, gen_int_mode (get_vlmul (e8_mode), Pmode));
/* TAIL_ANY. */
- e.add_input_operand (Pmode, gen_int_mode (get_prefer_tail_policy (), Pmode));
+ e.add_input_operand (Pmode,
+ gen_int_mode (get_prefer_tail_policy (), Pmode));
/* MASK_ANY. */
- e.add_input_operand (Pmode, gen_int_mode (get_prefer_mask_policy (), Pmode));
+ e.add_input_operand (Pmode,
+ gen_int_mode (get_prefer_mask_policy (), Pmode));
return e.generate_insn (code_for_vsetvl_no_side_effects (Pmode));
}
};
@@ -22,5 +22,5 @@ void f (int32_t * a, int32_t * b, int n)
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -22,7 +22,6 @@ void f (int32_t * a, int32_t * b, int n)
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+} 2 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-O1" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int cond, int avl) {
+
+ size_t vl;
+ if (cond == 1)
+ vl = __riscv_vsetvl_e32m1(avl);
+ else if (cond == 2)
+ vl = __riscv_vsetvl_e8mf4(avl);
+ else if (cond == 2)
+ vl = __riscv_vsetvl_e16mf2(avl);
+ else
+ vl = __riscv_vsetvl_e64m2(avl);
+ for (size_t i = 0; i < n; i += 1) {
+ vint32m1_t a = __riscv_vle32_v_i32m1(in1, vl);
+ vint32m1_t b = __riscv_vle32_v_i32m1_tu(a, in2, vl);
+ vint32m1_t c = __riscv_vle32_v_i32m1_tu(b, in3, vl);
+ __riscv_vse32_v_i32m1(out, c, vl);
+ }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
new file mode 100644
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int cond) {
+
+ size_t vl;
+ if (cond == 1)
+ vl = __riscv_vsetvlmax_e32m1();
+ else if (cond == 2)
+ vl = __riscv_vsetvlmax_e8mf4();
+ else if (cond == 2)
+ vl = __riscv_vsetvlmax_e16mf2();
+ else
+ vl = __riscv_vsetvlmax_e64m2();
+ for (size_t i = 0; i < n; i += 1) {
+ vint32m1_t a = __riscv_vle32_v_i32m1(in1, vl);
+ vint32m1_t b = __riscv_vle32_v_i32m1_tu(a, in2, vl);
+ vint32m1_t c = __riscv_vle32_v_i32m1_tu(b, in3, vl);
+ __riscv_vse32_v_i32m1(out, c, vl);
+ }
+}
+
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,4 +18,4 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
}
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,5 +18,5 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
}
-/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {srli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*4} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,4 +18,4 @@ void f(int8_t *base, int8_t *out, size_t vl, size_t m, size_t k) {
/* { dg-final { scan-assembler-times {slli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*4} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -22,6 +22,6 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
\ No newline at end of file
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,6 +18,6 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {slli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*5} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {slli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*5} 1 { target { no-opts "-O0" no-opts "-Os" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -17,5 +17,5 @@ void foo(void *in1, void *in2, void *in3, void *out, size_t n) {
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-O1" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -17,7 +17,7 @@ void foo(void *in1, void *in2, void *in3, void *out, size_t n) {
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e16,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*mf2,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e8,\s*m1,\s*t[au],\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf2,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
@@ -18,5 +18,5 @@ void foo(int32_t *in1, int32_t *in2, int32_t *in3, int32_t *out, size_t n, int c
}
}
-/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*tu,\s*m[au]} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
-/* { dg-final { scan-assembler-times {vsetvli} 2 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*tu,\s*m[au]} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times {vsetvli} 1 { target { no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */