[Committed] RISC-V: Add dynamic LMUL test for x264

Message ID 20231221085750.3541650-1-juzhe.zhong@rivai.ai
State Accepted
Series [Committed] RISC-V: Add dynamic LMUL test for x264

Checks

snail/gcc-patch-check: success (Github commit url)

Commit Message

juzhe.zhong@rivai.ai Dec. 21, 2023, 8:57 a.m. UTC
While evaluating x264 performance, I noticed that the best LMUL for this case with -march=rv64gcv is LMUL = 2.
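
The kernel in question is the 8x8 SAD routine added as the test case at the end of this patch; a rolled-up sketch is shown here for reference (the committed test manually unrolls the inner row loop and hard-codes the pix1 stride to 16):

	/* Rolled-up sketch of the x264 8x8 SAD kernel; the committed test
	   case below unrolls the inner loop by hand.  */
	int
	x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2, int i_stride_pix2)
	{
	  int i_sum = 0;
	  for (int y = 0; y < 8; y++)
	    {
	      /* Accumulate |pix1[x] - pix2[x]| over one 8-byte row.  */
	      for (int x = 0; x < 8; x++)
	        i_sum += __builtin_abs (pix1[x] - pix2[x]);
	      pix1 += 16;
	      pix2 += i_stride_pix2;
	    }
	  return i_sum;
	}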

LMUL = 1:

x264_pixel_8x8:
	add	a4,a1,a2
	addi	a6,a0,16
	vsetivli	zero,4,e8,mf4,ta,ma
	add	a5,a4,a2
	vle8.v	v12,0(a6)
	vle8.v	v2,0(a4)
	addi	a6,a0,4
	addi	a4,a4,4
	vle8.v	v11,0(a6)
	vle8.v	v9,0(a4)
	addi	a6,a1,4
	addi	a4,a0,32
	vle8.v	v13,0(a0)
	vle8.v	v1,0(a1)
	vle8.v	v4,0(a6)
	vle8.v	v8,0(a4)
	vle8.v	v7,0(a5)
	vwsubu.vv	v3,v13,v1
	add	a3,a5,a2
	addi	a6,a0,20
	addi	a4,a0,36
	vle8.v	v10,0(a6)
	vle8.v	v6,0(a4)
	addi	a5,a5,4
	vle8.v	v5,0(a5)
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v1,v12,v2
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vmv1r.v	v2,v1
	vwadd.vv	v1,v3,v2
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v2,v11,v4
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmslt.vi	v0,v2,0
	vneg.v	v2,v2,v0.t
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v3,v10,v9
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmv1r.v	v4,v2
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.vv	v2,v4,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v3,v8,v7
	vsetvli	zero,zero,e16,mf2,ta,mu
	add	a4,a3,a2
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v1,v1,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	add	a5,a4,a2
	vwsubu.vv	v3,v6,v5
	addi	a6,a0,48
	vsetvli	zero,zero,e16,mf2,ta,mu
	vle8.v	v16,0(a3)
	vle8.v	v12,0(a4)
	addi	a3,a3,4
	addi	a4,a4,4
	vle8.v	v17,0(a6)
	vle8.v	v14,0(a3)
	vle8.v	v10,0(a4)
	vle8.v	v8,0(a5)
	add	a6,a5,a2
	addi	a3,a0,64
	addi	a4,a0,80
	addi	a5,a5,4
	vle8.v	v13,0(a3)
	vle8.v	v4,0(a5)
	vle8.v	v9,0(a4)
	vle8.v	v6,0(a6)
	vmslt.vi	v0,v3,0
	addi	a7,a0,52
	vneg.v	v3,v3,v0.t
	vle8.v	v15,0(a7)
	vwadd.wv	v2,v2,v3
	addi	a3,a0,68
	addi	a4,a0,84
	vle8.v	v11,0(a3)
	vle8.v	v5,0(a4)
	addi	a5,a0,96
	vle8.v	v7,0(a5)
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v3,v17,v16
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v1,v1,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v3,v15,v14
	vsetvli	zero,zero,e16,mf2,ta,mu
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v2,v2,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	vwsubu.vv	v3,v13,v12
	vsetvli	zero,zero,e16,mf2,ta,mu
	slli	a4,a2,3
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v1,v1,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	sub	a4,a4,a2
	vwsubu.vv	v3,v11,v10
	vsetvli	zero,zero,e16,mf2,ta,mu
	add	a1,a1,a4
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v2,v2,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	lbu	a7,0(a1)
	vwsubu.vv	v3,v9,v8
	lbu	a5,112(a0)
	vsetvli	zero,zero,e16,mf2,ta,mu
	subw	a5,a5,a7
	vmslt.vi	v0,v3,0
	lbu	a3,113(a0)
	vneg.v	v3,v3,v0.t
	lbu	a4,1(a1)
	vwadd.wv	v1,v1,v3
	addi	a6,a6,4
	vsetvli	zero,zero,e8,mf4,ta,ma
	subw	a3,a3,a4
	vwsubu.vv	v3,v5,v4
	addi	a2,a0,100
	vsetvli	zero,zero,e16,mf2,ta,mu
	vle8.v	v4,0(a6)
	sraiw	a6,a5,31
	vle8.v	v5,0(a2)
	sraiw	a7,a3,31
	vmslt.vi	v0,v3,0
	xor	a2,a5,a6
	vneg.v	v3,v3,v0.t
	vwadd.wv	v2,v2,v3
	vsetvli	zero,zero,e8,mf4,ta,ma
	lbu	a4,114(a0)
	vwsubu.vv	v3,v7,v6
	lbu	t1,2(a1)
	vsetvli	zero,zero,e16,mf2,ta,mu
	subw	a2,a2,a6
	xor	a6,a3,a7
	vmslt.vi	v0,v3,0
	subw	a4,a4,t1
	vneg.v	v3,v3,v0.t
	lbu	t1,3(a1)
	vwadd.wv	v1,v1,v3
	lbu	a5,115(a0)
	subw	a6,a6,a7
	vsetvli	zero,zero,e8,mf4,ta,ma
	li	a7,0
	vwsubu.vv	v3,v5,v4
	sraiw	t3,a4,31
	vsetvli	zero,zero,e16,mf2,ta,mu
	subw	a5,a5,t1
	vmslt.vi	v0,v3,0
	vneg.v	v3,v3,v0.t
	vwadd.wv	v2,v2,v3
	sraiw	t1,a5,31
	vsetvli	zero,zero,e32,m1,ta,ma
	xor	a4,a4,t3
	vadd.vv	v1,v1,v2
	vmv.s.x	v2,a7
	vredsum.vs	v1,v1,v2
	vmv.x.s	a7,v1
	addw	a2,a7,a2
	subw	a4,a4,t3
	addw	a6,a6,a2
	xor	a2,a5,t1
	lbu	a3,116(a0)
	lbu	t4,4(a1)
	addw	a4,a4,a6
	subw	a2,a2,t1
	lbu	a5,5(a1)
	subw	a3,a3,t4
	addw	a2,a2,a4
	lbu	a4,117(a0)
	lbu	t1,6(a1)
	sraiw	a7,a3,31
	subw	a4,a4,a5
	lbu	a5,118(a0)
	sraiw	a6,a4,31
	subw	a5,a5,t1
	xor	a3,a3,a7
	lbu	t1,7(a1)
	lbu	a0,119(a0)
	sraiw	a1,a5,31
	subw	a0,a0,t1
	subw	a3,a3,a7
	xor	a4,a4,a6
	addw	a3,a3,a2
	subw	a4,a4,a6
	sraiw	a2,a0,31
	xor	a5,a5,a1
	addw	a4,a4,a3
	subw	a5,a5,a1
	xor	a0,a0,a2
	addw	a5,a5,a4
	subw	a0,a0,a2
	addw	a0,a0,a5
	ret

LMUL = dynamic:

x264_pixel_8x8:
	add	a7,a1,a2
	vsetivli	zero,8,e8,mf2,ta,ma
	add	a6,a7,a2
	vle8.v	v1,0(a1)
	add	a3,a6,a2
	vle8.v	v2,0(a7)
	add	a4,a3,a2
	vle8.v	v13,0(a0)
	vle8.v	v7,0(a4)
	vwsubu.vv	v4,v13,v1
	vle8.v	v11,0(a6)
	vle8.v	v9,0(a3)
	add	a5,a4,a2
	addi	t1,a0,16
	vle8.v	v5,0(a5)
	vle8.v	v3,0(t1)
	addi	a7,a0,32
	addi	a6,a0,48
	vle8.v	v12,0(a7)
	vle8.v	v10,0(a6)
	addi	a3,a0,64
	addi	a4,a0,80
	vle8.v	v8,0(a3)
	vle8.v	v6,0(a4)
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v4,0
	vneg.v	v4,v4,v0.t
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v3,v2
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vwadd.vv	v2,v4,v1
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v12,v11
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vwadd.wv	v2,v2,v1
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v10,v9
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vwadd.wv	v2,v2,v1
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v8,v7
	vsetvli	zero,zero,e16,m1,ta,mu
	slli	a4,a2,3
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vwadd.wv	v2,v2,v1
	vsetvli	zero,zero,e8,mf2,ta,ma
	sub	a4,a4,a2
	vwsubu.vv	v1,v6,v5
	vsetvli	zero,zero,e16,m1,ta,mu
	addi	a3,a0,96
	vmslt.vi	v0,v1,0
	vle8.v	v7,0(a3)
	vneg.v	v1,v1,v0.t
	add	a5,a5,a2
	vwadd.wv	v2,v2,v1
	vle8.v	v6,0(a5)
	addi	a0,a0,112
	add	a1,a1,a4
	vle8.v	v5,0(a0)
	vle8.v	v4,0(a1)
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v7,v6
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vwadd.wv	v2,v2,v1
	vsetvli	zero,zero,e32,m2,ta,ma
	li	a5,0
	vmv.s.x	v1,a5
	vredsum.vs	v1,v2,v1
	vmv.x.s	a0,v1
	vsetvli	zero,zero,e8,mf2,ta,ma
	vwsubu.vv	v1,v5,v4
	vsetvli	zero,zero,e16,m1,ta,mu
	vmslt.vi	v0,v1,0
	vneg.v	v1,v1,v0.t
	vsetivli	zero,1,e32,m1,ta,ma
	vmv.s.x	v2,a5
	vsetivli	zero,8,e16,m1,ta,ma
	vwredsumu.vs	v1,v1,v2
	vsetivli	zero,0,e32,m1,ta,ma
	vmv.x.s	a5,v1
	addw	a0,a0,a5
	ret

Notice that we get much better codegen and a performance gain with --param=riscv-autovec-lmul=dynamic,
which is able to pick the best LMUL (M2): each 8-byte row is loaded with a single e8,mf2 load, the
absolute differences are accumulated in e16,m1, and the sum is reduced with an e32,m2 vredsum, instead
of the e8,mf4 half-row fragments and scalar tail code of the LMUL = 1 version.

Add a test so that future changes do not accidentally regress performance on x264.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: New test.

---
 .../costmodel/riscv/rvv/dynamic-lmul2-7.c     | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
  

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
new file mode 100644
index 00000000000..87e963edc47
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic" } */
+
+int
+x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2, int i_stride_pix2)
+{
+  int i_sum = 0;
+  for (int y = 0; y < 8; y++)
+    {
+      i_sum += __builtin_abs (pix1[0] - pix2[0]);
+      i_sum += __builtin_abs (pix1[1] - pix2[1]);
+      i_sum += __builtin_abs (pix1[2] - pix2[2]);
+      i_sum += __builtin_abs (pix1[3] - pix2[3]);
+      i_sum += __builtin_abs (pix1[4] - pix2[4]);
+      i_sum += __builtin_abs (pix1[5] - pix2[5]);
+      i_sum += __builtin_abs (pix1[6] - pix2[6]);
+      i_sum += __builtin_abs (pix1[7] - pix2[7]);
+      pix1 += 16;
+      pix2 += i_stride_pix2;
+    }
+  return i_sum;
+}
+
+/* { dg-final { scan-assembler {e32,m2} } } */