Pushed: [PATCH v2] LoongArch: Use LSX and LASX for block move

Message ID b0e6dc5d6e4a5cca7f2abf52f0472da0ba9b80fb.camel@xry111.site
State Accepted
Headers
Series Pushed: [PATCH v2] LoongArch: Use LSX and LASX for block move

Checks

Context | Check | Description
snail/gcc-patch-check | success | GitHub commit URL

Commit Message

Xi Ruoyao Sept. 9, 2023, 7:03 a.m. UTC
  Pushed r14-3818 with test cases added.  The pushed patch is attached.

On Sat, 2023-09-09 at 14:10 +0800, chenglulu wrote:
> 
> 在 2023/9/8 上午12:14, Xi Ruoyao 写道:
> > gcc/ChangeLog:
> > 
> >         * config/loongarch/loongarch.h (LARCH_MAX_MOVE_PER_INSN):
> >         Define to the maximum amount of bytes able to be loaded or
> >         stored with one machine instruction.
> >         * config/loongarch/loongarch.cc (loongarch_mode_for_move_size):
> >         New static function.
> >         (loongarch_block_move_straight): Call
> >         loongarch_mode_for_move_size for machine_mode to be moved.
> >         (loongarch_expand_block_move): Use LARCH_MAX_MOVE_PER_INSN
> >         instead of UNITS_PER_WORD.
> > ---
> > 
> > Bootstrapped and regtested on loongarch64-linux-gnu, with PR110939 patch
> > applied, the "lib_build_self_spec = %<..." line in t-linux commented out
> > (because it's silently making -mlasx in BOOT_CFLAGS ineffective, Yujie
> > is working on a proper fix), and BOOT_CFLAGS="-O3 -mlasx".  Ok for trunk?
> 
> I think test cases need to be added here.
> 
> Otherwise OK, thanks!

/* snip */
  

Patch

From 35adc54b55aa199f17e2c84e382792e424b6171e Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Tue, 5 Sep 2023 21:02:38 +0800
Subject: [PATCH v2] LoongArch: Use LSX and LASX for block move

gcc/ChangeLog:

	* config/loongarch/loongarch.h (LARCH_MAX_MOVE_PER_INSN):
	Define to the maximum amount of bytes able to be loaded or
	stored with one machine instruction.
	* config/loongarch/loongarch.cc (loongarch_mode_for_move_size):
	New static function.
	(loongarch_block_move_straight): Call
	loongarch_mode_for_move_size for machine_mode to be moved.
	(loongarch_expand_block_move): Use LARCH_MAX_MOVE_PER_INSN
	instead of UNITS_PER_WORD.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/memcpy-vec-1.c: New test.
	* gcc.target/loongarch/memcpy-vec-2.c: New test.
	* gcc.target/loongarch/memcpy-vec-3.c: New test.
---
 gcc/config/loongarch/loongarch.cc             | 22 +++++++++++++++----
 gcc/config/loongarch/loongarch.h              |  3 +++
 .../gcc.target/loongarch/memcpy-vec-1.c       | 11 ++++++++++
 .../gcc.target/loongarch/memcpy-vec-2.c       | 12 ++++++++++
 .../gcc.target/loongarch/memcpy-vec-3.c       |  6 +++++
 5 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-1.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-2.c
 create mode 100644 gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 6698414281e..509ef2b97f1 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -5191,6 +5191,20 @@  loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
   return true;
 }
 
+static machine_mode
+loongarch_mode_for_move_size (HOST_WIDE_INT size)
+{
+  switch (size)
+    {
+    case 32:
+      return V32QImode;
+    case 16:
+      return V16QImode;
+    }
+
+  return int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
+}
+
 /* Emit straight-line code to move LENGTH bytes from SRC to DEST.
    Assume that the areas do not overlap.  */
 
@@ -5220,7 +5234,7 @@  loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length,
 
   for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
     {
-      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
+      mode = loongarch_mode_for_move_size (delta_cur);
 
       for (; offs + delta_cur <= length; offs += delta_cur, i++)
 	{
@@ -5231,7 +5245,7 @@  loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length,
 
   for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2)
     {
-      mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require ();
+      mode = loongarch_mode_for_move_size (delta_cur);
 
       for (; offs + delta_cur <= length; offs += delta_cur, i++)
 	loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]);
@@ -5326,8 +5340,8 @@  loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align)
 
   HOST_WIDE_INT align = INTVAL (r_align);
 
-  if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD)
-    align = UNITS_PER_WORD;
+  if (!TARGET_STRICT_ALIGN || align > LARCH_MAX_MOVE_PER_INSN)
+    align = LARCH_MAX_MOVE_PER_INSN;
 
   if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT)
     {
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index 3fc9dc43ab1..7e391205583 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1181,6 +1181,9 @@  typedef struct {
    least twice.  */
 #define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2)
 
+#define LARCH_MAX_MOVE_PER_INSN \
+  (ISA_HAS_LASX ? 32 : (ISA_HAS_LSX ? 16 : UNITS_PER_WORD))
+
 /* The base cost of a memcpy call, for MOVE_RATIO and friends.  These
    values were determined experimentally by benchmarking with CSiBE.
 */
diff --git a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-1.c b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-1.c
new file mode 100644
index 00000000000..8d9fedc9e4f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -march=la464 -mno-strict-align" } */
+/* { dg-final { scan-assembler-times "xvst" 2 } } */
+/* { dg-final { scan-assembler-times "\tvst" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 95); }
diff --git a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-2.c b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-2.c
new file mode 100644
index 00000000000..6b28b884db0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-2.c
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mabi=lp64d -march=la464 -mno-strict-align" } */
+/* { dg-final { scan-assembler-times "xvst" 2 } } */
+/* { dg-final { scan-assembler-times "\tvst" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.h" 1 } } */
+/* { dg-final { scan-assembler-times "st\\.b" 1 } } */
+
+typedef char __attribute__ ((vector_size (32), aligned (32))) vec;
+extern vec a[], b[];
+void test() { __builtin_memcpy(a, b, 95); }
diff --git a/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c
new file mode 100644
index 00000000000..233ed215078
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/memcpy-vec-3.c
@@ -0,0 +1,6 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=la464 -mabi=lp64d -mstrict-align" } */
+/* { dg-final { scan-assembler-not "vst" } } */
+
+extern char a[], b[];
+void test() { __builtin_memcpy(a, b, 16); }
-- 
2.42.0