[v1,1/2] LoongArch: Optimize immediate load.

Message ID 20221029070524.2570782-2-chenglulu@loongson.cn
State Accepted
Headers
Series Optimize immediate load. Add prefetch insns. |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

chenglulu Oct. 29, 2022, 7:05 a.m. UTC
  Fix an issue where the compiler would not hoist the four instructions
that load a 64-bit immediate out of a loop.

gcc/ChangeLog:

	* config/loongarch/constraints.md (x): New constraint.
	* config/loongarch/loongarch.cc (struct loongarch_integer_op):
	Define a new member curr_value, that records the value of
	the number stored in the destination register immediately
	after the current instruction has run.
	(loongarch_build_integer): Add a method to load bits 32 to 63
	of the immediate.
	(loongarch_move_integer): Same as above.
	* config/loongarch/loongarch.h (HWIT_UC_0xFFFFFFFF): New macro.
	(HI32_OPERAND): Likewise.
	* config/loongarch/loongarch.md (load_hi32): New template.
	* config/loongarch/predicates.md (const_hi32_operand): New predicate.
	Determines whether the value is an immediate number whose low
	32 bits are all zero, i.e. only the higher 32 bits may be set.
	(hi32_mask_operand): New predicate.  Matches the mask 0xffffffff.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/imm-load.c: New test.
---
 gcc/config/loongarch/constraints.md           |  7 +-
 gcc/config/loongarch/loongarch.cc             | 95 ++++++++++++-------
 gcc/config/loongarch/loongarch.h              |  6 ++
 gcc/config/loongarch/loongarch.md             | 26 +++++
 gcc/config/loongarch/predicates.md            |  8 ++
 gcc/testsuite/gcc.target/loongarch/imm-load.c | 25 +++++
 6 files changed, 133 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c
  

Patch

diff --git a/gcc/config/loongarch/constraints.md b/gcc/config/loongarch/constraints.md
index 43cb7b5f0f5..1dcf09ce5eb 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@ 
 ;; "u" "A signed 52bit constant and low 32-bit is zero (for logic instructions)"
 ;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic instructions)."
 ;; "w" "Matches any valid memory."
-;; "x" <-----unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
 ;; "y" <-----unused
 ;; "z" FCC_REGS
 ;; "A" <-----unused
@@ -139,6 +139,11 @@  (define_constraint "v"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (ival)")))
 
+(define_constraint "x"
+  "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (ival)")))
+
 (define_register_constraint "z" "FCC_REGS"
   "A floating-point condition code register.")
 
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index f54c233f90c..5e8cd293645 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,6 +139,9 @@  struct loongarch_address_info
    METHOD_LU52I:
      Load 52-63 bit of the immediate number.
 
+   METHOD_LD_HI32:
+     Load 32-63 bit of the immediate number.
+
    METHOD_INSV:
      immediate like 0xfff00000fffffxxx
    */
@@ -147,13 +150,18 @@  enum loongarch_load_imm_method
   METHOD_NORMAL,
   METHOD_LU32I,
   METHOD_LU52I,
+  METHOD_LD_HI32,
   METHOD_INSV
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* The immediate operand of the load instruction at this step.  */
   HOST_WIDE_INT value;
+  /* The value held in the destination register immediately after the
+     instruction at this step has run.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
@@ -1474,24 +1482,27 @@  loongarch_build_integer (struct loongarch_integer_op *codes,
     {
       /* The value of the lower 32 bit be loaded with one instruction.
 	 lu12i.w.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part;
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part;
+      codes[cost].curr_value = low_part;
       cost++;
     }
   else
     {
       /* lu12i.w + ior.  */
-      codes[0].code = UNKNOWN;
-      codes[0].method = METHOD_NORMAL;
-      codes[0].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].code = UNKNOWN;
+      codes[cost].method = METHOD_NORMAL;
+      codes[cost].value = low_part & ~(IMM_REACH - 1);
+      codes[cost].curr_value = codes[cost].value;
       cost++;
       HOST_WIDE_INT iorv = low_part & (IMM_REACH - 1);
       if (iorv != 0)
 	{
-	  codes[1].code = IOR;
-	  codes[1].method = METHOD_NORMAL;
-	  codes[1].value = iorv;
+	  codes[cost].code = IOR;
+	  codes[cost].method = METHOD_NORMAL;
+	  codes[cost].value = iorv;
+	  codes[cost].curr_value = low_part;
 	  cost++;
 	}
     }
@@ -1514,23 +1525,34 @@  loongarch_build_integer (struct loongarch_integer_op *codes,
 	{
 	  codes[cost].method = METHOD_LU52I;
 	  codes[cost].value = value & LU52I_B;
-	  return cost + 1;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xfffffffffffff);
+	  return cost + 1;
 	}
 
-      codes[cost].method = METHOD_LU32I;
-      codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
-      cost++;
-
-      /* Determine whether the 52-61 bits are sign-extended from the low order,
-	 and if not, load the 52-61 bits.  */
-      if (!lu52i[(value & (HOST_WIDE_INT_1U << 51)) >> 51])
+      if (lu52i[sign51])
 	{
+	  /* Determine whether bits 52-63 are sign-extended from the low order.
+	     If so, bits 52-63 of the immediate number do not need to be loaded.
+	  */
+	  codes[cost].method = METHOD_LU32I;
+	  codes[cost].value = (value & LU32I_B) | (sign51 ? LU52I_B : 0);
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xffffffff);
+	  cost++;
+	}
+      else
+	{
+	  /* If the higher 32 bits of the 64bit immediate need to be loaded
+	     separately by two instructions, a false immediate load instruction
+	     load_hi32 is used to load them.  */
+	  codes[cost].method = METHOD_LD_HI32;
+	  codes[cost].value = value & 0xffffffff00000000;
+	  codes[cost].curr_value = codes[cost].value | (codes[cost-1].curr_value &
+							0xffffffff);
 	  cost++;
 	}
     }
-
   gcc_assert (cost <= LARCH_MAX_INTEGER_OPS);
 
   return cost;
@@ -2910,30 +2932,37 @@  loongarch_move_integer (rtx temp, rtx dest, unsigned HOST_WIDE_INT value)
       else
 	x = force_reg (mode, x);
 
+      set_unique_reg_note (get_last_insn (), REG_EQUAL, GEN_INT (codes[i-1].curr_value));
+
       switch (codes[i].method)
 	{
 	case METHOD_NORMAL:
+	  /* mov or ior.  */
 	  x = gen_rtx_fmt_ee (codes[i].code, mode, x,
 			      GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU32I:
-	  emit_insn (
-	    gen_rtx_SET (x,
-			 gen_rtx_IOR (DImode,
-				      gen_rtx_ZERO_EXTEND (
-					DImode, gen_rtx_SUBREG (SImode, x, 0)),
-				      GEN_INT (codes[i].value))));
+	  gcc_assert (mode == DImode);
+	  /* lu32i_d */
+	  x = gen_rtx_IOR (mode, gen_rtx_ZERO_EXTEND (mode,
+						gen_rtx_SUBREG (SImode, x, 0)),
+			   GEN_INT (codes[i].value));
 	  break;
 	case METHOD_LU52I:
-	  emit_insn (gen_lu52i_d (x, x, GEN_INT (0xfffffffffffff),
-				  GEN_INT (codes[i].value)));
+	  gcc_assert (mode == DImode);
+	  /* lu52i_d */
+	  x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xfffffffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
-	case METHOD_INSV:
-	  emit_insn (
-	    gen_rtx_SET (gen_rtx_ZERO_EXTRACT (DImode, x, GEN_INT (20),
-					       GEN_INT (32)),
-			 gen_rtx_REG (DImode, 0)));
+	case METHOD_LD_HI32:
+	  /* Load the high 32 bits of the immediate number.  */
+	  gcc_assert (mode == DImode);
+	  /* load_hi32 */
+	  x = gen_rtx_IOR (mode, gen_rtx_AND (mode, x, GEN_INT (0xffffffff)),
+			   GEN_INT (codes[i].value));
 	  break;
+	case METHOD_INSV:
+	  /* It is not currently implemented.  */
 	default:
 	  gcc_unreachable ();
 	}
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index f4a9c329fef..cfc046f546e 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -605,6 +605,12 @@  enum reg_class
 #define LU52I_OPERAND(VALUE) \
   (((VALUE) | (HWIT_UC_0xFFF << 52)) == (HWIT_UC_0xFFF << 52))
 
+/* True if VALUE can be loaded into a register using load_hi32.  */
+
+#define HWIT_UC_0xFFFFFFFF HOST_WIDE_INT_UC(0xffffffff)
+#define HI32_OPERAND(VALUE) \
+  (((VALUE) | (HWIT_UC_0xFFFFFFFF << 32)) == (HWIT_UC_0xFFFFFFFF << 32))
+
 /* Return a value X with the low 12 bits clear, and such that
    VALUE - X is a signed 12-bit value.  */
 
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 214b14bddd3..7eaa9ab66e3 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -1882,6 +1882,32 @@  (define_expand "mov<mode>cc"
   DONE;
 })
 
+(define_insn_and_split "load_hi32"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(ior:DI
+	  (and:DI (match_operand:DI 1 "register_operand" "0")
+		  (match_operand 2 "hi32_mask_operand"))
+	(match_operand 3 "const_hi32_operand" "x")))]
+  "TARGET_64BIT"
+  "#"
+  ""
+  [(set (match_dup 0)
+        (ior:DI
+          (zero_extend:DI
+            (subreg:SI (match_dup 1) 0))
+          (match_dup 4)))
+   (set (match_dup 0)
+        (ior:DI
+          (and:DI (match_dup 0)
+                  (match_dup 6))
+          (match_dup 5)))]
+{
+  operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);
+  operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0000000000000);
+  operands[6] = GEN_INT (0xfffffffffffff);
+}
+  [(set_attr "insn_count" "2")])
+
 (define_insn "lu32i_d"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(ior:DI
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index 8bd0c1376c9..29d81ff0250 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -35,6 +35,10 @@  (define_predicate "const_lu52i_operand"
   (and (match_code "const_int")
        (match_test "LU52I_OPERAND (INTVAL (op))")))
 
+(define_predicate "const_hi32_operand"
+  (and (match_code "const_int")
+       (match_test "HI32_OPERAND (INTVAL (op))")))
+
 (define_predicate "const_arith_operand"
   (and (match_code "const_int")
        (match_test "IMM12_OPERAND (INTVAL (op))")))
@@ -103,6 +107,10 @@  (define_predicate "lu52i_mask_operand"
   (and (match_code "const_int")
        (match_test "UINTVAL (op) == 0xfffffffffffff")))
 
+(define_predicate "hi32_mask_operand"
+  (and (match_code "const_int")
+       (match_test "UINTVAL (op) == 0xffffffff")))
+
 (define_predicate "low_bitmask_operand"
   (and (match_code "const_int")
        (match_test "low_bitmask_len (mode, INTVAL (op)) > 12")))
diff --git a/gcc/testsuite/gcc.target/loongarch/imm-load.c b/gcc/testsuite/gcc.target/loongarch/imm-load.c
new file mode 100644
index 00000000000..91ceb33d058
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/imm-load.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2 -fdump-rtl-loop2_invariant" } */
+
+extern long long b[10];
+static inline long long
+repeat_bytes (void)
+{
+  long long r = 0x0101010101010101;
+
+  return r;
+}
+
+static inline long long
+highbit_mask (long long m)
+{
+  return m & repeat_bytes ();
+}
+
+void test(long long *a)
+{
+  for (int i = 0; i < 10; i++)
+    b[i] = highbit_mask (a[i]);
+
+}
+/* { dg-final { scan-rtl-dump-times "moved without introducing a new temporary register" 4 "loop2_invariant" } } */