[v2] AArch64: Add inline memmove expansion

Message ID PAWPR08MB898291CAEA27140073ADE93983D7A@PAWPR08MB8982.eurprd08.prod.outlook.com
State Not Applicable
Headers
Series [v2] AArch64: Add inline memmove expansion |

Checks

Context Check Description
snail/gcc-patch-check fail Git am fail log

Commit Message

Wilco Dijkstra Oct. 16, 2023, 12:27 p.m. UTC
  v2: further cleanups, improved comments

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather
than being interleaved.  The maximum size is 256 bytes, which requires at
most 16 registers.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
        Change default.
        * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
        (movmemdi): Call aarch64_expand_cpymem.
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
        simplify, support storing generated loads/stores. 
        (aarch64_expand_cpymem): Support expansion of memmove.
        * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
        * gcc.target/aarch64/memmove.c: Add new test.

---
  

Comments

Wilco Dijkstra Nov. 6, 2023, 12:09 p.m. UTC | #1
ping
 
v2: further cleanups, improved comments

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather
than being interleaved.  The maximum size is 256 bytes, which requires at
most 16 registers.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
        Change default.
        * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
        (movmemdi): Call aarch64_expand_cpymem.
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
        simplify, support storing generated loads/stores. 
        (aarch64_expand_cpymem): Support expansion of memmove.
        * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
        * gcc.target/aarch64/memmove.c: Add new test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
-                                             machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+                       int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
-     address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explict load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
       mode = V4SImode;
+      rtx src1 = adjust_address (src, mode, offset);
+      rtx src2 = adjust_address (src, mode, offset + 16);
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
       rtx reg1 = gen_reg_rtx (mode);
       rtx reg2 = gen_reg_rtx (mode);
-      /* "Cast" the pointers to the correct mode.  */
-      *src = adjust_address (*src, mode, 0);
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memcpy.  */
-      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-                                       aarch64_progress_pointer (*src)));
-      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-                                        aarch64_progress_pointer (*dst), reg2));
-      /* Move the pointers forward.  */
-      *src = aarch64_move_pointer (*src, 32);
-      *dst = aarch64_move_pointer (*dst, 32);
+      *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+      *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
       return;
     }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
     return false;
@@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
-  int mode_bits;
+  int mode_bytes;
   rtx dst = operands[0];
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode;
-  bool size_p = optimize_function_for_size_p (cfun);
+  machine_mode cur_mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned max_copy_size = 256;
-  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
+  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+                                      : aarch64_mops_memcpy_size_threshold;
+
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_copy_size /= 4;
 
   /* Large copies use MOPS when available or a library call.  */
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  int copy_bits = 256;
+  unsigned copy_max = 32;
 
-  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
-     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
+  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
+     support or slow LDP/STP fall back to 16-byte chunks.  */
   if (size <= 24
       || !TARGET_SIMD
       || (aarch64_tune_params.extra_tuning_flags
           & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    copy_bits = 128;
-
-  /* Emit an inline load+store sequence and count the number of operations
-     involved.  We use a simple count of just the loads and stores emitted
-     rather than rtx_insn count as all the pointer adjustments and reg copying
-     in this function will get optimized away later in the pipeline.  */
-  start_sequence ();
-  unsigned nops = 0;
+    copy_max = 16;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -25354,69 +25336,60 @@ aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert size to bits to make the rest of the code simpler.  */
-  int n = size * BITS_PER_UNIT;
+  const int max_ops = 40;
+  rtx load[max_ops], store[max_ops];
 
-  while (n > 0)
+  int nops, offset;
+
+  for (nops = 0, offset = 0; size > 0; nops++)
     {
       /* Find the largest mode in which to do the copy in without over reading
          or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
           cur_mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (cur_mode != BLKmode && nops < max_ops);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
 
       /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bits == 128 && copy_bits == 256)
+      if (mode_bytes == 16 && copy_max == 32)
         cur_mode = V4SImode;
 
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-      /* A single block copy is 1 load + 1 store.  */
-      nops += 2;
-      n -= mode_bits;
+      aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset, cur_mode);
+      size -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
-       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
         {
-         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
-         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-         gcc_assert (n_bits <= mode_bits);
-         src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
-         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-         n = n_bits;
+         next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
+         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+         gcc_assert (n_bytes <= mode_bytes);
+         offset -= n_bytes - size;
+         size = n_bytes;
         }
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
-     the constant size into a register.  */
-  unsigned mops_cost = 3 + 1;
-
-  /* If MOPS is available at this point we don't consider the libcall as it's
-     not a win even on code size.  At this point only consider MOPS if
-     optimizing for size.  For speed optimizations we will have chosen between
-     the two based on copy size already.  */
-  if (TARGET_MOPS)
-    {
-      if (size_p && mops_cost < nops)
-       return aarch64_expand_cpymem_mops (operands);
-      emit_insn (seq);
-      return true;
-    }
 
-  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  When MOPS is not available and we're
-     optimizing for size a libcall may be preferable.  */
-  unsigned libcall_cost = 4;
-  if (size_p && libcall_cost < nops)
-    return false;
+  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
+  int i, j, m, inc;
+  inc = is_memmove ? nops : 3;
+  if (nops == inc + 1)
+    inc = nops / 2;
+  for (i = 0; i < nops; i += inc)
+    {
+      m = inc;
+      if (i + m > nops)
+       m = nops - i;
 
-  emit_insn (seq);
+      for (j = 0; j < m; j++)
+       emit_insn (load[i + j]);
+      for (j = 0; j < m; j++)
+       emit_insn (store[i + j]);
+    }
   return true;
 }
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1cb3a01d6791a48dc0b08df5783d97805448c7f2..18dd629c2456041b1185eae6d39de074709b2a39 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1629,7 +1629,7 @@ (define_expand "cpymemdi"
    (match_operand:DI 3 "immediate_operand")]
    ""
 {
-  if (aarch64_expand_cpymem (operands))
+  if (aarch64_expand_cpymem (operands, false))
     DONE;
   FAIL;
 }
@@ -1673,17 +1673,9 @@ (define_expand "movmemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "TARGET_MOPS"
+   ""
 {
-   rtx sz_reg = operands[2];
-   /* For constant-sized memmoves check the threshold.
-      FIXME: We should add a non-MOPS memmove expansion for smaller,
-      constant-sized memmove to avoid going to a libcall.  */
-   if (CONST_INT_P (sz_reg)
-       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
-     FAIL;
-
-  if (aarch64_expand_cpymem_mops (operands, true))
+  if (aarch64_expand_cpymem (operands, true))
     DONE;
   FAIL;
 }
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
 Constant memcpy size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memmove-size-threshold=
-Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
+Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memset-size-threshold=
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
new file mode 100644
index 0000000000000000000000000000000000000000..6926a97761eb2578d3f1db7e6eb19dba17b888be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
  
Richard Sandiford Nov. 29, 2023, 6:30 p.m. UTC | #2
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> v2: further cleanups, improved comments
>
> Add support for inline memmove expansions.  The generated code is identical
> as for memcpy, except that all loads are emitted before stores rather than
> being interleaved.  The maximum size is 256 bytes which requires at most 16
> registers.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
>         Change default.
>         * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
>         (movmemdi): Call aarch64_expand_cpymem.
>         * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
>         simplify, support storing generated loads/stores.
>         (aarch64_expand_cpymem): Support expansion of memmove.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.
>
> gcc/testsuite/ChangeLog/
>         * gcc.target/aarch64/memmove.c: Add new test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
>  bool aarch64_expand_cpymem_mops (rtx *, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25238,52 +25238,37 @@ aarch64_progress_pointer (rtx pointer)
>    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
>  }
>
> -/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> -   MODE bytes.  */
> +/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
>
>  static void
> -aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> -                                             machine_mode mode)
> +aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
> +                       int offset, machine_mode mode)
>  {
> -  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
> -     address copies using V4SImode so that we can use Q registers.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> +  /* Emit explict load/store pair instructions for 32-byte copies.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
>        mode = V4SImode;
> +      rtx src1 = adjust_address (src, mode, offset);
> +      rtx src2 = adjust_address (src, mode, offset + 16);
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      rtx dst2 = adjust_address (dst, mode, offset + 16);
>        rtx reg1 = gen_reg_rtx (mode);
>        rtx reg2 = gen_reg_rtx (mode);
> -      /* "Cast" the pointers to the correct mode.  */
> -      *src = adjust_address (*src, mode, 0);
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memcpy.  */
> -      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
> -                                       aarch64_progress_pointer (*src)));
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
> -                                        aarch64_progress_pointer (*dst), reg2));
> -      /* Move the pointers forward.  */
> -      *src = aarch64_move_pointer (*src, 32);
> -      *dst = aarch64_move_pointer (*dst, 32);
> +      *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
> +      *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
>        return;
>      }
>
>    rtx reg = gen_reg_rtx (mode);
> -
> -  /* "Cast" the pointers to the correct mode.  */
> -  *src = adjust_address (*src, mode, 0);
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memcpy.  */
> -  emit_move_insn (reg, *src);
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointers forward.  */
> -  *src = aarch64_progress_pointer (*src);
> -  *dst = aarch64_progress_pointer (*dst);
> +  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
> +  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
>  }
>
>  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
>     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
>     rather than memcpy.  Return true iff we succeeded.  */
>  bool
> -aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -25302,51 +25287,48 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
> -  int mode_bits;
> +  int mode_bytes;
>    rtx dst = operands[0];
>    rtx src = operands[1];
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode;
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  machine_mode cur_mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
> -    return aarch64_expand_cpymem_mops (operands);
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>
> -  /* Try to inline up to 256 bytes.  */
> -  unsigned max_copy_size = 256;
> -  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
> +  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
> +                                      : aarch64_mops_memcpy_size_threshold;
> +
> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_copy_size /= 4;
>
>    /* Large copies use MOPS when available or a library call.  */
>    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
> -    return aarch64_expand_cpymem_mops (operands);
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
> -  int copy_bits = 256;
> +  unsigned copy_max = 32;
>
> -  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
> -     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
> +  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
> +     support or slow LDP/STP fall back to 16-byte chunks.  */
>    if (size <= 24
>        || !TARGET_SIMD
>        || (aarch64_tune_params.extra_tuning_flags
>           & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> -    copy_bits = 128;
> -
> -  /* Emit an inline load+store sequence and count the number of operations
> -     involved.  We use a simple count of just the loads and stores emitted
> -     rather than rtx_insn count as all the pointer adjustments and reg copying
> -     in this function will get optimized away later in the pipeline.  */
> -  start_sequence ();
> -  unsigned nops = 0;
> +    copy_max = 16;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -25354,69 +25336,60 @@ aarch64_expand_cpymem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>
> -  /* Convert size to bits to make the rest of the code simpler.  */
> -  int n = size * BITS_PER_UNIT;
> +  const int max_ops = 40;
> +  rtx load[max_ops], store[max_ops];

Please either add a comment explaining why 40 is guaranteed to be
enough, or (my preference) use:

  auto_vec<std::pair<rtx, rtx>, ...> ops;

with safe_push.  Using auto_vec also removes the need to track nops
separately.
>
> -  while (n > 0)
> +  int nops, offset;
> +
> +  for (nops = 0, offset = 0; size > 0; nops++)
>      {
>        /* Find the largest mode in which to do the copy in without over reading
>          or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
>           cur_mode = mode_iter.require ();
>
> -      gcc_assert (cur_mode != BLKmode);
> +      gcc_assert (cur_mode != BLKmode && nops < max_ops);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
>
>        /* Prefer Q-register accesses for the last bytes.  */
> -      if (mode_bits == 128 && copy_bits == 256)
> +      if (mode_bytes == 16 && copy_max == 32)
>         cur_mode = V4SImode;
>
> -      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
> -      /* A single block copy is 1 load + 1 store.  */
> -      nops += 2;
> -      n -= mode_bits;
> +      aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset, cur_mode);
> +      size -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing copies using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - size;
> +         size = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
> -     the constant size into a register.  */
> -  unsigned mops_cost = 3 + 1;
> -
> -  /* If MOPS is available at this point we don't consider the libcall as it's
> -     not a win even on code size.  At this point only consider MOPS if
> -     optimizing for size.  For speed optimizations we will have chosen between
> -     the two based on copy size already.  */
> -  if (TARGET_MOPS)
> -    {
> -      if (size_p && mops_cost < nops)
> -       return aarch64_expand_cpymem_mops (operands);
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
> -     arguments + 1 for the call.  When MOPS is not available and we're
> -     optimizing for size a libcall may be preferable.  */
> -  unsigned libcall_cost = 4;
> -  if (size_p && libcall_cost < nops)
> -    return false;
> +  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
> +  int i, j, m, inc;
> +  inc = is_memmove ? nops : 3;
> +  if (nops == inc + 1)
> +    inc = nops / 2;

I realise this is personal preference, sorry, but I think this would
be better as:

  inc = is_memmove ? nops : nops == 4 ? 2 : 3;

since it separates the correctness requirement for is_memmove from the
optimisation of interleaving two accesses for nops==4.

> +  for (i = 0; i < nops; i += inc)
> +    {
> +      m = inc;
> +      if (i + m > nops)
> +       m = nops - i;
>
> -  emit_insn (seq);
> +      for (j = 0; j < m; j++)
> +       emit_insn (load[i + j]);
> +      for (j = 0; j < m; j++)
> +       emit_insn (store[i + j]);
> +    }

Also personal preference, but how about:

    m = MIN (i + inc, nops);
    for (j = i; j < m; j++)
      emit_insn (load[j]);
    for (j = i; j < m; j++)
      emit_insn (store[j]);

(adjusted for the comments above).  Not a strong opinion though, so feel
free to keep your version.

>    return true;
>  }
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 1cb3a01d6791a48dc0b08df5783d97805448c7f2..18dd629c2456041b1185eae6d39de074709b2a39 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1629,7 +1629,7 @@ (define_expand "cpymemdi"
>     (match_operand:DI 3 "immediate_operand")]
>     ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
> @@ -1673,17 +1673,9 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "TARGET_MOPS"
> +   ""
>  {
> -   rtx sz_reg = operands[2];
> -   /* For constant-sized memmoves check the threshold.
> -      FIXME: We should add a non-MOPS memmove expansion for smaller,
> -      constant-sized memmove to avoid going to a libcall.  */
> -   if (CONST_INT_P (sz_reg)
> -       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -     FAIL;
> -
> -  if (aarch64_expand_cpymem_mops (operands, true))
> +  if (aarch64_expand_cpymem (operands, true))
>      DONE;
>    FAIL;
>  }
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
>  Constant memcpy size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memmove-size-threshold=
> -Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
> +Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
>  Constant memmove size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memset-size-threshold=
> diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..6926a97761eb2578d3f1db7e6eb19dba17b888be
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */

Probably best to add:

#pragma GCC target "+nomops"

and have another copy of the test with -mstrict-align.  Keeping the
current test (as a third testcase) is fine too of course.

Otherwise it looks good.  You'd know the appropriate thresholds
better than me.

Thanks,
Richard

> +
> +void
> +copy1 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 12);
> +}
> +
> +void
> +copy2 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 128);
> +}
> +
> +void
> +copy3 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 255);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
  
Wilco Dijkstra Dec. 1, 2023, 4:56 p.m. UTC | #3
Hi Richard,

> +  rtx load[max_ops], store[max_ops];
>
> Please either add a comment explaining why 40 is guaranteed to be
> enough, or (my preference) use:
>
>  auto_vec<std::pair<rtx, rtx>, ...> ops;

I've changed to using auto_vec since that should help reduce conflicts
with Alex' LDP changes. I double-checked the maximum number of instructions;
with a minor tweak to handle AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS,
it can now be limited to 12 if you also select -mstrict-align.

v3: update after review, use auto_vec, tweak max_copy_size, add another test.

Add support for inline memmove expansions.  The generated code is identical
to that for memcpy, except that all loads are emitted before stores rather
than being interleaved.  The maximum size is 256 bytes, which requires at
most 16 registers.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog/
        * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
        Change default.
        * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
        (movmemdi): Call aarch64_expand_cpymem.
        * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
        simplify, support storing generated loads/stores. 
        (aarch64_expand_cpymem): Support expansion of memmove.
        * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.

gcc/testsuite/ChangeLog/
        * gcc.target/aarch64/memmove.c: Add new test.
        * gcc.target/aarch64/memmove2.c: Likewise.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index d2718cc87b306e9673b166cc40e0af2ba72aa17b..d958b181d79440ab1b4f274cc188559edc04c628 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 748b313092c5af452e9526a0c6747c51e598e4ca..26d1485ff6b977caeeb780dfaee739069742e638 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23058,51 +23058,41 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
+typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
+
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
-					      machine_mode mode)
+aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
+			int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
-     address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explicit load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
       mode = V4SImode;
+      rtx src1 = adjust_address (src, mode, offset);
+      rtx src2 = adjust_address (src, mode, offset + 16);
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
       rtx reg1 = gen_reg_rtx (mode);
       rtx reg2 = gen_reg_rtx (mode);
-      /* "Cast" the pointers to the correct mode.  */
-      *src = adjust_address (*src, mode, 0);
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memcpy.  */
-      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-					aarch64_progress_pointer (*src)));
-      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-					 aarch64_progress_pointer (*dst), reg2));
-      /* Move the pointers forward.  */
-      *src = aarch64_move_pointer (*src, 32);
-      *dst = aarch64_move_pointer (*dst, 32);
+      rtx load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+      rtx store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
+      ops.safe_push ({ load, store });
       return;
     }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
+  ops.safe_push ({ load, store });
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
     return false;
@@ -23121,51 +23111,47 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
-  int mode_bits;
+  int mode_bytes;
   rtx dst = operands[0];
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode;
-  bool size_p = optimize_function_for_size_p (cfun);
+  machine_mode cur_mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
+  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
+				   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
+
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned max_copy_size = use_ldpq ? 256 : 128;
+  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+				       : aarch64_mops_memcpy_size_threshold;
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned max_copy_size = 256;
-  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_copy_size /= 4;
 
   /* Large copies use MOPS when available or a library call.  */
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
-    return aarch64_expand_cpymem_mops (operands);
-
-  int copy_bits = 256;
-
-  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
-     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
-  if (size <= 24
-      || !TARGET_SIMD
-      || (aarch64_tune_params.extra_tuning_flags
-	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    copy_bits = 128;
-
-  /* Emit an inline load+store sequence and count the number of operations
-     involved.  We use a simple count of just the loads and stores emitted
-     rather than rtx_insn count as all the pointer adjustments and reg copying
-     in this function will get optimized away later in the pipeline.  */
-  start_sequence ();
-  unsigned nops = 0;
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
+
+  unsigned copy_max = 32;
+
+  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
+     support or slow LDP/STP fall back to 16-byte chunks.  */
+  if (size <= 24 || !use_ldpq)
+    copy_max = 16;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -23173,69 +23159,57 @@ aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert size to bits to make the rest of the code simpler.  */
-  int n = size * BITS_PER_UNIT;
+  copy_ops ops;
 
-  while (n > 0)
+  int offset = 0;
+
+  while (size > 0)
     {
       /* Find the largest mode in which to do the copy in without over reading
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
 
       /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bits == 128 && copy_bits == 256)
+      if (mode_bytes == 16 && copy_max == 32)
 	cur_mode = V4SImode;
 
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-      /* A single block copy is 1 load + 1 store.  */
-      nops += 2;
-      n -= mode_bits;
+      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
+      size -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - size;
+	  size = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
-     the constant size into a register.  */
-  unsigned mops_cost = 3 + 1;
-
-  /* If MOPS is available at this point we don't consider the libcall as it's
-     not a win even on code size.  At this point only consider MOPS if
-     optimizing for size.  For speed optimizations we will have chosen between
-     the two based on copy size already.  */
-  if (TARGET_MOPS)
-    {
-      if (size_p && mops_cost < nops)
-	return aarch64_expand_cpymem_mops (operands);
-      emit_insn (seq);
-      return true;
-    }
 
-  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  When MOPS is not available and we're
-     optimizing for size a libcall may be preferable.  */
-  unsigned libcall_cost = 4;
-  if (size_p && libcall_cost < nops)
-    return false;
+  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
+  int nops = ops.length();
+  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
 
-  emit_insn (seq);
+  for (int i = 0; i < nops; i += inc)
+    {
+      int m = MIN (nops, i + inc);
+      /* Emit loads.  */
+      for (int j = i; j < m; j++)
+	emit_insn (ops[j].first);
+      /* Emit stores.  */
+      for (int j = i; j < m; j++)
+	emit_insn (ops[j].second);
+    }
   return true;
 }
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 4a3af6df7e7caf1fab9483239ce41845a92e23b7..267955871591245100a3c1703845a336394241ec 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1632,7 +1632,7 @@ (define_expand "cpymemdi"
    (match_operand:DI 3 "immediate_operand")]
    ""
 {
-  if (aarch64_expand_cpymem (operands))
+  if (aarch64_expand_cpymem (operands, false))
     DONE;
   FAIL;
 }
@@ -1676,17 +1676,9 @@ (define_expand "movmemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "TARGET_MOPS"
+   ""
 {
-   rtx sz_reg = operands[2];
-   /* For constant-sized memmoves check the threshold.
-      FIXME: We should add a non-MOPS memmove expansion for smaller,
-      constant-sized memmove to avoid going to a libcall.  */
-   if (CONST_INT_P (sz_reg)
-       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
-     FAIL;
-
-  if (aarch64_expand_cpymem_mops (operands, true))
+  if (aarch64_expand_cpymem (operands, true))
     DONE;
   FAIL;
 }
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
 Constant memcpy size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memmove-size-threshold=
-Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
+Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memset-size-threshold=
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
new file mode 100644
index 0000000000000000000000000000000000000000..d2dd65b5190d6f3569f3272b8bfdbd7621f43c7d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#pragma GCC target "+nomops"
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove2.c b/gcc/testsuite/gcc.target/aarch64/memmove2.c
new file mode 100644
index 0000000000000000000000000000000000000000..4c590a32214219cb0413617f2a4fc1e552643cfe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove2.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mstrict-align" } */
+
+#pragma GCC target "+nomops"
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-times {\tb\tmemmove} 3 } } */
  
Richard Sandiford Dec. 14, 2023, 4:39 p.m. UTC | #4
Sorry, only just realised that I've never replied to this :(

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> +  rtx load[max_ops], store[max_ops];
>>
>> Please either add a comment explaining why 40 is guaranteed to be
>> enough, or (my preference) use:
>>
>>  auto_vec<std::pair<rtx, rtx>, ...> ops;
>
> I've changed to using auto_vec since that should help reduce conflicts
> with Alex' LDP changes. I double-checked maximum number of instructions,
> with a minor tweak to handle AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS
> it can now be limited to 12 if you also select -mstrict-align.
>
> v3: update after review, use auto_vec, tweak max_copy_size, add another test.
>
> Add support for inline memmove expansions.  The generated code is identical
> as for memcpy, except that all loads are emitted before stores rather than
> being interleaved.  The maximum size is 256 bytes which requires at most 16
> registers.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         * config/aarch64/aarch64.opt (aarch64_mops_memmove_size_threshold):
>         Change default.
>         * config/aarch64/aarch64.md (cpymemdi): Add a parameter.
>         (movmemdi): Call aarch64_expand_cpymem.
>         * config/aarch64/aarch64.cc (aarch64_copy_one_block): Rename function,
>         simplify, support storing generated loads/stores.
>         (aarch64_expand_cpymem): Support expansion of memmove.
>         * config/aarch64/aarch64-protos.h (aarch64_expand_cpymem): Add bool arg.
>
> gcc/testsuite/ChangeLog/
>         * gcc.target/aarch64/memmove.c: Add new test.
>         * gcc.target/aarch64/memmove.c: Likewise.

OK, thanks.  I have since added a comment:

     ??? Although it would be possible to use LDP/STP Qn in streaming mode
     (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
     whether that would improve performance.  */

which now belongs...

>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index d2718cc87b306e9673b166cc40e0af2ba72aa17b..d958b181d79440ab1b4f274cc188559edc04c628 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -769,7 +769,7 @@ bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
>  tree aarch64_vector_load_decl (tree);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
>  bool aarch64_expand_cpymem_mops (rtx *, bool);
> -bool aarch64_expand_cpymem (rtx *);
> +bool aarch64_expand_cpymem (rtx *, bool);
>  bool aarch64_expand_setmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_float_const_rtx_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 748b313092c5af452e9526a0c6747c51e598e4ca..26d1485ff6b977caeeb780dfaee739069742e638 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23058,51 +23058,41 @@ aarch64_progress_pointer (rtx pointer)
>    return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
>  }
>
> +typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
> +
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
>
>  static void
> -aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> -                                             machine_mode mode)
> +aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
> +                       int offset, machine_mode mode)
>  {
> -  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
> -     address copies using V4SImode so that we can use Q registers.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> +  /* Emit explict load/store pair instructions for 32-byte copies.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
>        mode = V4SImode;
> +      rtx src1 = adjust_address (src, mode, offset);
> +      rtx src2 = adjust_address (src, mode, offset + 16);
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      rtx dst2 = adjust_address (dst, mode, offset + 16);
>        rtx reg1 = gen_reg_rtx (mode);
>        rtx reg2 = gen_reg_rtx (mode);
> -      /* "Cast" the pointers to the correct mode.  */
> -      *src = adjust_address (*src, mode, 0);
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memcpy.  */
> -      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
> -                                       aarch64_progress_pointer (*src)));
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
> -                                        aarch64_progress_pointer (*dst), reg2));
> -      /* Move the pointers forward.  */
> -      *src = aarch64_move_pointer (*src, 32);
> -      *dst = aarch64_move_pointer (*dst, 32);
> +      rtx load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
> +      rtx store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
> +      ops.safe_push ({ load, store });
>        return;
>      }
>
>    rtx reg = gen_reg_rtx (mode);
> -
> -  /* "Cast" the pointers to the correct mode.  */
> -  *src = adjust_address (*src, mode, 0);
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memcpy.  */
> -  emit_move_insn (reg, *src);
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointers forward.  */
> -  *src = aarch64_progress_pointer (*src);
> -  *dst = aarch64_progress_pointer (*dst);
> +  rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
> +  rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
> +  ops.safe_push ({ load, store });
>  }
>
>  /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
>     from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
>     rather than memcpy.  Return true iff we succeeded.  */
>  bool
> -aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
> +aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
>  {
>    if (!TARGET_MOPS)
>      return false;
> @@ -23121,51 +23111,47 @@ aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
>    return true;
>  }
>
> -/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> -   we succeed, otherwise return false, indicating that a libcall to
> -   memcpy should be emitted.  */
> -
> +/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
> +   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
> +   if this is a memmove rather than memcpy.  Return true if we succeed,
> +   otherwise return false, indicating that a libcall should be emitted.  */
>  bool
> -aarch64_expand_cpymem (rtx *operands)
> +aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>  {
> -  int mode_bits;
> +  int mode_bytes;
>    rtx dst = operands[0];
>    rtx src = operands[1];
>    unsigned align = UINTVAL (operands[3]);
>    rtx base;
> -  machine_mode cur_mode = BLKmode;
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  machine_mode cur_mode = BLKmode, next_mode;
>
>    /* Variable-sized or strict-align copies may use the MOPS expansion.  */
>    if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
> -    return aarch64_expand_cpymem_mops (operands);
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
>
>    unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
> +  bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
> +                                  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);

...here.

Apologies again for the delay.

Richard

> +
> +  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
> +  unsigned max_copy_size = use_ldpq ? 256 : 128;
> +  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
> +                                      : aarch64_mops_memcpy_size_threshold;
>
> -  /* Try to inline up to 256 bytes.  */
> -  unsigned max_copy_size = 256;
> -  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_copy_size /= 4;
>
>    /* Large copies use MOPS when available or a library call.  */
>    if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
> -    return aarch64_expand_cpymem_mops (operands);
> -
> -  int copy_bits = 256;
> -
> -  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
> -     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
> -  if (size <= 24
> -      || !TARGET_SIMD
> -      || (aarch64_tune_params.extra_tuning_flags
> -         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> -    copy_bits = 128;
> -
> -  /* Emit an inline load+store sequence and count the number of operations
> -     involved.  We use a simple count of just the loads and stores emitted
> -     rather than rtx_insn count as all the pointer adjustments and reg copying
> -     in this function will get optimized away later in the pipeline.  */
> -  start_sequence ();
> -  unsigned nops = 0;
> +    return aarch64_expand_cpymem_mops (operands, is_memmove);
> +
> +  unsigned copy_max = 32;
> +
> +  /* Default to 32-byte LDP/STP on large copies, however small copies, no SIMD
> +     support or slow LDP/STP fall back to 16-byte chunks.  */
> +  if (size <= 24 || !use_ldpq)
> +    copy_max = 16;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -23173,69 +23159,57 @@ aarch64_expand_cpymem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>
> -  /* Convert size to bits to make the rest of the code simpler.  */
> -  int n = size * BITS_PER_UNIT;
> +  copy_ops ops;
>
> -  while (n > 0)
> +  int offset = 0;
> +
> +  while (size > 0)
>      {
>        /* Find the largest mode in which to do the copy in without over reading
>          or writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
>           cur_mode = mode_iter.require ();
>
>        gcc_assert (cur_mode != BLKmode);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
>
>        /* Prefer Q-register accesses for the last bytes.  */
> -      if (mode_bits == 128 && copy_bits == 256)
> +      if (mode_bytes == 16 && copy_max == 32)
>         cur_mode = V4SImode;
>
> -      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
> -      /* A single block copy is 1 load + 1 store.  */
> -      nops += 2;
> -      n -= mode_bits;
> +      aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
> +      size -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing copies using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - size;
> +         size = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
> -     the constant size into a register.  */
> -  unsigned mops_cost = 3 + 1;
> -
> -  /* If MOPS is available at this point we don't consider the libcall as it's
> -     not a win even on code size.  At this point only consider MOPS if
> -     optimizing for size.  For speed optimizations we will have chosen between
> -     the two based on copy size already.  */
> -  if (TARGET_MOPS)
> -    {
> -      if (size_p && mops_cost < nops)
> -       return aarch64_expand_cpymem_mops (operands);
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
> -     arguments + 1 for the call.  When MOPS is not available and we're
> -     optimizing for size a libcall may be preferable.  */
> -  unsigned libcall_cost = 4;
> -  if (size_p && libcall_cost < nops)
> -    return false;
> +  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
> +  int nops = ops.length();
> +  int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
>
> -  emit_insn (seq);
> +  for (int i = 0; i < nops; i += inc)
> +    {
> +      int m = MIN (nops, i + inc);
> +      /* Emit loads.  */
> +      for (int j = i; j < m; j++)
> +       emit_insn (ops[j].first);
> +      /* Emit stores.  */
> +      for (int j = i; j < m; j++)
> +       emit_insn (ops[j].second);
> +    }
>    return true;
>  }
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 4a3af6df7e7caf1fab9483239ce41845a92e23b7..267955871591245100a3c1703845a336394241ec 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1632,7 +1632,7 @@ (define_expand "cpymemdi"
>     (match_operand:DI 3 "immediate_operand")]
>     ""
>  {
> -  if (aarch64_expand_cpymem (operands))
> +  if (aarch64_expand_cpymem (operands, false))
>      DONE;
>    FAIL;
>  }
> @@ -1676,17 +1676,9 @@ (define_expand "movmemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "TARGET_MOPS"
> +   ""
>  {
> -   rtx sz_reg = operands[2];
> -   /* For constant-sized memmoves check the threshold.
> -      FIXME: We should add a non-MOPS memmove expansion for smaller,
> -      constant-sized memmove to avoid going to a libcall.  */
> -   if (CONST_INT_P (sz_reg)
> -       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
> -     FAIL;
> -
> -  if (aarch64_expand_cpymem_mops (operands, true))
> +  if (aarch64_expand_cpymem (operands, true))
>      DONE;
>    FAIL;
>  }
> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
> --- a/gcc/config/aarch64/aarch64.opt
> +++ b/gcc/config/aarch64/aarch64.opt
> @@ -327,7 +327,7 @@ Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
>  Constant memcpy size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memmove-size-threshold=
> -Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
> +Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
>  Constant memmove size in bytes above which to start using MOPS sequence.
>
>  -param=aarch64-mops-memset-size-threshold=
> diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d2dd65b5190d6f3569f3272b8bfdbd7621f43c7d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#pragma GCC target "+nomops"
> +
> +void
> +copy1 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 12);
> +}
> +
> +void
> +copy2 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 128);
> +}
> +
> +void
> +copy3 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 255);
> +}
> +
> +/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/memmove2.c b/gcc/testsuite/gcc.target/aarch64/memmove2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4c590a32214219cb0413617f2a4fc1e552643cfe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memmove2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mstrict-align" } */
> +
> +#pragma GCC target "+nomops"
> +
> +void
> +copy1 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 12);
> +}
> +
> +void
> +copy2 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 128);
> +}
> +
> +void
> +copy3 (int *x, int *y)
> +{
> +  __builtin_memmove (x, y, 255);
> +}
> +
> +/* { dg-final { scan-assembler-times {\tb\tmemmove} 3 } } */
  

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..0d39622bd2826a3fde54d67b5c5da9ee9286cbbd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -769,7 +769,7 @@  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 tree aarch64_vector_load_decl (tree);
 void aarch64_expand_call (rtx, rtx, rtx, bool);
 bool aarch64_expand_cpymem_mops (rtx *, bool);
-bool aarch64_expand_cpymem (rtx *);
+bool aarch64_expand_cpymem (rtx *, bool);
 bool aarch64_expand_setmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2fa5d09de85d385c1165e399bcc97681ef170916..e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25238,52 +25238,37 @@  aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of mode MODE from SRC to DST at offset OFFSET.  */
 
 static void
-aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
-					      machine_mode mode)
+aarch64_copy_one_block (rtx *load, rtx *store, rtx src, rtx dst,
+			int offset, machine_mode mode)
 {
-  /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
-     address copies using V4SImode so that we can use Q registers.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
+  /* Emit explicit load/store pair instructions for 32-byte copies.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
       mode = V4SImode;
+      rtx src1 = adjust_address (src, mode, offset);
+      rtx src2 = adjust_address (src, mode, offset + 16);
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
       rtx reg1 = gen_reg_rtx (mode);
       rtx reg2 = gen_reg_rtx (mode);
-      /* "Cast" the pointers to the correct mode.  */
-      *src = adjust_address (*src, mode, 0);
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memcpy.  */
-      emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
-					aarch64_progress_pointer (*src)));
-      emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
-					 aarch64_progress_pointer (*dst), reg2));
-      /* Move the pointers forward.  */
-      *src = aarch64_move_pointer (*src, 32);
-      *dst = aarch64_move_pointer (*dst, 32);
+      *load = aarch64_gen_load_pair (mode, reg1, src1, reg2, src2);
+      *store = aarch64_gen_store_pair (mode, dst1, reg1, dst2, reg2);
       return;
     }
 
   rtx reg = gen_reg_rtx (mode);
-
-  /* "Cast" the pointers to the correct mode.  */
-  *src = adjust_address (*src, mode, 0);
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memcpy.  */
-  emit_move_insn (reg, *src);
-  emit_move_insn (*dst, reg);
-  /* Move the pointers forward.  */
-  *src = aarch64_progress_pointer (*src);
-  *dst = aarch64_progress_pointer (*dst);
+  *load = gen_move_insn (reg, adjust_address (src, mode, offset));
+  *store = gen_move_insn (adjust_address (dst, mode, offset), reg);
 }
 
 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
    rather than memcpy.  Return true iff we succeeded.  */
 bool
-aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
+aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
 {
   if (!TARGET_MOPS)
     return false;
@@ -25302,51 +25287,48 @@  aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
   return true;
 }
 
-/* Expand cpymem, as if from a __builtin_memcpy.  Return true if
-   we succeed, otherwise return false, indicating that a libcall to
-   memcpy should be emitted.  */
-
+/* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
+   OPERANDS are taken from the cpymem/movmem pattern.  IS_MEMMOVE is true
+   if this is a memmove rather than memcpy.  Return true if we succeed,
+   otherwise return false, indicating that a libcall should be emitted.  */
 bool
-aarch64_expand_cpymem (rtx *operands)
+aarch64_expand_cpymem (rtx *operands, bool is_memmove)
 {
-  int mode_bits;
+  int mode_bytes;
   rtx dst = operands[0];
   rtx src = operands[1];
   unsigned align = UINTVAL (operands[3]);
   rtx base;
-  machine_mode cur_mode = BLKmode;
-  bool size_p = optimize_function_for_size_p (cfun);
+  machine_mode cur_mode = BLKmode, next_mode;
 
   /* Variable-sized or strict-align copies may use the MOPS expansion.  */
   if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
   unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  /* Try to inline up to 256 bytes.  */
-  unsigned max_copy_size = 256;
-  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
+  /* Set inline limits for memmove/memcpy.  MOPS has a separate threshold.  */
+  unsigned max_copy_size = TARGET_SIMD ? 256 : 128;
+  unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
+				       : aarch64_mops_memcpy_size_threshold;
+
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_copy_size /= 4;
 
   /* Large copies use MOPS when available or a library call.  */
   if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
-    return aarch64_expand_cpymem_mops (operands);
+    return aarch64_expand_cpymem_mops (operands, is_memmove);
 
-  int copy_bits = 256;
+  unsigned copy_max = 32;
 
-  /* Default to 256-bit LDP/STP on large copies, however small copies, no SIMD
-     support or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
+  /* Default to 32-byte LDP/STP on large copies; however, small copies, lack
+     of SIMD support, or slow LDP/STP fall back to 16-byte chunks.  */
   if (size <= 24
       || !TARGET_SIMD
       || (aarch64_tune_params.extra_tuning_flags
 	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
-    copy_bits = 128;
-
-  /* Emit an inline load+store sequence and count the number of operations
-     involved.  We use a simple count of just the loads and stores emitted
-     rather than rtx_insn count as all the pointer adjustments and reg copying
-     in this function will get optimized away later in the pipeline.  */
-  start_sequence ();
-  unsigned nops = 0;
+    copy_max = 16;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -25354,69 +25336,60 @@  aarch64_expand_cpymem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
-  /* Convert size to bits to make the rest of the code simpler.  */
-  int n = size * BITS_PER_UNIT;
+  const int max_ops = 40;
+  rtx load[max_ops], store[max_ops];
 
-  while (n > 0)
+  int nops, offset;
+
+  for (nops = 0, offset = 0; size > 0; nops++)
     {
       /* Find the largest mode in which to do the copy in without over reading
 	 or writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
 	  cur_mode = mode_iter.require ();
 
-      gcc_assert (cur_mode != BLKmode);
+      gcc_assert (cur_mode != BLKmode && nops < max_ops);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
 
       /* Prefer Q-register accesses for the last bytes.  */
-      if (mode_bits == 128 && copy_bits == 256)
+      if (mode_bytes == 16 && copy_max == 32)
 	cur_mode = V4SImode;
 
-      aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
-      /* A single block copy is 1 load + 1 store.  */
-      nops += 2;
-      n -= mode_bits;
+      aarch64_copy_one_block (&load[nops], &store[nops], src, dst, offset, cur_mode);
+      size -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing copies using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - size;
+	  size = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
-     the constant size into a register.  */
-  unsigned mops_cost = 3 + 1;
-
-  /* If MOPS is available at this point we don't consider the libcall as it's
-     not a win even on code size.  At this point only consider MOPS if
-     optimizing for size.  For speed optimizations we will have chosen between
-     the two based on copy size already.  */
-  if (TARGET_MOPS)
-    {
-      if (size_p && mops_cost < nops)
-	return aarch64_expand_cpymem_mops (operands);
-      emit_insn (seq);
-      return true;
-    }
 
-  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
-     arguments + 1 for the call.  When MOPS is not available and we're
-     optimizing for size a libcall may be preferable.  */
-  unsigned libcall_cost = 4;
-  if (size_p && libcall_cost < nops)
-    return false;
+  /* Memcpy interleaves loads with stores, memmove emits all loads first.  */
+  int i, j, m, inc;
+  inc = is_memmove ? nops : 3;
+  if (nops == inc + 1)
+    inc = nops / 2;
+  for (i = 0; i < nops; i += inc)
+    {
+      m = inc;
+      if (i + m > nops)
+	m = nops - i;
 
-  emit_insn (seq);
+      for (j = 0; j < m; j++)
+	emit_insn (load[i + j]);
+      for (j = 0; j < m; j++)
+	emit_insn (store[i + j]);
+    }
   return true;
 }
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 1cb3a01d6791a48dc0b08df5783d97805448c7f2..18dd629c2456041b1185eae6d39de074709b2a39 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1629,7 +1629,7 @@  (define_expand "cpymemdi"
    (match_operand:DI 3 "immediate_operand")]
    ""
 {
-  if (aarch64_expand_cpymem (operands))
+  if (aarch64_expand_cpymem (operands, false))
     DONE;
   FAIL;
 }
@@ -1673,17 +1673,9 @@  (define_expand "movmemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "TARGET_MOPS"
+   ""
 {
-   rtx sz_reg = operands[2];
-   /* For constant-sized memmoves check the threshold.
-      FIXME: We should add a non-MOPS memmove expansion for smaller,
-      constant-sized memmove to avoid going to a libcall.  */
-   if (CONST_INT_P (sz_reg)
-       && INTVAL (sz_reg) < aarch64_mops_memmove_size_threshold)
-     FAIL;
-
-  if (aarch64_expand_cpymem_mops (operands, true))
+  if (aarch64_expand_cpymem (operands, true))
     DONE;
   FAIL;
 }
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a157b5b5bc2b2aa14ac1177fded7d66..0ac9d8c578d706e7bf0f0ae399d84544f0c619dc 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -327,7 +327,7 @@  Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
 Constant memcpy size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memmove-size-threshold=
-Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(0) Param
+Target Joined UInteger Var(aarch64_mops_memmove_size_threshold) Init(256) Param
 Constant memmove size in bytes above which to start using MOPS sequence.
 
 -param=aarch64-mops-memset-size-threshold=
diff --git a/gcc/testsuite/gcc.target/aarch64/memmove.c b/gcc/testsuite/gcc.target/aarch64/memmove.c
new file mode 100644
index 0000000000000000000000000000000000000000..6926a97761eb2578d3f1db7e6eb19dba17b888be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memmove.c
@@ -0,0 +1,22 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void
+copy1 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 12);
+}
+
+void
+copy2 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 128);
+}
+
+void
+copy3 (int *x, int *y)
+{
+  __builtin_memmove (x, y, 255);
+}
+
+/* { dg-final { scan-assembler-not {\tb\tmemmove} } } */