AArch64: Cleanup memset expansion

Message ID PAWPR08MB898262EC1D06EA3207A4372483D4A@PAWPR08MB8982.eurprd08.prod.outlook.com
State Not Applicable
Series AArch64: Cleanup memset expansion

Checks

snail/gcc-patch-check: fail (Git am fail log)

Commit Message

Wilco Dijkstra Oct. 19, 2023, 12:51 p.m. UTC
  Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.
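
As an illustration of the byte-based walk (a minimal model in plain C, not part of the patch): pick the largest power-of-two chunk that fits within the current limit, advance a byte offset, and let the final store back up and overlap data already written rather than being split into ever smaller stores.

#include <stdio.h>

static void
emit_stores (unsigned len, unsigned set_max)
{
  unsigned offset = 0;
  while (len > 0)
    {
      unsigned bytes = set_max;
      while (bytes > len)	/* Largest chunk <= MIN (len, set_max).  */
	bytes >>= 1;
      printf ("store %u bytes at offset %u\n", bytes, offset);
      len -= bytes;
      offset += bytes;

      /* Trailing bytes: round up to the next chunk size and move the
	 offset back so a single store covers them, overlapping what was
	 already written (only valid when unaligned accesses are OK).  */
      if (len > 0 && len < set_max / 2)
	{
	  unsigned n = 1;
	  while (n < len)
	    n <<= 1;
	  offset -= n - len;
	  len = n;
	}
    }
}

int
main (void)
{
  emit_stores (35, 32);	/* 32 bytes at offset 0, then 4 bytes at offset 31.  */
  return 0;
}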

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
        * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
        (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
        (aarch64_expand_setmem): Clean up implementation, use byte offsets,
        simplify size calculation.

---
  

Comments

Wilco Dijkstra Nov. 6, 2023, 12:11 p.m. UTC | #1
ping
 
Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
        * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
        (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
        (aarch64_expand_setmem): Clean up implementation, use byte offsets,
        simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e19592553fc873b3e73f9f218 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25229,15 +25229,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
                                     next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
@@ -25393,46 +25384,22 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-                                           machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-                                        aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
+{
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
+      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
       return;
     }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25461,7 +25428,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -25474,104 +25441,70 @@ aarch64_expand_setmem (rtx *operands)
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_set_size = 96;
+
   len = UINTVAL (operands[1]);
 
   /* Large memset uses MOPS when available or a library call.  */
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-                         ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;
+
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
 
-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
          over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
           cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+       cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
         {
-         next_mode = smallest_mode_for_size (n, MODE_INT);
-         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-         gcc_assert (n_bits <= mode_bits);
-         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-         n = n_bits;
+         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+         gcc_assert (n_bytes <= mode_bytes);
+         offset -= n_bytes - len;
+         len = n_bytes;
         }
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-        call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-         && mops_cost <= libcall_cost
-         && mops_cost <= simd_ops)
-       return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-        sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-       return false;
-      emit_insn (seq);
-      return true;
-    }
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
  
Kyrylo Tkachov Nov. 10, 2023, 9:50 a.m. UTC | #2
Hi Wilco,

> -----Original Message-----
> From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
> Sent: Monday, November 6, 2023 12:12 PM
> To: GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>
> Subject: Re: [PATCH] AArch64: Cleanup memset expansion
> 
> ping
> 
> Cleanup memset implementation.  Similar to memcpy/memmove, use an
> offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
> 
> Passes regress/bootstrap, OK for commit?
> 

This looks like a good cleanup but I have a question...

> gcc/ChangeLog:
>         * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove
> function.
>         (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
>         (aarch64_expand_setmem): Clean up implementation, use byte offsets,
>         simplify size calculation.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e1959255
> 3fc873b3e73f9f218 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25229,15 +25229,6 @@ aarch64_move_pointer (rtx pointer, poly_int64
> amount)
>                                      next, amount);
>  }
> 
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE
> (pointer)));
> -}
> -
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> 
>  static void
> @@ -25393,46 +25384,22 @@ aarch64_expand_cpymem (rtx *operands,
> bool is_memmove)
>    return true;
>  }
> 
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset
> where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -                                           machine_mode mode)
> -{
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> -     the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> -                                        aarch64_progress_pointer (*dst), src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
> +{
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> +      mode = V16QImode;
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      rtx dst2 = adjust_address (dst, mode, offset + 16);
> +      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
>        return;
>      }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +    src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
> 
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -25461,7 +25428,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> @@ -25474,104 +25441,70 @@ aarch64_expand_setmem (rtx *operands)
>        || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
> 
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
>    unsigned max_set_size = 256;
>    unsigned mops_threshold = aarch64_mops_memset_size_threshold;
> 
> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_set_size = 96;
> +

.... This is a new "magic" number in this code. It looks sensible, but how did you arrive at it?
Thanks,
Kyrill


>    len = UINTVAL (operands[1]);
> 
>    /* Large memset uses MOPS when available or a library call.  */
>    if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
>      return aarch64_expand_setmem_mops (operands);
> 
> -  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
> -  /* The MOPS sequence takes:
> -     3 instructions for the memory storing
> -     + 1 to move the constant size into a reg
> -     + 1 if VAL is a non-zero constant to move into a reg
> -    (zero constants can use XZR directly).  */
> -  unsigned mops_cost = 3 + 1 + cst_val;
> -  /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> -
> -  /* Attempt a sequence with a vector broadcast followed by stores.
> -     Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> -     algorithmically-relevant operations is used rather than an rtx_insn count
> -     as all the pointer adjusmtents and mode reinterprets will be optimized
> -     away later.  */
> -  start_sequence ();
> -  unsigned simd_ops = 0;
> -
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> 
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> -  /* Convert len to bits to make the rest of the code simpler.  */
> -  n = len * BITS_PER_UNIT;
> 
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on
> the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -                         ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Set maximum amount to write in one go.  We allow 32-byte chunks
> based
> +     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning
> parameter.  */
> +  unsigned set_max = 32;
> +
> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    set_max = 16;
> 
> -  while (n > 0)
> +  int offset = 0;
> +  while (len > 0)
>      {
>        /* Find the largest mode in which to do the copy without
>           over writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
>            cur_mode = mode_iter.require ();
> 
>        gcc_assert (cur_mode != BLKmode);
> 
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bytes == 16)
> +       cur_mode = V16QImode;
> +
> +      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      len -= mode_bytes;
> +      offset += mode_bytes;
> 
>        /* Emit trailing writes using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
>          {
> -         next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT,
> MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - len;
> +         len = n_bytes;
>          }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast
> sequence,
> -        call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -         && mops_cost <= libcall_cost
> -         && mops_cost <= simd_ops)
> -       return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -        sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -       return false;
> -      emit_insn (seq);
> -      return true;
> -    }
> 
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
> -  emit_insn (seq);
>    return true;
>  }
>
  
Wilco Dijkstra Nov. 10, 2023, 10:17 a.m. UTC | #3
Hi Kyrill,

> +  /* Reduce the maximum size with -Os.  */
> +  if (optimize_function_for_size_p (cfun))
> +    max_set_size = 96;
> +

> .... This is a new "magic" number in this code. It looks sensible, but how did you arrive at it?

We need 1 instruction to create the value to store (DUP or MOVI) and 1 STP
for every 32 bytes, so the 96 means 4 instructions for typical sizes (sizes not
a multiple of 16 can add one extra instruction).
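
Concretely (an illustration of the expected shape, not dumped compiler output), a 96-byte memset at -Os would stay inline as something like:

#include <string.h>

void
clear96 (char *p, int c)
{
  /* Expected expansion with the 96-byte -Os limit:
       dup  v0.16b, w1        // 1 instruction to materialise the value
       stp  q0, q0, [x0]      // 1 STP per 32 bytes...
       stp  q0, q0, [x0, 32]
       stp  q0, q0, [x0, 64]  // ...so 3 STPs for 96 bytes
     i.e. 4 instructions, comparable to setting up and making a call.  */
  memset (p, c, 96);
}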

I checked codesize on SPECINT2017, and 96 had practically identical size.
Using 128 would also be a reasonable Os value with a very slight size increase, 
and 384 looks good for O2 - however I didn't want to tune these values as this
is a cleanup patch.

Cheers,
Wilco
  
Richard Earnshaw Nov. 10, 2023, 11:30 a.m. UTC | #4
On 10/11/2023 10:17, Wilco Dijkstra wrote:
> Hi Kyrill,
> 
>> +  /* Reduce the maximum size with -Os.  */
>> +  if (optimize_function_for_size_p (cfun))
>> +    max_set_size = 96;
>> +
> 
>> .... This is a new "magic" number in this code. It looks sensible, but how did you arrive at it?
> 
> We need 1 instruction to create the value to store (DUP or MOVI) and 1 STP
> for every 32 bytes, so the 96 means 4 instructions for typical sizes 
> (sizes not
> a multiple of 16 can add one extra instruction).
> 
> I checked codesize on SPECINT2017, and 96 had practically identical size.
> Using 128 would also be a reasonable Os value with a very slight size 
> increase,
> and 384 looks good for O2 - however I didn't want to tune these values 
> as this
> is a cleanup patch.
> 
> Cheers,
> Wilco

Shouldn't this be a param then?  Also, manifest constants in the middle 
of code are a potential nightmare, please move it to a #define (even if 
that's then used as the default value for the param).
  
Kyrylo Tkachov Nov. 10, 2023, 2:46 p.m. UTC | #5
> -----Original Message-----
> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
> Sent: Friday, November 10, 2023 11:31 AM
> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>
> Subject: Re: [PATCH] AArch64: Cleanup memset expansion
> 
> 
> 
> On 10/11/2023 10:17, Wilco Dijkstra wrote:
> > Hi Kyrill,
> >
> >> +  /* Reduce the maximum size with -Os.  */
> >> +  if (optimize_function_for_size_p (cfun))
> >> +    max_set_size = 96;
> >> +
> >
> >> .... This is a new "magic" number in this code. It looks sensible, but how
> did you arrive at it?
> >
> > We need 1 instruction to create the value to store (DUP or MOVI) and 1 STP
> > for every 32 bytes, so the 96 means 4 instructions for typical sizes
> > (sizes not
> > a multiple of 16 can add one extra instruction).

It would be useful to have that reasoning in the comment.

> >
> > I checked codesize on SPECINT2017, and 96 had practically identical size.
> > Using 128 would also be a reasonable Os value with a very slight size
> > increase,
> > and 384 looks good for O2 - however I didn't want to tune these values
> > as this
> > is a cleanup patch.
> >
> > Cheers,
> > Wilco
> 
> Shouldn't this be a param then?  Also, manifest constants in the middle
> of code are a potential nightmare, please move it to a #define (even if
> that's then used as the default value for the param).

I agree on making this a #define but I wouldn't insist on a param.
Code size IMO has a much more consistent right or wrong answer as it's statically determinable.
If this was a speed-related param then I'd expect the flexibility for the power user to override such heuristics would be more widely useful.
But for code size the compiler should always be able to get it right.

If Richard would still like the param then I'm fine with having the param, but I'd be okay with the comment above and making this a #define.
Thanks,
Kyrill
  
Richard Earnshaw Nov. 10, 2023, 3:13 p.m. UTC | #6
On 10/11/2023 14:46, Kyrylo Tkachov wrote:
> 
> 
>> -----Original Message-----
>> From: Richard Earnshaw <Richard.Earnshaw@foss.arm.com>
>> Sent: Friday, November 10, 2023 11:31 AM
>> To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>
>> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>
>> Subject: Re: [PATCH] AArch64: Cleanup memset expansion
>>
>>
>>
>> On 10/11/2023 10:17, Wilco Dijkstra wrote:
>>> Hi Kyrill,
>>>
>>>> +  /* Reduce the maximum size with -Os.  */
>>>> +  if (optimize_function_for_size_p (cfun))
>>>> +    max_set_size = 96;
>>>> +
>>>
>>>> .... This is a new "magic" number in this code. It looks sensible, but how
>> did you arrive at it?
>>>
>>> We need 1 instruction to create the value to store (DUP or MOVI) and 1 STP
>>> for every 32 bytes, so the 96 means 4 instructions for typical sizes
>>> (sizes not
>>> a multiple of 16 can add one extra instruction).
> 
> It would be useful to have that reasoning in the comment.
> 
>>>
>>> I checked codesize on SPECINT2017, and 96 had practically identical size.
>>> Using 128 would also be a reasonable Os value with a very slight size
>>> increase,
>>> and 384 looks good for O2 - however I didn't want to tune these values
>>> as this
>>> is a cleanup patch.
>>>
>>> Cheers,
>>> Wilco
>>
>> Shouldn't this be a param then?  Also, manifest constants in the middle
>> of code are a potential nightmare, please move it to a #define (even if
>> that's then used as the default value for the param).
> 
> I agree on making this a #define but I wouldn't insist on a param.
> Code size IMO has a much more consistent right or wrong answer as it's statically determinable.
> If this was a speed-related param then I'd expect the flexibility for the power user to override such heuristics would be more widely useful.
> But for code size the compiler should always be able to get it right.
> 
> If Richard would still like the param then I'm fine with having the param, but I'd be okay with the comment above and making this a #define.

I don't immediately have a feel for how sensitive code would be to the 
precise value here.  Is this value something that might affect 
individual benchmarks in different ways?  Or something where a future 
architecture might want a different value?  For either of those reasons 
a param might be useful, but if this is primarily a code size trade off 
and the variation in performance is small, then it's probably not 
worthwhile having an additional hook.

R.
  
Wilco Dijkstra Nov. 14, 2023, 4:23 p.m. UTC | #7
Hi,

>>> I checked codesize on SPECINT2017, and 96 had practically identical size.
>>> Using 128 would also be a reasonable Os value with a very slight size
>>> increase,
>>> and 384 looks good for O2 - however I didn't want to tune these values
>>> as this
>>> is a cleanup patch.
>>>
>>> Cheers,
>>> Wilco
>>
>> Shouldn't this be a param then?  Also, manifest constants in the middle
>> of code are a potential nightmare, please move it to a #define (even if
>> that's then used as the default value for the param).
> 
> I agree on making this a #define but I wouldn't insist on a param.
> Code size IMO has a much more consistent right or wrong answer as it's statically determinable.
> If this was a speed-related param then I'd expect the flexibility for the power user to override such heuristics would be more widely useful.
> But for code size the compiler should always be able to get it right.
> 
> If Richard would still like the param then I'm fine with having the param, but I'd be okay with the comment above and making this a #define.

> I don't immediately have a feel for how sensitive code would be to the 
> precise value here.  Is this value something that might affect 
> individual benchmarks in different ways?  Or something where a future 
> architecture might want a different value?  For either of those reasons 
> a param might be useful, but if this is primarily a code size trade off 
> and the variation in performance is small, then it's probably not 
> worthwhile having an additional hook.

These are just settings that are good for -Os and -O2. I might tune them once
every 5 years or so, but that's all that is needed. I don't believe there is any
value in giving users too many unnecessary options. Adding the configurable
MOPS settings introduced several bugs that went unnoticed despite multiple
code reviews, so doing this creates extra testing and maintenance overheads.

Cheers,
Wilco

---
v2: Add define MAX_SET_SIZE

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
        * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
        * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
        (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
        (aarch64_expand_setmem): Clean up implementation, use byte offsets,
        simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2f0777a37acccb787200d15ae89ec186b4221748..1d98b48db43e09ecf8c4289a8cd4fc55cc2c8a26 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -998,6 +998,10 @@ typedef struct
    mode that should actually be used.  We allow pairs of registers.  */
 #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5a22b576710e29795d65ddf3face9e8587b1df88..83a18b35729ddd07a1925f53a77bc21c9ac7ca36 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25415,8 +25415,7 @@ aarch64_progress_pointer (rtx pointer)
   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
 }
 
-/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
-   MODE bytes.  */
+/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
@@ -25597,46 +25596,22 @@ aarch64_expand_cpymem (rtx *operands)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-					 aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
+      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
       return;
     }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25665,7 +25640,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -25678,11 +25653,9 @@ aarch64_expand_setmem (rtx *operands)
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
@@ -25691,91 +25664,55 @@ aarch64_expand_setmem (rtx *operands)
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;
 
-  while (n > 0)
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
+
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
 	 over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+	cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
 
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
-
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
  
Richard Earnshaw Nov. 14, 2023, 4:36 p.m. UTC | #8
On 14/11/2023 16:23, Wilco Dijkstra wrote:
> Hi,
> 
>>>> I checked codesize on SPECINT2017, and 96 had practically identical size.
>>>> Using 128 would also be a reasonable Os value with a very slight size
>>>> increase,
>>>> and 384 looks good for O2 - however I didn't want to tune these values
>>>> as this
>>>> is a cleanup patch.
>>>>
>>>> Cheers,
>>>> Wilco
>>>
>>> Shouldn't this be a param then?  Also, manifest constants in the middle
>>> of code are a potential nightmare, please move it to a #define (even if
>>> that's then used as the default value for the param).
>>
>> I agree on making this a #define but I wouldn't insist on a param.
>> Code size IMO has a much more consistent right or wrong answer as it's statically determinable.
>> If this was a speed-related param then I'd expect the flexibility for the power user to override such heuristics would be more widely useful.
>> But for code size the compiler should always be able to get it right.
>>
>> If Richard would still like the param then I'm fine with having the param, but I'd be okay with the comment above and making this a #define.
> 
>> I don't immediately have a feel for how sensitive code would be to the
>> precise value here.  Is this value something that might affect
>> individual benchmarks in different ways?  Or something where a future
>> architecture might want a different value?  For either of those reasons
>> a param might be useful, but if this is primarily a code size trade off
>> and the variation in performance is small, then it's probably not
>> worthwhile having an additional hook.
> 
> These are just settings that are good for -Os and -O2. I might tune them once
> every 5 years or so, but that's all that is needed. I don't believe there is any
> value in giving users too many unnecessary options. Adding the configurable
> MOPS settings introduced several bugs that went unnoticed despite multiple
> code reviews, so doing this creates extra testing and maintenance overheads.
> 
> Cheers,
> Wilco
> 
> ---
> v2: Add define MAX_SET_SIZE
> 
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
> 
> Passes regress/bootstrap, OK for commit?
>      
> gcc/ChangeLog:
>          * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
>          * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
>          (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
>          (aarch64_expand_setmem): Clean up implementation, use byte offsets,
>          simplify size calculation.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 2f0777a37acccb787200d15ae89ec186b4221748..1d98b48db43e09ecf8c4289a8cd4fc55cc2c8a26 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -998,6 +998,10 @@ typedef struct
>      mode that should actually be used.  We allow pairs of registers.  */
>   #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
>   
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

So it looks like this assumes we have AdvSIMD.  What about 
-mgeneral-regs-only?

R.

> +
>   /* Maximum bytes moved by a single instruction (load/store pair).  */
>   #define MOVE_MAX (UNITS_PER_WORD * 2)
>   
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5a22b576710e29795d65ddf3face9e8587b1df88..83a18b35729ddd07a1925f53a77bc21c9ac7ca36 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25415,8 +25415,7 @@ aarch64_progress_pointer (rtx pointer)
>     return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
>   }
>   
> -/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> -   MODE bytes.  */
> +/* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
>   
>   static void
>   aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> @@ -25597,46 +25596,22 @@ aarch64_expand_cpymem (rtx *operands)
>     return true;
>   }
>   
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>   static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -					    machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>   {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> -     the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> -					 aarch64_progress_pointer (*dst), src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>       {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> +      mode = V16QImode;
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      rtx dst2 = adjust_address (dst, mode, offset + 16);
> +      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
>         return;
>       }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +    src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>   }
>   
>   /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -25665,7 +25640,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>   bool
>   aarch64_expand_setmem (rtx *operands)
>   {
> -  int n, mode_bits;
> +  int mode_bytes;
>     unsigned HOST_WIDE_INT len;
>     rtx dst = operands[0];
>     rtx val = operands[2], src;
> @@ -25678,11 +25653,9 @@ aarch64_expand_setmem (rtx *operands)
>         || (STRICT_ALIGNMENT && align < 16))
>       return aarch64_expand_setmem_mops (operands);
>   
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>     /* Default the maximum to 256-bytes when considering only libcall vs
>        SIMD broadcast sequence.  */
> -  unsigned max_set_size = 256;
> +  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
>     unsigned mops_threshold = aarch64_mops_memset_size_threshold;
>   
>     len = UINTVAL (operands[1]);
> @@ -25691,91 +25664,55 @@ aarch64_expand_setmem (rtx *operands)
>     if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
>       return aarch64_expand_setmem_mops (operands);
>   
> -  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
> -  /* The MOPS sequence takes:
> -     3 instructions for the memory storing
> -     + 1 to move the constant size into a reg
> -     + 1 if VAL is a non-zero constant to move into a reg
> -    (zero constants can use XZR directly).  */
> -  unsigned mops_cost = 3 + 1 + cst_val;
> -  /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> -
> -  /* Attempt a sequence with a vector broadcast followed by stores.
> -     Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> -     algorithmically-relevant operations is used rather than an rtx_insn count
> -     as all the pointer adjusmtents and mode reinterprets will be optimized
> -     away later.  */
> -  start_sequence ();
> -  unsigned simd_ops = 0;
> -
>     base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>     dst = adjust_automodify_address (dst, VOIDmode, base, 0);
>   
>     /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>     src = expand_vector_broadcast (V16QImode, val);
>     src = force_reg (V16QImode, src);
> -  simd_ops++;
> -  /* Convert len to bits to make the rest of the code simpler.  */
> -  n = len * BITS_PER_UNIT;
>   
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -			  ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
> +     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> +  unsigned set_max = 32;
>   
> -  while (n > 0)
> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> +		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    set_max = 16;
> +
> +  int offset = 0;
> +  while (len > 0)
>       {
>         /* Find the largest mode in which to do the copy without
>   	 over writing.  */
>         opt_scalar_int_mode mode_iter;
>         FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
>   	  cur_mode = mode_iter.require ();
>   
>         gcc_assert (cur_mode != BLKmode);
>   
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bytes == 16)
> +	cur_mode = V16QImode;
> +
> +      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      len -= mode_bytes;
> +      offset += mode_bytes;
>   
>         /* Emit trailing writes using overlapping unaligned accesses
> -	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
>   	{
> -	  next_mode = smallest_mode_for_size (n, MODE_INT);
> -	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -	  gcc_assert (n_bits <= mode_bits);
> -	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -	  n = n_bits;
> +	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
> +	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +	  gcc_assert (n_bytes <= mode_bytes);
> +	  offset -= n_bytes - len;
> +	  len = n_bytes;
>   	}
>       }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
>   
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> -	 call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -	  && mops_cost <= libcall_cost
> -	  && mops_cost <= simd_ops)
> -	return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -	 sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -	return false;
> -      emit_insn (seq);
> -      return true;
> -    }
> -
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
> -  emit_insn (seq);
>     return true;
>   }
>
  
Wilco Dijkstra Nov. 14, 2023, 4:56 p.m. UTC | #9
Hi Richard,

> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)

> So it looks like this assumes we have AdvSIMD.  What about 
> -mgeneral-regs-only?

After my strictalign bugfix
(https://gcc.gnu.org/pipermail/gcc-patches/2023-November/635309.html)
aarch64_expand_setmem starts with:

  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
      || (STRICT_ALIGNMENT && align < 16))
    return aarch64_expand_setmem_mops (operands);

Generating perfect code for every STRICT_ALIGNMENT x TARGET_SIMD
x AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS x speed/size combination
would require a huge rewrite - and that's not the goal of this patch.
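
For illustration, these are the kinds of calls that hit the guard above and so never reach the SIMD broadcast sequence (a sketch, assuming typical option combinations; not taken from the patch):

#include <string.h>

/* Size is not a compile-time constant: always the MOPS/libcall path.  */
void set_var (char *p, unsigned long n) { memset (p, 0, n); }

/* Constant size: normally eligible for the inline SIMD sequence, but when
   built with -mgeneral-regs-only (no TARGET_SIMD), or with -mstrict-align
   and no 16-byte alignment guarantee on P, the same guard diverts it to
   aarch64_expand_setmem_mops instead.  */
void set_64 (char *p) { memset (p, 0, 64); }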

Cheers,
Wilco
  
Wilco Dijkstra Dec. 22, 2023, 2:25 p.m. UTC | #10
v3: rebased to latest trunk

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress & bootstrap.

gcc/ChangeLog:
	* config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
	* config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
	(aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
	(aarch64_expand_setmem): Clean up implementation, use byte offsets,
	simplify size calculation.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1178,6 +1178,10 @@ typedef struct
    mode that should actually be used.  We allow pairs of registers.  */
 #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Maximum bytes moved by a single instruction (load/store pair).  */
 #define MOVE_MAX (UNITS_PER_WORD * 2)
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
 				    next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
 
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
@@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (*dst, src, src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explict store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      emit_insn (aarch64_gen_store_pair (dst1, src, src));
       return;
     }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
@@ -26550,91 +26515,55 @@ aarch64_expand_setmem (rtx *operands)
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;
+
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
 
-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
 	 over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+	cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
  
Richard Sandiford Jan. 5, 2024, 10:53 a.m. UTC | #11
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> v3: rebased to latest trunk
>
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
>
> Passes regress & bootstrap.
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.h (MAX_SET_SIZE): New define.
>         * config/aarch64/aarch64.cc (aarch64_progress_pointer): Remove function.
>         (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
>         (aarch64_expand_setmem): Clean up implementation, use byte offsets,
>         simplify size calculation.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 3ae42be770400da96ea3d9d25d6e1b2d393d034d..dd3b7988d585277181c478cd022fd7b6285929d0 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1178,6 +1178,10 @@ typedef struct
>     mode that should actually be used.  We allow pairs of registers.  */
>  #define MAX_FIXED_MODE_SIZE GET_MODE_BITSIZE (TImode)
>
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
> +

Since this isn't (AFAIK) a standard macro, there doesn't seem to be
any need to put it in the header file.  It could just go at the head
of aarch64.cc instead.

>  /* Maximum bytes moved by a single instruction (load/store pair).  */
>  #define MOVE_MAX (UNITS_PER_WORD * 2)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f9850320f61c5ddccf47e6583d304e5f405a484f..0909b319d16b9a1587314bcfda0a8112b42a663f 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -26294,15 +26294,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
>                                     next, amount);
>  }
>
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
> -}
> -
>  typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
>
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> @@ -26457,45 +26448,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    return true;
>  }
>
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -                                           machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>  {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> -     the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (*dst, src, src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> +      mode = V16QImode;
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      emit_insn (aarch64_gen_store_pair (dst1, src, src));
>        return;
>      }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +    src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -26524,7 +26491,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> @@ -26537,11 +26504,9 @@ aarch64_expand_setmem (rtx *operands)
>        || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
> -  unsigned max_set_size = 256;
> +  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
>    unsigned mops_threshold = aarch64_mops_memset_size_threshold;
>
>    len = UINTVAL (operands[1]);
> @@ -26550,91 +26515,55 @@ aarch64_expand_setmem (rtx *operands)
>    if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
>      return aarch64_expand_setmem_mops (operands);
>
> -  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
> -  /* The MOPS sequence takes:
> -     3 instructions for the memory storing
> -     + 1 to move the constant size into a reg
> -     + 1 if VAL is a non-zero constant to move into a reg
> -    (zero constants can use XZR directly).  */
> -  unsigned mops_cost = 3 + 1 + cst_val;
> -  /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> -
> -  /* Attempt a sequence with a vector broadcast followed by stores.
> -     Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> -     algorithmically-relevant operations is used rather than an rtx_insn count
> -     as all the pointer adjusmtents and mode reinterprets will be optimized
> -     away later.  */
> -  start_sequence ();
> -  unsigned simd_ops = 0;
> -
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
>
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> -  /* Convert len to bits to make the rest of the code simpler.  */
> -  n = len * BITS_PER_UNIT;
>
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -                         ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
> +     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> +  unsigned set_max = 32;
> +
> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    set_max = 16;

I think we should take the tuning parameter into account when applying
the MAX_SET_SIZE limit for -Os.  Shouldn't it be 48 rather than 96 in
that case, i.e. the same budget of 1 MOVI plus 3 stores, but with
16-byte STRs instead of 32-byte STPs?  (Alternatively, I suppose it
would make sense to ignore the param for -Os, although we don't seem
to do that elsewhere.)

Otherwise it looks like a nice improvement, thanks.  I agree a simple
limit like this is enough for the -Os case, since it's impossible
to account accurately for the register-clobbering effect of a call.

Richard

> -  while (n > 0)
> +  int offset = 0;
> +  while (len > 0)
>      {
>        /* Find the largest mode in which to do the copy without
>          over writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
>           cur_mode = mode_iter.require ();
>
>        gcc_assert (cur_mode != BLKmode);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bytes == 16)
> +       cur_mode = V16QImode;
> +
> +      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      len -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing writes using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - len;
> +         len = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> -        call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -         && mops_cost <= libcall_cost
> -         && mops_cost <= simd_ops)
> -       return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -        sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -       return false;
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
> -  emit_insn (seq);
>    return true;
>  }
  
Wilco Dijkstra Jan. 9, 2024, 8:51 p.m. UTC | #12
Hi Richard,

>> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
>
> Since this isn't (AFAIK) a standard macro, there doesn't seem to be
> any need to put it in the header file.  It could just go at the head
> of aarch64.cc instead.

Sure, I've moved it in v4.

>> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
>> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
>> +    set_max = 16;
>
> I think we should take the tuning parameter into account when applying
> the MAX_SET_SIZE limit for -Os.  Shouldn't it be 48 rather than 96 in
> that case?  (Alternatively, I suppose it would make sense to ignore
> the param for -Os, although we don't seem to do that elsewhere.)

That tune is only used by an obsolete core. I ran the memcpy and memset
benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
There is no measurable penalty for using LDP/STP. I'm not sure why it was
ever added given it does not do anything useful. I'll post a separate patch
to remove it to reduce the maintenance overhead.

Cheers,
Wilco


Here is v4 (move MAX_SET_SIZE definition to aarch64.cc):

Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
bytes throughout.  Simplify the complex calculations when optimizing for size
by using a fixed limit.

Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
        * config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
        (aarch64_progress_pointer): Remove function.
        (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
        (aarch64_expand_setmem): Clean up implementation, use byte offsets,
        simplify size calculation.

---
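
A minimal standalone sketch of the byte-offset chunking described above
(plan_setmem is a hypothetical helper that only models the chunk/offset
selection for the !STRICT_ALIGNMENT case, assuming a 32-byte chunk limit
and power-of-two trailing stores; it is not the compiler code):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

static std::vector<std::pair<int, int>>
plan_setmem (unsigned len, unsigned set_max)
{
  static const unsigned sizes[] = { 32, 16, 8, 4, 2, 1 };
  std::vector<std::pair<int, int>> writes;  /* (offset, bytes) pairs.  */
  int offset = 0;
  while (len > 0)
    {
      /* Largest chunk that fits in the remaining length and the limit.  */
      unsigned bytes = 1;
      for (unsigned s : sizes)
        if (s <= std::min (len, set_max))
          {
            bytes = s;
            break;
          }
      writes.emplace_back (offset, (int) bytes);
      len -= bytes;
      offset += bytes;

      /* Trailing write: back up so one final, possibly overlapping store
         (rounded up to a power of two) covers the remaining bytes.  */
      if (len > 0 && len < set_max / 2)
        {
          unsigned n = 1;
          while (n < len)
            n *= 2;
          offset -= (int) (n - len);
          len = n;
        }
    }
  return writes;
}

int
main ()
{
  /* E.g. a 41-byte memset becomes a 32-byte store at offset 0 followed
     by an overlapping 16-byte store at offset 25.  */
  for (auto [off, bytes] : plan_setmem (41, 32))
    printf ("store %d bytes at offset %d\n", bytes, off);
}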

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a5a6b52730d6c5013346d128e89915883f1707ae..62f4eee429c1c5195d54604f1d341a8a5a499d89 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -101,6 +101,10 @@
 /* Defined for convenience.  */
 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 
+/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
+   and 1 MOVI/DUP (same size as a call).  */
+#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
+
 /* Flags that describe how a function shares certain architectural state
    with its callers.
 
@@ -26321,15 +26325,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
 				    next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
 
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
@@ -26484,45 +26479,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
 {
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (*dst, src, src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+  /* Emit explicit store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      emit_insn (aarch64_gen_store_pair (dst1, src, src));
       return;
     }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -26551,7 +26522,7 @@ aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -26564,11 +26535,9 @@ aarch64_expand_setmem (rtx *operands)
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
   len = UINTVAL (operands[1]);
@@ -26577,91 +26546,55 @@ aarch64_expand_setmem (rtx *operands)
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;
 
-  while (n > 0)
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
+
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
 	 over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+	cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }
  
Richard Sandiford Jan. 10, 2024, 6:13 p.m. UTC | #13
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
>>
>> Since this isn't (AFAIK) a standard macro, there doesn't seem to be
>> any need to put it in the header file.  It could just go at the head
>> of aarch64.cc instead.
>
> Sure, I've moved it in v4.
>
>>> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
>>> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
>>> +    set_max = 16;
>>
>> I think we should take the tuning parameter into account when applying
>> the MAX_SET_SIZE limit for -Os.  Shouldn't it be 48 rather than 96 in
>> that case?  (Alternatively, I suppose it would make sense to ignore
>> the param for -Os, although we don't seem to do that elsewhere.)
>
> That tune is only used by an obsolete core. I ran the memcpy and memset
> benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
> There is no measurable penalty for using LDP/STP. I'm not sure why it was
> ever added given it does not do anything useful. I'll post a separate patch
> to remove it to reduce the maintenance overhead.

Is that enough to justify removing it though?  It sounds from:

  https://gcc.gnu.org/pipermail/gcc-patches/2018-June/500017.html

like the problem was in more balanced code, rather than memory-limited
things like memset/memcpy.

But yeah, I'm not sure if the intuition was supported by numbers
in the end.  If SPEC also shows no change then we can probably drop it
(unless someone objects).

Let's leave this patch until that's resolved though, since I think as it
stands the patch does leave -Os -mtune=xgene1 worse off (bigger code).
Handling the tune in the meantime would also be OK.

BTW, just noticed, but...

>
> Cheers,
> Wilco
>
>
> Here is v4 (move MAX_SET_SIZE definition to aarch64.cc):
>
> Cleanup memset implementation.  Similar to memcpy/memmove, use an offset and
> bytes throughout.  Simplify the complex calculations when optimizing for size
> by using a fixed limit.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog:
>         * config/aarch64/aarch64.cc (MAX_SET_SIZE): New define.
>         (aarch64_progress_pointer): Remove function.
>         (aarch64_set_one_block_and_progress_pointer): Simplify and clean up.
>         (aarch64_expand_setmem): Clean up implementation, use byte offsets,
>         simplify size calculation.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index a5a6b52730d6c5013346d128e89915883f1707ae..62f4eee429c1c5195d54604f1d341a8a5a499d89 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -101,6 +101,10 @@
>  /* Defined for convenience.  */
>  #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
>
> +/* Maximum bytes set for an inline memset expansion.  With -Os use 3 STP
> +   and 1 MOVI/DUP (same size as a call).  */
> +#define MAX_SET_SIZE(speed) (speed ? 256 : 96)
> +
>  /* Flags that describe how a function shares certain architectural state
>     with its callers.
>
> @@ -26321,15 +26325,6 @@ aarch64_move_pointer (rtx pointer, poly_int64 amount)
>                                     next, amount);
>  }
>
> -/* Return a new RTX holding the result of moving POINTER forward by the
> -   size of the mode it points to.  */
> -
> -static rtx
> -aarch64_progress_pointer (rtx pointer)
> -{
> -  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
> -}
> -
>  typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
>
>  /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
> @@ -26484,45 +26479,21 @@ aarch64_expand_cpymem (rtx *operands, bool is_memmove)
>    return true;
>  }
>
> -/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
> -   SRC is a register we have created with the duplicated value to be set.  */
> +/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
>  static void
> -aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -                                           machine_mode mode)
> +aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
>  {
> -  /* If we are copying 128bits or 256bits, we can do that straight from
> -     the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (*dst, src, src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
> -  if (known_eq (GET_MODE_BITSIZE (mode), 128))
> +  /* Emit explict store pair instructions for 32-byte writes.  */
> +  if (known_eq (GET_MODE_SIZE (mode), 32))
>      {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> +      mode = V16QImode;
> +      rtx dst1 = adjust_address (dst, mode, offset);
> +      emit_insn (aarch64_gen_store_pair (dst1, src, src));
>        return;
>      }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> -
> -  /* "Cast" the *dst to the correct mode.  */
> -  *dst = adjust_address (*dst, mode, 0);
> -  /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> -  /* Move the pointer forward.  */
> -  *dst = aarch64_progress_pointer (*dst);
> +  if (known_lt (GET_MODE_SIZE (mode), 16))
> +    src = lowpart_subreg (mode, src, GET_MODE (src));
> +  emit_move_insn (adjust_address (dst, mode, offset), src);
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -26551,7 +26522,7 @@ aarch64_expand_setmem_mops (rtx *operands)
>  bool
>  aarch64_expand_setmem (rtx *operands)
>  {
> -  int n, mode_bits;
> +  int mode_bytes;
>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> @@ -26564,11 +26535,9 @@ aarch64_expand_setmem (rtx *operands)
>        || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> -
>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */

...this comment should be deleted along with the code it's describing.
Don't respin just for that though :)

Thanks,
Richard

> -  unsigned max_set_size = 256;
> +  unsigned max_set_size = MAX_SET_SIZE (optimize_function_for_speed_p (cfun));
>    unsigned mops_threshold = aarch64_mops_memset_size_threshold;
>
>    len = UINTVAL (operands[1]);
> @@ -26577,91 +26546,55 @@ aarch64_expand_setmem (rtx *operands)
>    if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
>      return aarch64_expand_setmem_mops (operands);
>
> -  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
> -  /* The MOPS sequence takes:
> -     3 instructions for the memory storing
> -     + 1 to move the constant size into a reg
> -     + 1 if VAL is a non-zero constant to move into a reg
> -    (zero constants can use XZR directly).  */
> -  unsigned mops_cost = 3 + 1 + cst_val;
> -  /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> -
> -  /* Attempt a sequence with a vector broadcast followed by stores.
> -     Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> -     algorithmically-relevant operations is used rather than an rtx_insn count
> -     as all the pointer adjusmtents and mode reinterprets will be optimized
> -     away later.  */
> -  start_sequence ();
> -  unsigned simd_ops = 0;
> -
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
>
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> -  /* Convert len to bits to make the rest of the code simpler.  */
> -  n = len * BITS_PER_UNIT;
>
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -                         ? GET_MODE_BITSIZE (TImode) : 256;
> +  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
> +     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> +  unsigned set_max = 32;
>
> -  while (n > 0)
> +  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
> +                   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    set_max = 16;
> +
> +  int offset = 0;
> +  while (len > 0)
>      {
>        /* Find the largest mode in which to do the copy without
>          over writing.  */
>        opt_scalar_int_mode mode_iter;
>        FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> -       if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
> +       if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
>           cur_mode = mode_iter.require ();
>
>        gcc_assert (cur_mode != BLKmode);
>
> -      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
> +
> +      /* Prefer Q-register accesses for the last bytes.  */
> +      if (mode_bytes == 16)
> +       cur_mode = V16QImode;
> +
> +      aarch64_set_one_block (src, dst, offset, cur_mode);
> +      len -= mode_bytes;
> +      offset += mode_bytes;
>
>        /* Emit trailing writes using overlapping unaligned accesses
> -       (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +        (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> +      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
>         {
> -         next_mode = smallest_mode_for_size (n, MODE_INT);
> -         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> -         gcc_assert (n_bits <= mode_bits);
> -         dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
> -         n = n_bits;
> +         next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
> +         int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
> +         gcc_assert (n_bytes <= mode_bytes);
> +         offset -= n_bytes - len;
> +         len = n_bytes;
>         }
>      }
> -  rtx_insn *seq = get_insns ();
> -  end_sequence ();
> -
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> -        call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -         && mops_cost <= libcall_cost
> -         && mops_cost <= simd_ops)
> -       return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -        sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -       return false;
> -      emit_insn (seq);
> -      return true;
> -    }
>
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
> -  emit_insn (seq);
>    return true;
>  }
  
Wilco Dijkstra Jan. 30, 2024, 3:51 p.m. UTC | #14
Hi Richard,

>> That tune is only used by an obsolete core. I ran the memcpy and memset
>> benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
>> There is no measurable penalty for using LDP/STP. I'm not sure why it was
>> ever added given it does not do anything useful. I'll post a separate patch
>> to remove it to reduce the maintenance overhead.

Patch: https://gcc.gnu.org/pipermail/gcc-patches/2024-January/644442.html

> Is that enough to justify removing it though?  It sounds from:
>
>  https://gcc.gnu.org/pipermail/gcc-patches/2018-June/500017.html
>
> like the problem was in more balanced code, rather than memory-limited
> things like memset/memcpy.
>
> But yeah, I'm not sure if the intuition was supported by numbers
> in the end.  If SPEC also shows no change then we can probably drop it
> (unless someone objects).

SPECINT didn't show any difference either, so LDP doesn't have a measurable
penalty. It doesn't look like the original commit was ever backed up by benchmarks...

> Let's leave this patch until that's resolved though, since I think as it
> stands the patch does leave -Os -mtune=xgene1 worse off (bigger code).
> Handling the tune in the meantime would also be OK.

Note it was incorrectly handling -Os: it should still form LDP in that case
and take advantage of a longer and faster inlined memcpy/memset instead of
calling a library function.
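
As a sketch of what that means in practice (clear96 is just an illustrative
function; the assembly shows the expected rough shape, not verified compiler
output):

/* With the fixed 96-byte -Os limit, a 96-byte clear is expected to stay
   inline as roughly

       movi    v0.16b, #0
       stp     q0, q0, [x0]
       stp     q0, q0, [x0, #32]
       stp     q0, q0, [x0, #64]

   i.e. about the same number of instructions as setting up the arguments
   and calling memset, but without clobbering the call-clobbered
   registers.  */
void
clear96 (char *p)
{
  __builtin_memset (p, 0, 96);
}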

>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */

> ...this comment should be deleted along with the code it's describing.
> Don't respin just for that though :)

I've fixed that locally.

Cheers,
Wilco
  
Richard Sandiford Feb. 1, 2024, 5:32 p.m. UTC | #15
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> That tune is only used by an obsolete core. I ran the memcpy and memset
>>> benchmarks from Optimized Routines on xgene-1 with and without LDP/STP.
>>> There is no measurable penalty for using LDP/STP. I'm not sure why it was
>>> ever added given it does not do anything useful. I'll post a separate patch
>>> to remove it to reduce the maintenance overhead.
>
> Patch: https://gcc.gnu.org/pipermail/gcc-patches/2024-January/644442.html
>
>> Is that enough to justify removing it though?  It sounds from:
>>
>>  https://gcc.gnu.org/pipermail/gcc-patches/2018-June/500017.html
>>
>> like the problem was in more balanced code, rather than memory-limited
>> things like memset/memcpy.
>>
>> But yeah, I'm not sure if the intuition was supported by numbers
>> in the end.  If SPEC also shows no change then we can probably drop it
>> (unless someone objects).
>
> SPECINT didn't show any difference either, so LDP doesn't have a measurable
> penalty. It doesn't look like the original commit was ever backed up by benchmarks...
>
>> Let's leave this patch until that's resolved though, since I think as it
>> stands the patch does leave -Os -mtune=xgene1 worse off (bigger code).
>> Handling the tune in the meantime would also be OK.
>
> Note it was incorrectly handling -Os, it should still form LDP in that case
> and take advantage of longer and faster inlined memcpy/memset instead of
> calling a library function.

Yeah.  FWIW, I'd made the same point earlier in the review.

Now we have the LDP/STP policy tuning instead, but since the block
memory routines don't adjust for that yet, it's probably easier to
handle any such adjustment as a follow-on (for anyone who wants to do it).

>>    /* Default the maximum to 256-bytes when considering only libcall vs
>>       SIMD broadcast sequence.  */
>
>> ...this comment should be deleted along with the code it's describing.
>> Don't respin just for that though :)
>
> I've fixed that locally.

Thanks.  The patch is OK for GCC 15 if there are no objections to the
AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS patch.

Richard
  

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e19e2d1de2e5b30eca672df05d9dcc1bc106ecc8..578a253d6e0e133e19592553fc873b3e73f9f218 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25229,15 +25229,6 @@  aarch64_move_pointer (rtx pointer, poly_int64 amount)
 				    next, amount);
 }
 
-/* Return a new RTX holding the result of moving POINTER forward by the
-   size of the mode it points to.  */
-
-static rtx
-aarch64_progress_pointer (rtx pointer)
-{
-  return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
-}
-
 /* Copy one block of size MODE from SRC to DST at offset OFFSET.  */
 
 static void
@@ -25393,46 +25384,22 @@  aarch64_expand_cpymem (rtx *operands, bool is_memmove)
   return true;
 }
 
-/* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
-   SRC is a register we have created with the duplicated value to be set.  */
+/* Set one block of size MODE at DST at offset OFFSET to value in SRC.  */
 static void
-aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
-{
-  /* If we are copying 128bits or 256bits, we can do that straight from
-     the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-					 aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
-  if (known_eq (GET_MODE_BITSIZE (mode), 128))
+aarch64_set_one_block (rtx src, rtx dst, int offset, machine_mode mode)
+{
+  /* Emit explicit store pair instructions for 32-byte writes.  */
+  if (known_eq (GET_MODE_SIZE (mode), 32))
     {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
+      mode = V16QImode;
+      rtx dst1 = adjust_address (dst, mode, offset);
+      rtx dst2 = adjust_address (dst, mode, offset + 16);
+      emit_insn (aarch64_gen_store_pair (mode, dst1, src, dst2, src));
       return;
     }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
-
-  /* "Cast" the *dst to the correct mode.  */
-  *dst = adjust_address (*dst, mode, 0);
-  /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
-  /* Move the pointer forward.  */
-  *dst = aarch64_progress_pointer (*dst);
+  if (known_lt (GET_MODE_SIZE (mode), 16))
+    src = lowpart_subreg (mode, src, GET_MODE (src));
+  emit_move_insn (adjust_address (dst, mode, offset), src);
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -25461,7 +25428,7 @@  aarch64_expand_setmem_mops (rtx *operands)
 bool
 aarch64_expand_setmem (rtx *operands)
 {
-  int n, mode_bits;
+  int mode_bytes;
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
@@ -25474,104 +25441,70 @@  aarch64_expand_setmem (rtx *operands)
       || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
-  bool size_p = optimize_function_for_size_p (cfun);
-
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
   unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
+  /* Reduce the maximum size with -Os.  */
+  if (optimize_function_for_size_p (cfun))
+    max_set_size = 96;
+
   len = UINTVAL (operands[1]);
 
   /* Large memset uses MOPS when available or a library call.  */
   if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
     return aarch64_expand_setmem_mops (operands);
 
-  int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
-  /* The MOPS sequence takes:
-     3 instructions for the memory storing
-     + 1 to move the constant size into a reg
-     + 1 if VAL is a non-zero constant to move into a reg
-    (zero constants can use XZR directly).  */
-  unsigned mops_cost = 3 + 1 + cst_val;
-  /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
-
-  /* Attempt a sequence with a vector broadcast followed by stores.
-     Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
-     algorithmically-relevant operations is used rather than an rtx_insn count
-     as all the pointer adjusmtents and mode reinterprets will be optimized
-     away later.  */
-  start_sequence ();
-  unsigned simd_ops = 0;
-
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
 
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
-  /* Convert len to bits to make the rest of the code simpler.  */
-  n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
+  /* Set maximum amount to write in one go.  We allow 32-byte chunks based
+     on the AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
+  unsigned set_max = 32;
+
+  if (len <= 24 || (aarch64_tune_params.extra_tuning_flags
+		    & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    set_max = 16;
 
-  while (n > 0)
+  int offset = 0;
+  while (len > 0)
     {
       /* Find the largest mode in which to do the copy without
 	 over writing.  */
       opt_scalar_int_mode mode_iter;
       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
-	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
+	if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (len, set_max))
 	  cur_mode = mode_iter.require ();
 
       gcc_assert (cur_mode != BLKmode);
 
-      mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
+
+      /* Prefer Q-register accesses for the last bytes.  */
+      if (mode_bytes == 16)
+	cur_mode = V16QImode;
+
+      aarch64_set_one_block (src, dst, offset, cur_mode);
+      len -= mode_bytes;
+      offset += mode_bytes;
 
       /* Emit trailing writes using overlapping unaligned accesses
-	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+	 (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
+      if (len > 0 && len < set_max / 2 && !STRICT_ALIGNMENT)
 	{
-	  next_mode = smallest_mode_for_size (n, MODE_INT);
-	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
-	  gcc_assert (n_bits <= mode_bits);
-	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
-	  n = n_bits;
+	  next_mode = smallest_mode_for_size (len * BITS_PER_UNIT, MODE_INT);
+	  int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
+	  gcc_assert (n_bytes <= mode_bytes);
+	  offset -= n_bytes - len;
+	  len = n_bytes;
 	}
     }
-  rtx_insn *seq = get_insns ();
-  end_sequence ();
-
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
-  emit_insn (seq);
   return true;
 }