[v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

Message ID PAWPR08MB8982FFA5F943B02C53858EAD83F8A@PAWPR08MB8982.eurprd08.prod.outlook.com
State Accepted
Series [v2] AArch64: Fix strict-align cpymem/setmem [PR103100]

Checks

Context: snail/gcc-patch-check | Check: success | Description: Github commit url

Commit Message

Wilco Dijkstra Sept. 21, 2023, 2:24 p.m. UTC
  v2: Use UINTVAL, rename max_mops_size.

The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
Block the expansion if STRICT_ALIGNMENT is set and the alignment is less than 16.
Clean up the conditions for using MOPS.
    
Passes regress/bootstrap, OK for commit?
    
gcc/ChangeLog:
        PR target/103100
        * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
        (setmemdi): Likewise.
        * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
        strict-align.  Clean up the condition for using MOPS.
        (aarch64_expand_setmem): Likewise.

---
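
As a rough illustration of the kind of code affected (a hypothetical
sketch, not the testcase from the PR): with -mstrict-align the pointers
below are only known to be 1-byte aligned, so the inline expansion must
not emit accesses wider than that alignment; after this patch such
copies are routed to MOPS or a library call instead.

/* Compile with -O2 -mstrict-align.  Hypothetical example, not from PR 103100.  */
void
copy64 (char *dst, const char *src)
{
  /* dst/src have alignment 1, so a 16-byte inline copy sequence would
     violate strict alignment; the expander now defers to MOPS/libcall.  */
  __builtin_memcpy (dst, src, 64);
}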
  

Comments

Wilco Dijkstra Oct. 16, 2023, 12:48 p.m. UTC | #1
ping
 
Wilco Dijkstra Nov. 6, 2023, 12:09 p.m. UTC | #2
ping
 
Richard Sandiford Nov. 29, 2023, 6:09 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> v2: Use UINTVAL, rename max_mops_size.
>
> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
> Clean up the condition when to use MOPS.
>
> Passes regress/bootstrap, OK for commit?
>
> gcc/ChangeLog/
>         PR target/103100
>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.
>         (setmemdi): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>         strict-align.  Cleanup condition for using MOPS.
>         (aarch64_expand_setmem): Likewise.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>    int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
> +  unsigned align = UINTVAL (operands[3]);
>    rtx base;
>    machine_mode cur_mode = BLKmode;
> +  bool size_p = optimize_function_for_size_p (cfun);
>
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);
>
> -  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
> -
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> +  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
>
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  /* Try to inline up to 256 bytes.  */
> +  unsigned max_copy_size = 256;
> +  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
>
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> +  /* Large copies use MOPS when available or a library call.  */
> +  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
>      return aarch64_expand_cpymem_mops (operands);

It feels a little unintuitive to be calling aarch64_expand_cpymem_mops
for !TARGET_MOPS, but that's pre-existing, and I can see there are
arguments both ways.

Although !TARGET_SIMD is a niche interest on current trunk, it becomes
important for streaming-compatible mode.  So we might want to look
again at the different handling of !TARGET_SIMD in this function (where
we lower the copy size but not the threshold) and aarch64_expand_setmem
(where we bail out early).  That's not something for this patch though,
just mentioning it.

The patch is OK with me, but please give Richard E a day to object.

Thanks,
Richard
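
The !TARGET_SIMD asymmetry mentioned above is roughly the following
(a paraphrased sketch, not the actual GCC source):

  /* aarch64_expand_cpymem: without SIMD the inline chunk size is
     reduced, but the 256-byte inline limit itself stays (paraphrase).  */
  int copy_bits = TARGET_SIMD ? 256 : 128;

  /* aarch64_expand_setmem: without SIMD, inline expansion is skipped
     entirely in favour of the MOPS/libcall path (as in the patch).  */
  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
      || (STRICT_ALIGNMENT && align < 16))
    return aarch64_expand_setmem_mops (operands);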

Richard Earnshaw Nov. 30, 2023, 12:03 p.m. UTC | #4
On 29/11/2023 18:09, Richard Sandiford wrote:
> The patch is OK with me, but please give Richard E a day to object.

This is fine by me.

R.


Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index dd6874d13a75f20d10a244578afc355b25c73da2..8a12894d6b80de1031d6e7d02dca680c57bce136 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25261,27 +25261,23 @@  aarch64_expand_cpymem (rtx *operands)
   int mode_bits;
   rtx dst = operands[0];
   rtx src = operands[1];
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode;
+  bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
-  if (!CONST_INT_P (operands[2]))
+  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_cpymem_mops (operands);
 
-  unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
-
-  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
-  unsigned HOST_WIDE_INT max_copy_size
-    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
+  unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
 
-  bool size_p = optimize_function_for_size_p (cfun);
+  /* Try to inline up to 256 bytes.  */
+  unsigned max_copy_size = 256;
+  unsigned mops_threshold = aarch64_mops_memcpy_size_threshold;
 
-  /* Large constant-sized cpymem should go through MOPS when possible.
-     It should be a win even for size optimization in the general case.
-     For speed optimization the choice between MOPS and the SIMD sequence
-     depends on the size of the copy, rather than number of instructions,
-     alignment etc.  */
-  if (size > max_copy_size)
+  /* Large copies use MOPS when available or a library call.  */
+  if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
     return aarch64_expand_cpymem_mops (operands);
 
   int copy_bits = 256;
@@ -25445,12 +25441,13 @@  aarch64_expand_setmem (rtx *operands)
   unsigned HOST_WIDE_INT len;
   rtx dst = operands[0];
   rtx val = operands[2], src;
+  unsigned align = UINTVAL (operands[3]);
   rtx base;
   machine_mode cur_mode = BLKmode, next_mode;
 
-  /* If we don't have SIMD registers or the size is variable use the MOPS
-     inlined sequence if possible.  */
-  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
+  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
+  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
+      || (STRICT_ALIGNMENT && align < 16))
     return aarch64_expand_setmem_mops (operands);
 
   bool size_p = optimize_function_for_size_p (cfun);
@@ -25458,10 +25455,13 @@  aarch64_expand_setmem (rtx *operands)
   /* Default the maximum to 256-bytes when considering only libcall vs
      SIMD broadcast sequence.  */
   unsigned max_set_size = 256;
+  unsigned mops_threshold = aarch64_mops_memset_size_threshold;
 
-  len = INTVAL (operands[1]);
-  if (len > max_set_size && !TARGET_MOPS)
-    return false;
+  len = UINTVAL (operands[1]);
+
+  /* Large memset uses MOPS when available or a library call.  */
+  if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
+    return aarch64_expand_setmem_mops (operands);
 
   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
   /* The MOPS sequence takes:
@@ -25474,12 +25474,6 @@  aarch64_expand_setmem (rtx *operands)
      the arguments + 1 for the call.  */
   unsigned libcall_cost = 4;
 
-  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
-     when available.  */
-  if (TARGET_MOPS
-      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
-    return aarch64_expand_setmem_mops (operands);
-
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
      against the alternatives.  A simple counter simd_ops on the
@@ -25521,10 +25515,8 @@  aarch64_expand_setmem (rtx *operands)
       simd_ops++;
       n -= mode_bits;
 
-      /* Do certain trailing copies as overlapping if it's going to be
-	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
-	 byte copy it's more efficient to do two overlapping 8 byte copies than
-	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
+      /* Emit trailing writes using overlapping unaligned accesses
+	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1627,7 +1627,7 @@  (define_expand "cpymemdi"
    (match_operand:BLK 1 "memory_operand")
    (match_operand:DI 2 "general_operand")
    (match_operand:DI 3 "immediate_operand")]
-   "!STRICT_ALIGNMENT || TARGET_MOPS"
+   ""
 {
   if (aarch64_expand_cpymem (operands))
     DONE;
@@ -1724,7 +1724,7 @@  (define_expand "setmemdi"
         (match_operand:QI  2 "nonmemory_operand")) ;; Value
    (use (match_operand:DI  1 "general_operand")) ;; Length
    (match_operand          3 "immediate_operand")] ;; Align
- "TARGET_SIMD || TARGET_MOPS"
+ ""
  {
   if (aarch64_expand_setmem (operands))
     DONE;
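
As an aside on the "overlapping unaligned accesses" comment in the
setmem hunk, a minimal C sketch of the trick (illustrative only, not
the sequence the compiler emits) looks like this:

#include <stdint.h>
#include <string.h>

/* Set 15 bytes using two overlapping 8-byte stores instead of
   8 + 4 + 2 + 1 separate stores.  Only valid when unaligned accesses
   are permitted, i.e. !STRICT_ALIGNMENT.  */
static void
set15 (unsigned char *p, unsigned char c)
{
  uint64_t v = (uint64_t) c * 0x0101010101010101ULL;  /* broadcast the byte  */
  memcpy (p, &v, 8);      /* bytes 0..7   */
  memcpy (p + 7, &v, 8);  /* bytes 7..14 (overlaps byte 7)  */
}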