[PATCHv4,AARCH64] Fix PR target/103100 -mstrict-align and memset on not aligned buffers

Message ID 20230210035312.1630020-1-apinski@marvell.com
State Accepted
Headers
Series [PATCHv4,AARCH64] Fix PR target/103100 -mstrict-align and memset on not aligned buffers |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Andrew Pinski Feb. 10, 2023, 3:53 a.m. UTC
  The problem here is that aarch64_expand_setmem does not change the alignment
for strict alignment case.
This is version 4 of the fix, major changes from the last version is fixing
the way store pairs are handled which allows handling of storing 2 SI mode
at a time.
This also adds a testcase to show a case with -mstrict-align we can do
the store word pair stores.

OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.

	PR target/103100

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_gen_store_pair):
	Add support for SImode.
	(aarch64_set_one_block_and_progress_pointer):
	Add use_pair argument and rewrite and simplifying the
	code.
	(aarch64_can_use_pair_load_stores): New function.
	(aarch64_expand_setmem): Rewrite mode selection to
	better handle strict alignment and non ld/stp pair case.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/memset-strict-align-1.c: Update test.
	Reduce the size down to 207 and make s1 global and aligned
	to 16 bytes.
	* gcc.target/aarch64/memset-strict-align-2.c: New test.
	* gcc.target/aarch64/memset-strict-align-3.c: New test.
---
 gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
 .../aarch64/memset-strict-align-1.c           |  19 ++-
 .../aarch64/memset-strict-align-2.c           |  14 ++
 .../aarch64/memset-strict-align-3.c           |  15 ++
 4 files changed, 113 insertions(+), 71 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
  

Comments

Andrew Pinski March 3, 2023, 6:28 p.m. UTC | #1
On Thu, Feb 9, 2023 at 7:54 PM Andrew Pinski via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The problem here is that aarch64_expand_setmem does not change the alignment
> for strict alignment case.
> This is version 4 of the fix, major changes from the last version is fixing
> the way store pairs are handled which allows handling of storing 2 SI mode
> at a time.
> This also adds a testcase to show a case with -mstrict-align we can do
> the store word pair stores.
>
> OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.

Ping?

>
>         PR target/103100
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.cc (aarch64_gen_store_pair):
>         Add support for SImode.
>         (aarch64_set_one_block_and_progress_pointer):
>         Add use_pair argument and rewrite and simplifying the
>         code.
>         (aarch64_can_use_pair_load_stores): New function.
>         (aarch64_expand_setmem): Rewrite mode selection to
>         better handle strict alignment and non ld/stp pair case.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/memset-strict-align-1.c: Update test.
>         Reduce the size down to 207 and make s1 global and aligned
>         to 16 bytes.
>         * gcc.target/aarch64/memset-strict-align-2.c: New test.
>         * gcc.target/aarch64/memset-strict-align-3.c: New test.
> ---
>  gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
>  .../aarch64/memset-strict-align-1.c           |  19 ++-
>  .../aarch64/memset-strict-align-2.c           |  14 ++
>  .../aarch64/memset-strict-align-3.c           |  15 ++
>  4 files changed, 113 insertions(+), 71 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5c40b6ed22a..3eaf9bd608a 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -8850,6 +8850,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
>  {
>    switch (mode)
>      {
> +    case E_SImode:
> +      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
> +
>      case E_DImode:
>        return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
>
> @@ -24896,42 +24899,49 @@ aarch64_expand_cpymem (rtx *operands)
>     SRC is a register we have created with the duplicated value to be set.  */
>  static void
>  aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -                                           machine_mode mode)
> +                                           machine_mode mode, bool use_pairs)
>  {
> +  rtx reg = src;
>    /* If we are copying 128bits or 256bits, we can do that straight from
>       the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> -                                        aarch64_progress_pointer (*dst), src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
>    if (known_eq (GET_MODE_BITSIZE (mode), 128))
> -    {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> -      return;
> -    }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> +    mode = GET_MODE(src);
> +  else
> +    /* For copying less, we have to extract the right amount from src.  */
> +    reg = lowpart_subreg (mode, src, GET_MODE (src));
>
>    /* "Cast" the *dst to the correct mode.  */
>    *dst = adjust_address (*dst, mode, 0);
>    /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> +  if (use_pairs)
> +    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
> +                                      aarch64_progress_pointer (*dst),
> +                                      reg));
> +  else
> +    emit_move_insn (*dst, reg);
> +
>    /* Move the pointer forward.  */
>    *dst = aarch64_progress_pointer (*dst);
> +  if (use_pairs)
> +    *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Returns true if size can be used as a store/load pair.
> +   This is a helper function for aarch64_expand_setmem and others. */
> +static bool
> +aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
> +{
> +  /* For DI and SI modes, we can use store pairs.  */
> +  if (size == GET_MODE_BITSIZE (DImode)
> +      || size == GET_MODE_BITSIZE (SImode))
> +    return true;
> +  /* For TI mode, we will use store pairs only if
> +     the target wants to. */
> +  else if (size == GET_MODE_BITSIZE (TImode)
> +      && !(aarch64_tune_params.extra_tuning_flags
> +          & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    return true;
> +  return false;
>  }
>
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -24974,9 +24984,21 @@ aarch64_expand_setmem (rtx *operands)
>
>    bool size_p = optimize_function_for_size_p (cfun);
>
> -  /* Default the maximum to 256-bytes when considering only libcall vs
> -     SIMD broadcast sequence.  */
> -  unsigned max_set_size = 256;
> +  /* Maximum amount to copy in one go (without taking into account store pairs).  */
> +  int copy_limit = GET_MODE_BITSIZE (TImode);
> +
> +  /* For strict alignment case, restrict the copy limit to the compiler time
> +     known alignment of the memory.  */
> +  if (STRICT_ALIGNMENT)
> +    copy_limit = MIN (copy_limit, (int)MEM_ALIGN (dst));
> +
> +  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
> +
> +  /* The max set size is 8 instructions of the copy_limit sized stores
> +     (also taking into account using pair stores or not);
> +     for the non strict alignment, this is 256 bytes.  */
> +  unsigned max_set_size;
> +  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
>
>    len = INTVAL (operands[1]);
>    if (len > max_set_size && !TARGET_MOPS)
> @@ -24990,8 +25012,10 @@ aarch64_expand_setmem (rtx *operands)
>      (zero constants can use XZR directly).  */
>    unsigned mops_cost = 3 + 1 + cst_val;
>    /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> +     the arguments + 1 for the call.
> +     In the case of not optimizing for size the cost of doing a libcall
> +     is 17 instructions . */
> +  unsigned libcall_cost = size_p ? 4 : 17;
>
>    /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
>       when available.  */
> @@ -25001,12 +25025,12 @@ aarch64_expand_setmem (rtx *operands)
>
>    /* Attempt a sequence with a vector broadcast followed by stores.
>       Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> +     against the alternatives.  A simple counter inlined_ops on the
>       algorithmically-relevant operations is used rather than an rtx_insn count
>       as all the pointer adjusmtents and mode reinterprets will be optimized
>       away later.  */
>    start_sequence ();
> -  unsigned simd_ops = 0;
> +  unsigned inlined_ops = 0;
>
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -25014,16 +25038,10 @@ aarch64_expand_setmem (rtx *operands)
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> +  inlined_ops++;
>    /* Convert len to bits to make the rest of the code simpler.  */
>    n = len * BITS_PER_UNIT;
>
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -                         ? GET_MODE_BITSIZE (TImode) : 256;
> -
>    while (n > 0)
>      {
>        /* Find the largest mode in which to do the copy without
> @@ -25036,15 +25054,18 @@ aarch64_expand_setmem (rtx *operands)
>        gcc_assert (cur_mode != BLKmode);
>
>        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
> +      if (n < (mode_bits * 2))
> +       pairs = false;
> +      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
> +      inlined_ops++;
> +      n -= mode_bits * (pairs ? 2 : 1);
>
>        /* Do certain trailing copies as overlapping if it's going to be
>          cheaper.  i.e. less instructions to do so.  For instance doing a 15
>          byte copy it's more efficient to do two overlapping 8 byte copies than
>          8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
>         {
>           next_mode = smallest_mode_for_size (n, MODE_INT);
>           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> @@ -25056,24 +25077,17 @@ aarch64_expand_setmem (rtx *operands)
>    rtx_insn *seq = get_insns ();
>    end_sequence ();
>
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> -        call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -         && mops_cost <= libcall_cost
> -         && mops_cost <= simd_ops)
> -       return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -        sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -       return false;
> -      emit_insn (seq);
> -      return true;
> -    }
> +  /* When optimizing for size we have 3 options: the inlined sequence,
> +     call to memset or the MOPS expansion.  */
> +  if (size_p
> +      && TARGET_MOPS
> +      && mops_cost <= libcall_cost
> +      && mops_cost <= inlined_ops)
> +    return aarch64_expand_setmem_mops (operands);
> +  /* Pick a libcall if the inlined sequence is too long.  */
> +  else if (libcall_cost < inlined_ops)
> +    return false;
>
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
>    emit_insn (seq);
>    return true;
>  }
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> index 664d43aee13..63c864b25b0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> @@ -1,28 +1,27 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mstrict-align" } */
>
> -struct s { char x[255]; };
> +struct s { char x[207]; };
> +struct s s1 __attribute__((aligned(16)));
>  void foo (struct s *);
> -void bar (void) { struct s s1 = {}; foo (&s1); }
> +void bar (void) { s1 = (struct s){}; foo (&s1); }
>
> -/* memset (s1 = {}, sizeof = 255) should be expanded out
> +/* memset (s1 = {}, sizeof = 207) should be expanded out
>     such that there are no overlap stores when -mstrict-align
>     is in use.
> -   so 7 pairs of 16 bytes stores (224 bytes).
> -   1 16 byte stores
> +   so 6 pairs of 16 bytes stores (96 bytes).
>     1 8 byte store
>     1 4 byte store
>     1 2 byte store
>     1 1 byte store
>     */
>
> -/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
> -/* { dg-final { scan-assembler-times "str\tq" 1 } } */
> +/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
> +/* { dg-final { scan-assembler-times "str\tq" 0 } } */
>  /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
>  /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
>  /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
>  /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
>
> -/* Also one store pair for the frame-pointer and the LR. */
> -/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
> -
> +/* No store pair with 8 byte words is needed as foo is called with a sibcall. */
> +/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> new file mode 100644
> index 00000000000..650be86604b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mstrict-align" } */
> +
> +struct s { char x[7]; };
> +void foo (struct s *);
> +void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
> +
> +/* memset (s1 = {}, sizeof = 7) should be expanded out
> +   such that there are no overlap stores when -mstrict-align
> +   is in use. As the alignment of s1 is unknown, byte stores are needed.
> +   so 15 byte stores
> +   */
> +
> +/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> new file mode 100644
> index 00000000000..09cb9a654dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mstrict-align" } */
> +
> +int a[2*3];
> +int f(int t)
> +{
> +  __builtin_memset(a, t, 2*3*sizeof(int));
> +}
> +
> +/* memset (a, (val), sizeof = 2*3*4) should be expanded out
> +   such that there are no overlap stores when -mstrict-align
> +   is in use. As the alignment of a is 4 byte aligned (due to -Os),
> +   store word pairs are needed. so 3 stp are in use.  */
> +
> +/* { dg-final { scan-assembler-times "stp\ts" 3 } } */
> --
> 2.31.1
>
  
Andrew Pinski March 24, 2023, 12:10 p.m. UTC | #2
On Fri, Mar 3, 2023 at 10:28 AM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Thu, Feb 9, 2023 at 7:54 PM Andrew Pinski via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > The problem here is that aarch64_expand_setmem does not change the alignment
> > for strict alignment case.
> > This is version 4 of the fix, major changes from the last version is fixing
> > the way store pairs are handled which allows handling of storing 2 SI mode
> > at a time.
> > This also adds a testcase to show a case with -mstrict-align we can do
> > the store word pair stores.
> >
> > OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>
> Ping?

Ping?


>
> >
> >         PR target/103100
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64.cc (aarch64_gen_store_pair):
> >         Add support for SImode.
> >         (aarch64_set_one_block_and_progress_pointer):
> >         Add use_pair argument and rewrite and simplifying the
> >         code.
> >         (aarch64_can_use_pair_load_stores): New function.
> >         (aarch64_expand_setmem): Rewrite mode selection to
> >         better handle strict alignment and non ld/stp pair case.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/memset-strict-align-1.c: Update test.
> >         Reduce the size down to 207 and make s1 global and aligned
> >         to 16 bytes.
> >         * gcc.target/aarch64/memset-strict-align-2.c: New test.
> >         * gcc.target/aarch64/memset-strict-align-3.c: New test.
> > ---
> >  gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
> >  .../aarch64/memset-strict-align-1.c           |  19 ++-
> >  .../aarch64/memset-strict-align-2.c           |  14 ++
> >  .../aarch64/memset-strict-align-3.c           |  15 ++
> >  4 files changed, 113 insertions(+), 71 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 5c40b6ed22a..3eaf9bd608a 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -8850,6 +8850,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
> >  {
> >    switch (mode)
> >      {
> > +    case E_SImode:
> > +      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
> > +
> >      case E_DImode:
> >        return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
> >
> > @@ -24896,42 +24899,49 @@ aarch64_expand_cpymem (rtx *operands)
> >     SRC is a register we have created with the duplicated value to be set.  */
> >  static void
> >  aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> > -                                           machine_mode mode)
> > +                                           machine_mode mode, bool use_pairs)
> >  {
> > +  rtx reg = src;
> >    /* If we are copying 128bits or 256bits, we can do that straight from
> >       the SIMD register we prepared.  */
> > -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> > -    {
> > -      mode = GET_MODE (src);
> > -      /* "Cast" the *dst to the correct mode.  */
> > -      *dst = adjust_address (*dst, mode, 0);
> > -      /* Emit the memset.  */
> > -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> > -                                        aarch64_progress_pointer (*dst), src));
> > -
> > -      /* Move the pointers forward.  */
> > -      *dst = aarch64_move_pointer (*dst, 32);
> > -      return;
> > -    }
> >    if (known_eq (GET_MODE_BITSIZE (mode), 128))
> > -    {
> > -      /* "Cast" the *dst to the correct mode.  */
> > -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> > -      /* Emit the memset.  */
> > -      emit_move_insn (*dst, src);
> > -      /* Move the pointers forward.  */
> > -      *dst = aarch64_move_pointer (*dst, 16);
> > -      return;
> > -    }
> > -  /* For copying less, we have to extract the right amount from src.  */
> > -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> > +    mode = GET_MODE(src);
> > +  else
> > +    /* For copying less, we have to extract the right amount from src.  */
> > +    reg = lowpart_subreg (mode, src, GET_MODE (src));
> >
> >    /* "Cast" the *dst to the correct mode.  */
> >    *dst = adjust_address (*dst, mode, 0);
> >    /* Emit the memset.  */
> > -  emit_move_insn (*dst, reg);
> > +  if (use_pairs)
> > +    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
> > +                                      aarch64_progress_pointer (*dst),
> > +                                      reg));
> > +  else
> > +    emit_move_insn (*dst, reg);
> > +
> >    /* Move the pointer forward.  */
> >    *dst = aarch64_progress_pointer (*dst);
> > +  if (use_pairs)
> > +    *dst = aarch64_progress_pointer (*dst);
> > +}
> > +
> > +/* Returns true if size can be used as a store/load pair.
> > +   This is a helper function for aarch64_expand_setmem and others. */
> > +static bool
> > +aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
> > +{
> > +  /* For DI and SI modes, we can use store pairs.  */
> > +  if (size == GET_MODE_BITSIZE (DImode)
> > +      || size == GET_MODE_BITSIZE (SImode))
> > +    return true;
> > +  /* For TI mode, we will use store pairs only if
> > +     the target wants to. */
> > +  else if (size == GET_MODE_BITSIZE (TImode)
> > +      && !(aarch64_tune_params.extra_tuning_flags
> > +          & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> > +    return true;
> > +  return false;
> >  }
> >
> >  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> > @@ -24974,9 +24984,21 @@ aarch64_expand_setmem (rtx *operands)
> >
> >    bool size_p = optimize_function_for_size_p (cfun);
> >
> > -  /* Default the maximum to 256-bytes when considering only libcall vs
> > -     SIMD broadcast sequence.  */
> > -  unsigned max_set_size = 256;
> > +  /* Maximum amount to copy in one go (without taking into account store pairs).  */
> > +  int copy_limit = GET_MODE_BITSIZE (TImode);
> > +
> > +  /* For strict alignment case, restrict the copy limit to the compiler time
> > +     known alignment of the memory.  */
> > +  if (STRICT_ALIGNMENT)
> > +    copy_limit = MIN (copy_limit, (int)MEM_ALIGN (dst));
> > +
> > +  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
> > +
> > +  /* The max set size is 8 instructions of the copy_limit sized stores
> > +     (also taking into account using pair stores or not);
> > +     for the non strict alignment, this is 256 bytes.  */
> > +  unsigned max_set_size;
> > +  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
> >
> >    len = INTVAL (operands[1]);
> >    if (len > max_set_size && !TARGET_MOPS)
> > @@ -24990,8 +25012,10 @@ aarch64_expand_setmem (rtx *operands)
> >      (zero constants can use XZR directly).  */
> >    unsigned mops_cost = 3 + 1 + cst_val;
> >    /* A libcall to memset in the worst case takes 3 instructions to prepare
> > -     the arguments + 1 for the call.  */
> > -  unsigned libcall_cost = 4;
> > +     the arguments + 1 for the call.
> > +     In the case of not optimizing for size the cost of doing a libcall
> > +     is 17 instructions . */
> > +  unsigned libcall_cost = size_p ? 4 : 17;
> >
> >    /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
> >       when available.  */
> > @@ -25001,12 +25025,12 @@ aarch64_expand_setmem (rtx *operands)
> >
> >    /* Attempt a sequence with a vector broadcast followed by stores.
> >       Count the number of operations involved to see if it's worth it
> > -     against the alternatives.  A simple counter simd_ops on the
> > +     against the alternatives.  A simple counter inlined_ops on the
> >       algorithmically-relevant operations is used rather than an rtx_insn count
> >       as all the pointer adjusmtents and mode reinterprets will be optimized
> >       away later.  */
> >    start_sequence ();
> > -  unsigned simd_ops = 0;
> > +  unsigned inlined_ops = 0;
> >
> >    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> >    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> > @@ -25014,16 +25038,10 @@ aarch64_expand_setmem (rtx *operands)
> >    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
> >    src = expand_vector_broadcast (V16QImode, val);
> >    src = force_reg (V16QImode, src);
> > -  simd_ops++;
> > +  inlined_ops++;
> >    /* Convert len to bits to make the rest of the code simpler.  */
> >    n = len * BITS_PER_UNIT;
> >
> > -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> > -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> > -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> > -                         & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> > -                         ? GET_MODE_BITSIZE (TImode) : 256;
> > -
> >    while (n > 0)
> >      {
> >        /* Find the largest mode in which to do the copy without
> > @@ -25036,15 +25054,18 @@ aarch64_expand_setmem (rtx *operands)
> >        gcc_assert (cur_mode != BLKmode);
> >
> >        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> > -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> > -      simd_ops++;
> > -      n -= mode_bits;
> > +      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
> > +      if (n < (mode_bits * 2))
> > +       pairs = false;
> > +      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
> > +      inlined_ops++;
> > +      n -= mode_bits * (pairs ? 2 : 1);
> >
> >        /* Do certain trailing copies as overlapping if it's going to be
> >          cheaper.  i.e. less instructions to do so.  For instance doing a 15
> >          byte copy it's more efficient to do two overlapping 8 byte copies than
> >          8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> > -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> > +      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
> >         {
> >           next_mode = smallest_mode_for_size (n, MODE_INT);
> >           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> > @@ -25056,24 +25077,17 @@ aarch64_expand_setmem (rtx *operands)
> >    rtx_insn *seq = get_insns ();
> >    end_sequence ();
> >
> > -  if (size_p)
> > -    {
> > -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> > -        call to memset or the MOPS expansion.  */
> > -      if (TARGET_MOPS
> > -         && mops_cost <= libcall_cost
> > -         && mops_cost <= simd_ops)
> > -       return aarch64_expand_setmem_mops (operands);
> > -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> > -        sequence is too long.  */
> > -      else if (libcall_cost < simd_ops)
> > -       return false;
> > -      emit_insn (seq);
> > -      return true;
> > -    }
> > +  /* When optimizing for size we have 3 options: the inlined sequence,
> > +     call to memset or the MOPS expansion.  */
> > +  if (size_p
> > +      && TARGET_MOPS
> > +      && mops_cost <= libcall_cost
> > +      && mops_cost <= inlined_ops)
> > +    return aarch64_expand_setmem_mops (operands);
> > +  /* Pick a libcall if the inlined sequence is too long.  */
> > +  else if (libcall_cost < inlined_ops)
> > +    return false;
> >
> > -  /* At this point the SIMD broadcast sequence is the best choice when
> > -     optimizing for speed.  */
> >    emit_insn (seq);
> >    return true;
> >  }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > index 664d43aee13..63c864b25b0 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > @@ -1,28 +1,27 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-O2 -mstrict-align" } */
> >
> > -struct s { char x[255]; };
> > +struct s { char x[207]; };
> > +struct s s1 __attribute__((aligned(16)));
> >  void foo (struct s *);
> > -void bar (void) { struct s s1 = {}; foo (&s1); }
> > +void bar (void) { s1 = (struct s){}; foo (&s1); }
> >
> > -/* memset (s1 = {}, sizeof = 255) should be expanded out
> > +/* memset (s1 = {}, sizeof = 207) should be expanded out
> >     such that there are no overlap stores when -mstrict-align
> >     is in use.
> > -   so 7 pairs of 16 bytes stores (224 bytes).
> > -   1 16 byte stores
> > +   so 6 pairs of 16 bytes stores (96 bytes).
> >     1 8 byte store
> >     1 4 byte store
> >     1 2 byte store
> >     1 1 byte store
> >     */
> >
> > -/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
> > -/* { dg-final { scan-assembler-times "str\tq" 1 } } */
> > +/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
> > +/* { dg-final { scan-assembler-times "str\tq" 0 } } */
> >  /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
> >
> > -/* Also one store pair for the frame-pointer and the LR. */
> > -/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
> > -
> > +/* No store pair with 8 byte words is needed as foo is called with a sibcall. */
> > +/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> > new file mode 100644
> > index 00000000000..650be86604b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mstrict-align" } */
> > +
> > +struct s { char x[7]; };
> > +void foo (struct s *);
> > +void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
> > +
> > +/* memset (s1 = {}, sizeof = 7) should be expanded out
> > +   such that there are no overlap stores when -mstrict-align
> > +   is in use. As the alignment of s1 is unknown, byte stores are needed.
> > +   so 15 byte stores
> > +   */
> > +
> > +/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> > new file mode 100644
> > index 00000000000..09cb9a654dc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -mstrict-align" } */
> > +
> > +int a[2*3];
> > +int f(int t)
> > +{
> > +  __builtin_memset(a, t, 2*3*sizeof(int));
> > +}
> > +
> > +/* memset (a, (val), sizeof = 2*3*4) should be expanded out
> > +   such that there are no overlap stores when -mstrict-align
> > +   is in use. As the alignment of a is 4 byte aligned (due to -Os),
> > +   store word pairs are needed. so 3 stp are in use.  */
> > +
> > +/* { dg-final { scan-assembler-times "stp\ts" 3 } } */
> > --
> > 2.31.1
> >
  
Richard Sandiford April 4, 2023, 5:47 p.m. UTC | #3
Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> The problem here is that aarch64_expand_setmem does not change the alignment
> for strict alignment case.
> This is version 4 of the fix, major changes from the last version is fixing
> the way store pairs are handled which allows handling of storing 2 SI mode
> at a time.
> This also adds a testcase to show a case with -mstrict-align we can do
> the store word pair stores.

Heh.  The patch seems to be getting more complicated. :-)

> OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>
> 	PR target/103100
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_gen_store_pair):
> 	Add support for SImode.
> 	(aarch64_set_one_block_and_progress_pointer):
> 	Add use_pair argument and rewrite and simplifying the
> 	code.
> 	(aarch64_can_use_pair_load_stores): New function.
> 	(aarch64_expand_setmem): Rewrite mode selection to
> 	better handle strict alignment and non ld/stp pair case.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/memset-strict-align-1.c: Update test.
> 	Reduce the size down to 207 and make s1 global and aligned
> 	to 16 bytes.
> 	* gcc.target/aarch64/memset-strict-align-2.c: New test.
> 	* gcc.target/aarch64/memset-strict-align-3.c: New test.
> ---
>  gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
>  .../aarch64/memset-strict-align-1.c           |  19 ++-
>  .../aarch64/memset-strict-align-2.c           |  14 ++
>  .../aarch64/memset-strict-align-3.c           |  15 ++
>  4 files changed, 113 insertions(+), 71 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5c40b6ed22a..3eaf9bd608a 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -8850,6 +8850,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
>  {
>    switch (mode)
>      {
> +    case E_SImode:
> +      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
> +
>      case E_DImode:
>        return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
>  
> @@ -24896,42 +24899,49 @@ aarch64_expand_cpymem (rtx *operands)
>     SRC is a register we have created with the duplicated value to be set.  */
>  static void
>  aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> -					    machine_mode mode)
> +					    machine_mode mode, bool use_pairs)

It would be good to update the comment, since this no longer matches
the aarch64_copy_one_block_and_progress_pointers interface very closely.

>  {
> +  rtx reg = src;
>    /* If we are copying 128bits or 256bits, we can do that straight from
>       the SIMD register we prepared.  */
> -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> -    {
> -      mode = GET_MODE (src);
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, mode, 0);
> -      /* Emit the memset.  */
> -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> -					 aarch64_progress_pointer (*dst), src));
> -
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 32);
> -      return;
> -    }
>    if (known_eq (GET_MODE_BITSIZE (mode), 128))
> -    {
> -      /* "Cast" the *dst to the correct mode.  */
> -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> -      /* Emit the memset.  */
> -      emit_move_insn (*dst, src);
> -      /* Move the pointers forward.  */
> -      *dst = aarch64_move_pointer (*dst, 16);
> -      return;
> -    }
> -  /* For copying less, we have to extract the right amount from src.  */
> -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> +    mode = GET_MODE(src);

Nit: space before "(".

> +  else
> +    /* For copying less, we have to extract the right amount from src.  */
> +    reg = lowpart_subreg (mode, src, GET_MODE (src));
>  
>    /* "Cast" the *dst to the correct mode.  */
>    *dst = adjust_address (*dst, mode, 0);
>    /* Emit the memset.  */
> -  emit_move_insn (*dst, reg);
> +  if (use_pairs)
> +    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
> +				       aarch64_progress_pointer (*dst),
> +				       reg));
> +  else
> +    emit_move_insn (*dst, reg);
> +
>    /* Move the pointer forward.  */
>    *dst = aarch64_progress_pointer (*dst);
> +  if (use_pairs)
> +    *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Returns true if size can be used as a store/load pair.
> +   This is a helper function for aarch64_expand_setmem and others. */
> +static bool
> +aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
> +{
> +  /* For DI and SI modes, we can use store pairs.  */
> +  if (size == GET_MODE_BITSIZE (DImode)
> +      || size == GET_MODE_BITSIZE (SImode))
> +    return true;
> +  /* For TI mode, we will use store pairs only if
> +     the target wants to. */
> +  else if (size == GET_MODE_BITSIZE (TImode)
> +      && !(aarch64_tune_params.extra_tuning_flags
> +	   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> +    return true;
> +  return false;
>  }
>  
>  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> @@ -24974,9 +24984,21 @@ aarch64_expand_setmem (rtx *operands)
>  
>    bool size_p = optimize_function_for_size_p (cfun);
>  
> -  /* Default the maximum to 256-bytes when considering only libcall vs
> -     SIMD broadcast sequence.  */
> -  unsigned max_set_size = 256;
> +  /* Maximum amount to copy in one go (without taking into account store pairs).  */
> +  int copy_limit = GET_MODE_BITSIZE (TImode);
> +
> +  /* For strict alignment case, restrict the copy limit to the compiler time
> +     known alignment of the memory.  */
> +  if (STRICT_ALIGNMENT)
> +    copy_limit = MIN (copy_limit, (int)MEM_ALIGN (dst));
> +
> +  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
> +
> +  /* The max set size is 8 instructions of the copy_limit sized stores
> +     (also taking into account using pair stores or not);
> +     for the non strict alignment, this is 256 bytes.  */
> +  unsigned max_set_size;
> +  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
>  
>    len = INTVAL (operands[1]);
>    if (len > max_set_size && !TARGET_MOPS)
> @@ -24990,8 +25012,10 @@ aarch64_expand_setmem (rtx *operands)
>      (zero constants can use XZR directly).  */
>    unsigned mops_cost = 3 + 1 + cst_val;
>    /* A libcall to memset in the worst case takes 3 instructions to prepare
> -     the arguments + 1 for the call.  */
> -  unsigned libcall_cost = 4;
> +     the arguments + 1 for the call.
> +     In the case of not optimizing for size the cost of doing a libcall
> +     is 17 instructions . */
> +  unsigned libcall_cost = size_p ? 4 : 17;

Why 17?

If possible, I think it'd better to fix the -mstrict-align correctness
problem without changing the heuristics for -mno-strict-align.  (The
previous versions seemed able to do that.)

What happens if you leave the !size_p heuristics alone?  Do you get
too many inline copies for -mstrict-align, or too few?

Thanks,
Richard

>    /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
>       when available.  */
> @@ -25001,12 +25025,12 @@ aarch64_expand_setmem (rtx *operands)
>  
>    /* Attempt a sequence with a vector broadcast followed by stores.
>       Count the number of operations involved to see if it's worth it
> -     against the alternatives.  A simple counter simd_ops on the
> +     against the alternatives.  A simple counter inlined_ops on the
>       algorithmically-relevant operations is used rather than an rtx_insn count
>       as all the pointer adjusmtents and mode reinterprets will be optimized
>       away later.  */
>    start_sequence ();
> -  unsigned simd_ops = 0;
> +  unsigned inlined_ops = 0;
>  
>    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
>    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> @@ -25014,16 +25038,10 @@ aarch64_expand_setmem (rtx *operands)
>    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
>    src = expand_vector_broadcast (V16QImode, val);
>    src = force_reg (V16QImode, src);
> -  simd_ops++;
> +  inlined_ops++;
>    /* Convert len to bits to make the rest of the code simpler.  */
>    n = len * BITS_PER_UNIT;
>  
> -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> -			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> -			  ? GET_MODE_BITSIZE (TImode) : 256;
> -
>    while (n > 0)
>      {
>        /* Find the largest mode in which to do the copy without
> @@ -25036,15 +25054,18 @@ aarch64_expand_setmem (rtx *operands)
>        gcc_assert (cur_mode != BLKmode);
>  
>        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> -      simd_ops++;
> -      n -= mode_bits;
> +      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
> +      if (n < (mode_bits * 2))
> +	pairs = false;
> +      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
> +      inlined_ops++;
> +      n -= mode_bits * (pairs ? 2 : 1);
>  
>        /* Do certain trailing copies as overlapping if it's going to be
>  	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
>  	 byte copy it's more efficient to do two overlapping 8 byte copies than
>  	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> +      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
>  	{
>  	  next_mode = smallest_mode_for_size (n, MODE_INT);
>  	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> @@ -25056,24 +25077,17 @@ aarch64_expand_setmem (rtx *operands)
>    rtx_insn *seq = get_insns ();
>    end_sequence ();
>  
> -  if (size_p)
> -    {
> -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> -	 call to memset or the MOPS expansion.  */
> -      if (TARGET_MOPS
> -	  && mops_cost <= libcall_cost
> -	  && mops_cost <= simd_ops)
> -	return aarch64_expand_setmem_mops (operands);
> -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> -	 sequence is too long.  */
> -      else if (libcall_cost < simd_ops)
> -	return false;
> -      emit_insn (seq);
> -      return true;
> -    }
> +  /* When optimizing for size we have 3 options: the inlined sequence,
> +     call to memset or the MOPS expansion.  */
> +  if (size_p
> +      && TARGET_MOPS
> +      && mops_cost <= libcall_cost
> +      && mops_cost <= inlined_ops)
> +    return aarch64_expand_setmem_mops (operands);
> +  /* Pick a libcall if the inlined sequence is too long.  */
> +  else if (libcall_cost < inlined_ops)
> +    return false;
>  
> -  /* At this point the SIMD broadcast sequence is the best choice when
> -     optimizing for speed.  */
>    emit_insn (seq);
>    return true;
>  }
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> index 664d43aee13..63c864b25b0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> @@ -1,28 +1,27 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mstrict-align" } */
>  
> -struct s { char x[255]; };
> +struct s { char x[207]; };
> +struct s s1 __attribute__((aligned(16)));
>  void foo (struct s *);
> -void bar (void) { struct s s1 = {}; foo (&s1); }
> +void bar (void) { s1 = (struct s){}; foo (&s1); }
>  
> -/* memset (s1 = {}, sizeof = 255) should be expanded out
> +/* memset (s1 = {}, sizeof = 207) should be expanded out
>     such that there are no overlap stores when -mstrict-align
>     is in use.
> -   so 7 pairs of 16 bytes stores (224 bytes).
> -   1 16 byte stores
> +   so 6 pairs of 16 bytes stores (96 bytes).
>     1 8 byte store
>     1 4 byte store
>     1 2 byte store
>     1 1 byte store
>     */
>  
> -/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
> -/* { dg-final { scan-assembler-times "str\tq" 1 } } */
> +/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
> +/* { dg-final { scan-assembler-times "str\tq" 0 } } */
>  /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
>  /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
>  /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
>  /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
>  
> -/* Also one store pair for the frame-pointer and the LR. */
> -/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
> -
> +/* No store pair with 8 byte words is needed as foo is called with a sibcall. */
> +/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> new file mode 100644
> index 00000000000..650be86604b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mstrict-align" } */
> +
> +struct s { char x[7]; };
> +void foo (struct s *);
> +void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
> +
> +/* memset (s1 = {}, sizeof = 7) should be expanded out
> +   such that there are no overlap stores when -mstrict-align
> +   is in use. As the alignment of s1 is unknown, byte stores are needed.
> +   so 15 byte stores
> +   */
> +
> +/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> new file mode 100644
> index 00000000000..09cb9a654dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os -mstrict-align" } */
> +
> +int a[2*3];
> +int f(int t)
> +{
> +  __builtin_memset(a, t, 2*3*sizeof(int));
> +}
> +
> +/* memset (a, (val), sizeof = 2*3*4) should be expanded out
> +   such that there are no overlap stores when -mstrict-align
> +   is in use. As the alignment of a is 4 byte aligned (due to -Os),
> +   store word pairs are needed. so 3 stp are in use.  */
> +
> +/* { dg-final { scan-assembler-times "stp\ts" 3 } } */
  
Andrew Pinski April 6, 2023, 11:42 p.m. UTC | #4
On Tue, Apr 4, 2023 at 10:48 AM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Andrew Pinski via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > The problem here is that aarch64_expand_setmem does not change the alignment
> > for strict alignment case.
> > This is version 4 of the fix, major changes from the last version is fixing
> > the way store pairs are handled which allows handling of storing 2 SI mode
> > at a time.
> > This also adds a testcase to show a case with -mstrict-align we can do
> > the store word pair stores.
>
> Heh.  The patch seems to be getting more complicated. :-)

Note I am no longer working on this patch. Review cycles for this
patch is just too long for me.

Thanks,
Andrew Pinski

>
> > OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions.
> >
> >       PR target/103100
> >
> > gcc/ChangeLog:
> >
> >       * config/aarch64/aarch64.cc (aarch64_gen_store_pair):
> >       Add support for SImode.
> >       (aarch64_set_one_block_and_progress_pointer):
> >       Add use_pair argument and rewrite and simplifying the
> >       code.
> >       (aarch64_can_use_pair_load_stores): New function.
> >       (aarch64_expand_setmem): Rewrite mode selection to
> >       better handle strict alignment and non ld/stp pair case.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/aarch64/memset-strict-align-1.c: Update test.
> >       Reduce the size down to 207 and make s1 global and aligned
> >       to 16 bytes.
> >       * gcc.target/aarch64/memset-strict-align-2.c: New test.
> >       * gcc.target/aarch64/memset-strict-align-3.c: New test.
> > ---
> >  gcc/config/aarch64/aarch64.cc                 | 136 ++++++++++--------
> >  .../aarch64/memset-strict-align-1.c           |  19 ++-
> >  .../aarch64/memset-strict-align-2.c           |  14 ++
> >  .../aarch64/memset-strict-align-3.c           |  15 ++
> >  4 files changed, 113 insertions(+), 71 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 5c40b6ed22a..3eaf9bd608a 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -8850,6 +8850,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
> >  {
> >    switch (mode)
> >      {
> > +    case E_SImode:
> > +      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
> > +
> >      case E_DImode:
> >        return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
> >
> > @@ -24896,42 +24899,49 @@ aarch64_expand_cpymem (rtx *operands)
> >     SRC is a register we have created with the duplicated value to be set.  */
> >  static void
> >  aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
> > -                                         machine_mode mode)
> > +                                         machine_mode mode, bool use_pairs)
>
> It would be good to update the comment, since this no longer matches
> the aarch64_copy_one_block_and_progress_pointers interface very closely.
>
> >  {
> > +  rtx reg = src;
> >    /* If we are copying 128bits or 256bits, we can do that straight from
> >       the SIMD register we prepared.  */
> > -  if (known_eq (GET_MODE_BITSIZE (mode), 256))
> > -    {
> > -      mode = GET_MODE (src);
> > -      /* "Cast" the *dst to the correct mode.  */
> > -      *dst = adjust_address (*dst, mode, 0);
> > -      /* Emit the memset.  */
> > -      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
> > -                                      aarch64_progress_pointer (*dst), src));
> > -
> > -      /* Move the pointers forward.  */
> > -      *dst = aarch64_move_pointer (*dst, 32);
> > -      return;
> > -    }
> >    if (known_eq (GET_MODE_BITSIZE (mode), 128))
> > -    {
> > -      /* "Cast" the *dst to the correct mode.  */
> > -      *dst = adjust_address (*dst, GET_MODE (src), 0);
> > -      /* Emit the memset.  */
> > -      emit_move_insn (*dst, src);
> > -      /* Move the pointers forward.  */
> > -      *dst = aarch64_move_pointer (*dst, 16);
> > -      return;
> > -    }
> > -  /* For copying less, we have to extract the right amount from src.  */
> > -  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
> > +    mode = GET_MODE(src);
>
> Nit: space before "(".
>
> > +  else
> > +    /* For copying less, we have to extract the right amount from src.  */
> > +    reg = lowpart_subreg (mode, src, GET_MODE (src));
> >
> >    /* "Cast" the *dst to the correct mode.  */
> >    *dst = adjust_address (*dst, mode, 0);
> >    /* Emit the memset.  */
> > -  emit_move_insn (*dst, reg);
> > +  if (use_pairs)
> > +    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
> > +                                    aarch64_progress_pointer (*dst),
> > +                                    reg));
> > +  else
> > +    emit_move_insn (*dst, reg);
> > +
> >    /* Move the pointer forward.  */
> >    *dst = aarch64_progress_pointer (*dst);
> > +  if (use_pairs)
> > +    *dst = aarch64_progress_pointer (*dst);
> > +}
> > +
> > +/* Returns true if size can be used as a store/load pair.
> > +   This is a helper function for aarch64_expand_setmem and others. */
> > +static bool
> > +aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
> > +{
> > +  /* For DI and SI modes, we can use store pairs.  */
> > +  if (size == GET_MODE_BITSIZE (DImode)
> > +      || size == GET_MODE_BITSIZE (SImode))
> > +    return true;
> > +  /* For TI mode, we will use store pairs only if
> > +     the target wants to. */
> > +  else if (size == GET_MODE_BITSIZE (TImode)
> > +      && !(aarch64_tune_params.extra_tuning_flags
> > +        & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
> > +    return true;
> > +  return false;
> >  }
> >
> >  /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
> > @@ -24974,9 +24984,21 @@ aarch64_expand_setmem (rtx *operands)
> >
> >    bool size_p = optimize_function_for_size_p (cfun);
> >
> > -  /* Default the maximum to 256-bytes when considering only libcall vs
> > -     SIMD broadcast sequence.  */
> > -  unsigned max_set_size = 256;
> > +  /* Maximum amount to copy in one go (without taking into account store pairs).  */
> > +  int copy_limit = GET_MODE_BITSIZE (TImode);
> > +
> > +  /* For strict alignment case, restrict the copy limit to the compiler time
> > +     known alignment of the memory.  */
> > +  if (STRICT_ALIGNMENT)
> > +    copy_limit = MIN (copy_limit, (int)MEM_ALIGN (dst));
> > +
> > +  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
> > +
> > +  /* The max set size is 8 instructions of the copy_limit sized stores
> > +     (also taking into account using pair stores or not);
> > +     for the non strict alignment, this is 256 bytes.  */
> > +  unsigned max_set_size;
> > +  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
> >
> >    len = INTVAL (operands[1]);
> >    if (len > max_set_size && !TARGET_MOPS)
> > @@ -24990,8 +25012,10 @@ aarch64_expand_setmem (rtx *operands)
> >      (zero constants can use XZR directly).  */
> >    unsigned mops_cost = 3 + 1 + cst_val;
> >    /* A libcall to memset in the worst case takes 3 instructions to prepare
> > -     the arguments + 1 for the call.  */
> > -  unsigned libcall_cost = 4;
> > +     the arguments + 1 for the call.
> > +     In the case of not optimizing for size the cost of doing a libcall
> > +     is 17 instructions . */
> > +  unsigned libcall_cost = size_p ? 4 : 17;
>
> Why 17?
>
> If possible, I think it'd better to fix the -mstrict-align correctness
> problem without changing the heuristics for -mno-strict-align.  (The
> previous versions seemed able to do that.)
>
> What happens if you leave the !size_p heuristics alone?  Do you get
> too many inline copies for -mstrict-align, or too few?
>
> Thanks,
> Richard
>
> >    /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
> >       when available.  */
> > @@ -25001,12 +25025,12 @@ aarch64_expand_setmem (rtx *operands)
> >
> >    /* Attempt a sequence with a vector broadcast followed by stores.
> >       Count the number of operations involved to see if it's worth it
> > -     against the alternatives.  A simple counter simd_ops on the
> > +     against the alternatives.  A simple counter inlined_ops on the
> >       algorithmically-relevant operations is used rather than an rtx_insn count
> >       as all the pointer adjusmtents and mode reinterprets will be optimized
> >       away later.  */
> >    start_sequence ();
> > -  unsigned simd_ops = 0;
> > +  unsigned inlined_ops = 0;
> >
> >    base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> >    dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> > @@ -25014,16 +25038,10 @@ aarch64_expand_setmem (rtx *operands)
> >    /* Prepare the val using a DUP/MOVI v0.16B, val.  */
> >    src = expand_vector_broadcast (V16QImode, val);
> >    src = force_reg (V16QImode, src);
> > -  simd_ops++;
> > +  inlined_ops++;
> >    /* Convert len to bits to make the rest of the code simpler.  */
> >    n = len * BITS_PER_UNIT;
> >
> > -  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
> > -     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
> > -  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
> > -                       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
> > -                       ? GET_MODE_BITSIZE (TImode) : 256;
> > -
> >    while (n > 0)
> >      {
> >        /* Find the largest mode in which to do the copy without
> > @@ -25036,15 +25054,18 @@ aarch64_expand_setmem (rtx *operands)
> >        gcc_assert (cur_mode != BLKmode);
> >
> >        mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
> > -      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
> > -      simd_ops++;
> > -      n -= mode_bits;
> > +      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
> > +      if (n < (mode_bits * 2))
> > +     pairs = false;
> > +      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
> > +      inlined_ops++;
> > +      n -= mode_bits * (pairs ? 2 : 1);
> >
> >        /* Do certain trailing copies as overlapping if it's going to be
> >        cheaper.  i.e. less instructions to do so.  For instance doing a 15
> >        byte copy it's more efficient to do two overlapping 8 byte copies than
> >        8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> > -      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
> > +      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
> >       {
> >         next_mode = smallest_mode_for_size (n, MODE_INT);
> >         int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> > @@ -25056,24 +25077,17 @@ aarch64_expand_setmem (rtx *operands)
> >    rtx_insn *seq = get_insns ();
> >    end_sequence ();
> >
> > -  if (size_p)
> > -    {
> > -      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
> > -      call to memset or the MOPS expansion.  */
> > -      if (TARGET_MOPS
> > -       && mops_cost <= libcall_cost
> > -       && mops_cost <= simd_ops)
> > -     return aarch64_expand_setmem_mops (operands);
> > -      /* If MOPS is not available or not shorter pick a libcall if the SIMD
> > -      sequence is too long.  */
> > -      else if (libcall_cost < simd_ops)
> > -     return false;
> > -      emit_insn (seq);
> > -      return true;
> > -    }
> > +  /* When optimizing for size we have 3 options: the inlined sequence,
> > +     call to memset or the MOPS expansion.  */
> > +  if (size_p
> > +      && TARGET_MOPS
> > +      && mops_cost <= libcall_cost
> > +      && mops_cost <= inlined_ops)
> > +    return aarch64_expand_setmem_mops (operands);
> > +  /* Pick a libcall if the inlined sequence is too long.  */
> > +  else if (libcall_cost < inlined_ops)
> > +    return false;
> >
> > -  /* At this point the SIMD broadcast sequence is the best choice when
> > -     optimizing for speed.  */
> >    emit_insn (seq);
> >    return true;
> >  }
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > index 664d43aee13..63c864b25b0 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
> > @@ -1,28 +1,27 @@
> >  /* { dg-do compile } */
> >  /* { dg-options "-O2 -mstrict-align" } */
> >
> > -struct s { char x[255]; };
> > +struct s { char x[207]; };
> > +struct s s1 __attribute__((aligned(16)));
> >  void foo (struct s *);
> > -void bar (void) { struct s s1 = {}; foo (&s1); }
> > +void bar (void) { s1 = (struct s){}; foo (&s1); }
> >
> > -/* memset (s1 = {}, sizeof = 255) should be expanded out
> > +/* memset (s1 = {}, sizeof = 207) should be expanded out
> >     such that there are no overlap stores when -mstrict-align
> >     is in use.
> > -   so 7 pairs of 16 bytes stores (224 bytes).
> > -   1 16 byte stores
> > +   so 6 pairs of 16 bytes stores (96 bytes).
> >     1 8 byte store
> >     1 4 byte store
> >     1 2 byte store
> >     1 1 byte store
> >     */
> >
> > -/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
> > -/* { dg-final { scan-assembler-times "str\tq" 1 } } */
> > +/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
> > +/* { dg-final { scan-assembler-times "str\tq" 0 } } */
> >  /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
> >  /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
> >
> > -/* Also one store pair for the frame-pointer and the LR. */
> > -/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
> > -
> > +/* No store pair with 8 byte words is needed as foo is called with a sibcall. */
> > +/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> > new file mode 100644
> > index 00000000000..650be86604b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mstrict-align" } */
> > +
> > +struct s { char x[7]; };
> > +void foo (struct s *);
> > +void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
> > +
> > +/* memset (s1 = {}, sizeof = 7) should be expanded out
> > +   such that there are no overlap stores when -mstrict-align
> > +   is in use. As the alignment of s1 is unknown, byte stores are needed.
> > +   so 15 byte stores
> > +   */
> > +
> > +/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> > new file mode 100644
> > index 00000000000..09cb9a654dc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
> > @@ -0,0 +1,15 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os -mstrict-align" } */
> > +
> > +int a[2*3];
> > +int f(int t)
> > +{
> > +  __builtin_memset(a, t, 2*3*sizeof(int));
> > +}
> > +
> > +/* memset (a, (val), sizeof = 2*3*4) should be expanded out
> > +   such that there are no overlap stores when -mstrict-align
> > +   is in use. As the alignment of a is 4 byte aligned (due to -Os),
> > +   store word pairs are needed. so 3 stp are in use.  */
> > +
> > +/* { dg-final { scan-assembler-times "stp\ts" 3 } } */
  

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5c40b6ed22a..3eaf9bd608a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -8850,6 +8850,9 @@  aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
 {
   switch (mode)
     {
+    case E_SImode:
+      return gen_store_pair_sw_sisi (mem1, reg1, mem2, reg2);
+
     case E_DImode:
       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
 
@@ -24896,42 +24899,49 @@  aarch64_expand_cpymem (rtx *operands)
    SRC is a register we have created with the duplicated value to be set.  */
 static void
 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
-					    machine_mode mode)
+					    machine_mode mode, bool use_pairs)
 {
+  rtx reg = src;
   /* If we are copying 128bits or 256bits, we can do that straight from
      the SIMD register we prepared.  */
-  if (known_eq (GET_MODE_BITSIZE (mode), 256))
-    {
-      mode = GET_MODE (src);
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, mode, 0);
-      /* Emit the memset.  */
-      emit_insn (aarch64_gen_store_pair (mode, *dst, src,
-					 aarch64_progress_pointer (*dst), src));
-
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 32);
-      return;
-    }
   if (known_eq (GET_MODE_BITSIZE (mode), 128))
-    {
-      /* "Cast" the *dst to the correct mode.  */
-      *dst = adjust_address (*dst, GET_MODE (src), 0);
-      /* Emit the memset.  */
-      emit_move_insn (*dst, src);
-      /* Move the pointers forward.  */
-      *dst = aarch64_move_pointer (*dst, 16);
-      return;
-    }
-  /* For copying less, we have to extract the right amount from src.  */
-  rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
+    mode = GET_MODE(src);
+  else
+    /* For copying less, we have to extract the right amount from src.  */
+    reg = lowpart_subreg (mode, src, GET_MODE (src));
 
   /* "Cast" the *dst to the correct mode.  */
   *dst = adjust_address (*dst, mode, 0);
   /* Emit the memset.  */
-  emit_move_insn (*dst, reg);
+  if (use_pairs)
+    emit_insn (aarch64_gen_store_pair (mode, *dst, reg,
+				       aarch64_progress_pointer (*dst),
+				       reg));
+  else
+    emit_move_insn (*dst, reg);
+
   /* Move the pointer forward.  */
   *dst = aarch64_progress_pointer (*dst);
+  if (use_pairs)
+    *dst = aarch64_progress_pointer (*dst);
+}
+
+/* Returns true if size can be used as a store/load pair.
+   This is a helper function for aarch64_expand_setmem and others. */
+static bool
+aarch64_can_use_pair_load_stores (unsigned HOST_WIDE_INT size)
+{
+  /* For DI and SI modes, we can use store pairs.  */
+  if (size == GET_MODE_BITSIZE (DImode)
+      || size == GET_MODE_BITSIZE (SImode))
+    return true;
+  /* For TI mode, we will use store pairs only if
+     the target wants to. */
+  else if (size == GET_MODE_BITSIZE (TImode)
+      && !(aarch64_tune_params.extra_tuning_flags
+	   & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
+    return true;
+  return false;
 }
 
 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
@@ -24974,9 +24984,21 @@  aarch64_expand_setmem (rtx *operands)
 
   bool size_p = optimize_function_for_size_p (cfun);
 
-  /* Default the maximum to 256-bytes when considering only libcall vs
-     SIMD broadcast sequence.  */
-  unsigned max_set_size = 256;
+  /* Maximum amount to copy in one go (without taking into account store pairs).  */
+  int copy_limit = GET_MODE_BITSIZE (TImode);
+
+  /* For strict alignment case, restrict the copy limit to the compiler time
+     known alignment of the memory.  */
+  if (STRICT_ALIGNMENT)
+    copy_limit = MIN (copy_limit, (int)MEM_ALIGN (dst));
+
+  bool use_pairs = aarch64_can_use_pair_load_stores (copy_limit);
+
+  /* The max set size is 8 instructions of the copy_limit sized stores
+     (also taking into account using pair stores or not);
+     for the non strict alignment, this is 256 bytes.  */
+  unsigned max_set_size;
+  max_set_size = (copy_limit * (use_pairs ? 2 : 1) * 8) / BITS_PER_UNIT;
 
   len = INTVAL (operands[1]);
   if (len > max_set_size && !TARGET_MOPS)
@@ -24990,8 +25012,10 @@  aarch64_expand_setmem (rtx *operands)
     (zero constants can use XZR directly).  */
   unsigned mops_cost = 3 + 1 + cst_val;
   /* A libcall to memset in the worst case takes 3 instructions to prepare
-     the arguments + 1 for the call.  */
-  unsigned libcall_cost = 4;
+     the arguments + 1 for the call.
+     In the case of not optimizing for size the cost of doing a libcall
+     is 17 instructions . */
+  unsigned libcall_cost = size_p ? 4 : 17;
 
   /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
      when available.  */
@@ -25001,12 +25025,12 @@  aarch64_expand_setmem (rtx *operands)
 
   /* Attempt a sequence with a vector broadcast followed by stores.
      Count the number of operations involved to see if it's worth it
-     against the alternatives.  A simple counter simd_ops on the
+     against the alternatives.  A simple counter inlined_ops on the
      algorithmically-relevant operations is used rather than an rtx_insn count
      as all the pointer adjusmtents and mode reinterprets will be optimized
      away later.  */
   start_sequence ();
-  unsigned simd_ops = 0;
+  unsigned inlined_ops = 0;
 
   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
@@ -25014,16 +25038,10 @@  aarch64_expand_setmem (rtx *operands)
   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
   src = expand_vector_broadcast (V16QImode, val);
   src = force_reg (V16QImode, src);
-  simd_ops++;
+  inlined_ops++;
   /* Convert len to bits to make the rest of the code simpler.  */
   n = len * BITS_PER_UNIT;
 
-  /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
-     AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
-  const int copy_limit = (aarch64_tune_params.extra_tuning_flags
-			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
-			  ? GET_MODE_BITSIZE (TImode) : 256;
-
   while (n > 0)
     {
       /* Find the largest mode in which to do the copy without
@@ -25036,15 +25054,18 @@  aarch64_expand_setmem (rtx *operands)
       gcc_assert (cur_mode != BLKmode);
 
       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
-      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
-      simd_ops++;
-      n -= mode_bits;
+      bool pairs = aarch64_can_use_pair_load_stores (mode_bits);
+      if (n < (mode_bits * 2))
+	pairs = false;
+      aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode, pairs);
+      inlined_ops++;
+      n -= mode_bits * (pairs ? 2 : 1);
 
       /* Do certain trailing copies as overlapping if it's going to be
 	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
 	 byte copy it's more efficient to do two overlapping 8 byte copies than
 	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
-      if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
+      if (n > 0 && n < copy_limit && !STRICT_ALIGNMENT)
 	{
 	  next_mode = smallest_mode_for_size (n, MODE_INT);
 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
@@ -25056,24 +25077,17 @@  aarch64_expand_setmem (rtx *operands)
   rtx_insn *seq = get_insns ();
   end_sequence ();
 
-  if (size_p)
-    {
-      /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
-	 call to memset or the MOPS expansion.  */
-      if (TARGET_MOPS
-	  && mops_cost <= libcall_cost
-	  && mops_cost <= simd_ops)
-	return aarch64_expand_setmem_mops (operands);
-      /* If MOPS is not available or not shorter pick a libcall if the SIMD
-	 sequence is too long.  */
-      else if (libcall_cost < simd_ops)
-	return false;
-      emit_insn (seq);
-      return true;
-    }
+  /* When optimizing for size we have 3 options: the inlined sequence,
+     call to memset or the MOPS expansion.  */
+  if (size_p
+      && TARGET_MOPS
+      && mops_cost <= libcall_cost
+      && mops_cost <= inlined_ops)
+    return aarch64_expand_setmem_mops (operands);
+  /* Pick a libcall if the inlined sequence is too long.  */
+  else if (libcall_cost < inlined_ops)
+    return false;
 
-  /* At this point the SIMD broadcast sequence is the best choice when
-     optimizing for speed.  */
   emit_insn (seq);
   return true;
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
index 664d43aee13..63c864b25b0 100644
--- a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-1.c
@@ -1,28 +1,27 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -mstrict-align" } */
 
-struct s { char x[255]; };
+struct s { char x[207]; };
+struct s s1 __attribute__((aligned(16)));
 void foo (struct s *);
-void bar (void) { struct s s1 = {}; foo (&s1); }
+void bar (void) { s1 = (struct s){}; foo (&s1); }
 
-/* memset (s1 = {}, sizeof = 255) should be expanded out
+/* memset (s1 = {}, sizeof = 207) should be expanded out
    such that there are no overlap stores when -mstrict-align
    is in use.
-   so 7 pairs of 16 bytes stores (224 bytes).
-   1 16 byte stores
+   so 6 pairs of 16 bytes stores (96 bytes).
    1 8 byte store
    1 4 byte store
    1 2 byte store
    1 1 byte store
    */
 
-/* { dg-final { scan-assembler-times "stp\tq" 7 } } */
-/* { dg-final { scan-assembler-times "str\tq" 1 } } */
+/* { dg-final { scan-assembler-times "stp\tq" 6 } } */
+/* { dg-final { scan-assembler-times "str\tq" 0 } } */
 /* { dg-final { scan-assembler-times "str\txzr" 1 } } */
 /* { dg-final { scan-assembler-times "str\twzr" 1 } } */
 /* { dg-final { scan-assembler-times "strh\twzr" 1 } } */
 /* { dg-final { scan-assembler-times "strb\twzr" 1 } } */
 
-/* Also one store pair for the frame-pointer and the LR. */
-/* { dg-final { scan-assembler-times "stp\tx" 1 } } */
-
+/* No store pair with 8 byte words is needed as foo is called with a sibcall. */
+/* { dg-final { scan-assembler-times "stp\tx" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
new file mode 100644
index 00000000000..650be86604b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-2.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mstrict-align" } */
+
+struct s { char x[7]; };
+void foo (struct s *);
+void bar (struct s *s1) { *s1 = (struct s){}; foo (s1); }
+
+/* memset (s1 = {}, sizeof = 7) should be expanded out
+   such that there are no overlap stores when -mstrict-align
+   is in use. As the alignment of s1 is unknown, byte stores are needed.
+   so 15 byte stores
+   */
+
+/* { dg-final { scan-assembler-times "strb\twzr" 7 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
new file mode 100644
index 00000000000..09cb9a654dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/memset-strict-align-3.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Os -mstrict-align" } */
+
+int a[2*3];
+int f(int t)
+{
+  __builtin_memset(a, t, 2*3*sizeof(int));
+}
+
+/* memset (a, (val), sizeof = 2*3*4) should be expanded out
+   such that there are no overlap stores when -mstrict-align
+   is in use. As the alignment of a is 4 byte aligned (due to -Os),
+   store word pairs are needed. so 3 stp are in use.  */
+
+/* { dg-final { scan-assembler-times "stp\ts" 3 } } */