[aarch64] Use dup and zip1 for interleaving elements in initializing vector

Message ID CAAgBjM=0mHW4Aw2u-Kksy=OV5KY-G7_CW+mrT1QKPyKMrBi80g@mail.gmail.com
State Accepted
Series [aarch64] Use dup and zip1 for interleaving elements in initializing vector

Checks

snail/gcc-patch-check: success (Github commit url)

Commit Message

Prathamesh Kulkarni Nov. 29, 2022, 2:39 p.m. UTC
  Hi,
For the following test-case:

int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, y };
}

Code gen at -O3:
foo:
        dup    v0.8h, w0
        ins     v0.h[1], w1
        ins     v0.h[3], w1
        ins     v0.h[5], w1
        ins     v0.h[7], w1
        ret

For 16 elements, this approach results in 8 ins instructions, which is
probably not optimal.
I guess the above code-gen would be equivalent to the following:
dup v0.8h, w0
dup v1.8h, w1
zip1 v0.8h, v0.8h, v1.8h
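
For reference, the intended sequence expressed with <arm_neon.h> intrinsics
(illustration only, not part of the patch; foo_zip is a made-up name):

#include <arm_neon.h>

/* { x, y, x, y, x, y, x, y } via dup + dup + zip1.  */
int16x8_t foo_zip(int16_t x, int16_t y)
{
  int16x8_t vx = vdupq_n_s16(x);   /* dup  v0.8h, w0           */
  int16x8_t vy = vdupq_n_s16(y);   /* dup  v1.8h, w1           */
  return vzip1q_s16(vx, vy);       /* zip1 v0.8h, v0.8h, v1.8h */
}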

I have attached a patch that does this when the number of elements is >= 8,
which should be better than the current code-gen.
Patch passes bootstrap+test on aarch64-linux-gnu.
Does the patch look OK?

Thanks,
Prathamesh
  

Comments

Andrew Pinski Nov. 29, 2022, 3:13 p.m. UTC | #1
On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }

(Not to block this patch)
Seems like this trick can be applied even with a less-than-perfect initializer too:
e.g.
int16x8_t foo(int16_t x, int16_t y)
{
  return (int16x8_t) { x, y, x, y, x, y, x, 0 };
}

Which should generate something like:
dup v0.8h, w0
dup v1.8h, w1
zip1 v0.8h, v0.8h, v1.8h
ins v0.h[7], wzr
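
In intrinsics terms that would be roughly (illustration only; foo_partial is
a made-up name):

#include <arm_neon.h>

int16x8_t foo_partial(int16_t x, int16_t y)
{
  /* Interleave x and y, then overwrite the one lane that should be 0.  */
  int16x8_t v = vzip1q_s16(vdupq_n_s16(x), vdupq_n_s16(y));
  return vsetq_lane_s16(0, v, 7);   /* ins v0.h[7], wzr */
}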

Thanks,
Andrew Pinski


>
> Code gen at -O3:
> foo:
>         dup    v0.8h, w0
>         ins     v0.h[1], w1
>         ins     v0.h[3], w1
>         ins     v0.h[5], w1
>         ins     v0.h[7], w1
>         ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh
  
Prathamesh Kulkarni Nov. 29, 2022, 5:06 p.m. UTC | #2
On Tue, 29 Nov 2022 at 20:43, Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Tue, Nov 29, 2022 at 6:40 AM Prathamesh Kulkarni via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi,
> > For the following test-case:
> >
> > int16x8_t foo(int16_t x, int16_t y)
> > {
> >   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > }
>
> (Not to block this patch)
> Seems like this trick can be done even with less than perfect initializer too:
> e.g.
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, 0 };
> }
>
> Which should generate something like:
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
> ins v0.h[7], wzr
Hi Andrew,
Nice catch, thanks for the suggestions!
More generally, code-gen with constants involved seems to be sub-optimal.
For example:
int16x8_t foo(int16_t x)
{
  return (int16x8_t) { x, x, x, x, x, x, x, 1 };
}

results in:
foo:
        movi    v0.8h, 0x1
        ins     v0.h[0], w0
        ins     v0.h[1], w0
        ins     v0.h[2], w0
        ins     v0.h[3], w0
        ins     v0.h[4], w0
        ins     v0.h[5], w0
        ins     v0.h[6], w0
        ret

which I suppose could instead be the following:
foo:
        dup     v0.8h, w0
        mov     w1, 0x1
        ins     v0.h[7], w1
        ret
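
i.e. roughly the following in intrinsics (illustration only; foo_const is a
made-up name):

#include <arm_neon.h>

int16x8_t foo_const(int16_t x)
{
  /* Broadcast x, then insert the constant into lane 7.  */
  return vsetq_lane_s16(1, vdupq_n_s16(x), 7);
}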

I will try to address this in a follow-up patch.

Thanks,
Prathamesh

>
> Thanks,
> Andrew Pinski
>
>
> >
> > Code gen at -O3:
> > foo:
> >         dup    v0.8h, w0
> >         ins     v0.h[1], w1
> >         ins     v0.h[3], w1
> >         ins     v0.h[5], w1
> >         ins     v0.h[7], w1
> >         ret
> >
> > For 16 elements, it results in 8 ins instructions which might not be
> > optimal perhaps.
> > I guess, the above code-gen would be equivalent to the following ?
> > dup v0.8h, w0
> > dup v1.8h, w1
> > zip1 v0.8h, v0.8h, v1.8h
> >
> > I have attached patch to do the same, if number of elements >= 8,
> > which should be possibly better compared to current code-gen ?
> > Patch passes bootstrap+test on aarch64-linux-gnu.
> > Does the patch look OK ?
> >
> > Thanks,
> > Prathamesh
  
Richard Sandiford Dec. 5, 2022, 10:52 a.m. UTC | #3
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> Hi,
> For the following test-case:
>
> int16x8_t foo(int16_t x, int16_t y)
> {
>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> }
>
> Code gen at -O3:
> foo:
>         dup    v0.8h, w0
>         ins     v0.h[1], w1
>         ins     v0.h[3], w1
>         ins     v0.h[5], w1
>         ins     v0.h[7], w1
>         ret
>
> For 16 elements, it results in 8 ins instructions which might not be
> optimal perhaps.
> I guess, the above code-gen would be equivalent to the following ?
> dup v0.8h, w0
> dup v1.8h, w1
> zip1 v0.8h, v0.8h, v1.8h
>
> I have attached patch to do the same, if number of elements >= 8,
> which should be possibly better compared to current code-gen ?
> Patch passes bootstrap+test on aarch64-linux-gnu.
> Does the patch look OK ?
>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c91df6f5006..e5dea70e363 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>  
> +  /* Check for interleaving case.
> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> +     Generate following code:
> +     dup v0.h, x
> +     dup v1.h, y
> +     zip1 v0.h, v0.h, v1.h
> +     for "large enough" initializer.  */
> +
> +  if (n_elts >= 8)
> +    {
> +      int i;
> +      for (i = 2; i < n_elts; i++)
> +	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> +	  break;
> +
> +      if (i == n_elts)
> +	{
> +	  machine_mode mode = GET_MODE (target);
> +	  rtx dest[2];
> +
> +	  for (int i = 0; i < 2; i++)
> +	    {
> +	      rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));

Formatting nit: long line.

> +	      dest[i] = gen_reg_rtx (mode);
> +	      aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> +	    }

This could probably be written:

	  for (int i = 0; i < 2; i++)
	    {
	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
	      dest[i] = force_reg (GET_MODE_INNER (mode), x);
	    }

which avoids forcing constant elements into a register before the duplication.

OK with that change if it works.

Thanks,
Richard

> +
> +	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> +	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +	  return;
> +	}
> +    }
> +
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> new file mode 100644
> index 00000000000..ee775048589
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +**	...
> +**	dup	v[0-9]+\.8h, w[0-9]+
> +**	dup	v[0-9]+\.8h, w[0-9]+
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +**	ret
> +*/
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
> +  return v;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +**	dup	v[0-9]+\.8h, w[0-9]+
> +**	movi	v[0-9]+\.8h, 0x1
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +**	ret
> +*/
> +
> +int16x8_t foo2(int16_t x) 
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
> +  return v;
> +}
  
Richard Sandiford Dec. 5, 2022, 11:20 a.m. UTC | #4
Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> Hi,
>> For the following test-case:
>>
>> int16x8_t foo(int16_t x, int16_t y)
>> {
>>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> }
>>
>> Code gen at -O3:
>> foo:
>>         dup    v0.8h, w0
>>         ins     v0.h[1], w1
>>         ins     v0.h[3], w1
>>         ins     v0.h[5], w1
>>         ins     v0.h[7], w1
>>         ret
>>
>> For 16 elements, it results in 8 ins instructions which might not be
>> optimal perhaps.
>> I guess, the above code-gen would be equivalent to the following ?
>> dup v0.8h, w0
>> dup v1.8h, w1
>> zip1 v0.8h, v0.8h, v1.8h
>>
>> I have attached patch to do the same, if number of elements >= 8,
>> which should be possibly better compared to current code-gen ?
>> Patch passes bootstrap+test on aarch64-linux-gnu.
>> Does the patch look OK ?
>>
>> Thanks,
>> Prathamesh
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index c91df6f5006..e5dea70e363 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>>        return;
>>      }
>>  
>> +  /* Check for interleaving case.
>> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> +     Generate following code:
>> +     dup v0.h, x
>> +     dup v1.h, y
>> +     zip1 v0.h, v0.h, v1.h
>> +     for "large enough" initializer.  */
>> +
>> +  if (n_elts >= 8)
>> +    {
>> +      int i;
>> +      for (i = 2; i < n_elts; i++)
>> +	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> +	  break;
>> +
>> +      if (i == n_elts)
>> +	{
>> +	  machine_mode mode = GET_MODE (target);
>> +	  rtx dest[2];
>> +
>> +	  for (int i = 0; i < 2; i++)
>> +	    {
>> +	      rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
>
> Formatting nit: long line.
>
>> +	      dest[i] = gen_reg_rtx (mode);
>> +	      aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> +	    }
>
> This could probably be written:
>
> 	  for (int i = 0; i < 2; i++)
> 	    {
> 	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> 	      dest[i] = force_reg (GET_MODE_INNER (mode), x);

Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.

> 	    }
>
> which avoids forcing constant elements into a register before the duplication.
> OK with that change if it works.
>
> Thanks,
> Richard
>
>> +
>> +	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> +	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> +	  return;
>> +	}
>> +    }
>> +
>>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>>    gcc_assert (icode != CODE_FOR_nothing);
>>  
>> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> new file mode 100644
>> index 00000000000..ee775048589
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foo:
>> +**	...
>> +**	dup	v[0-9]+\.8h, w[0-9]+
>> +**	dup	v[0-9]+\.8h, w[0-9]+
>> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +**	...
>> +**	ret
>> +*/
>> +
>> +int16x8_t foo(int16_t x, int y)
>> +{
>> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
>> +  return v;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**	...
>> +**	dup	v[0-9]+\.8h, w[0-9]+
>> +**	movi	v[0-9]+\.8h, 0x1
>> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> +**	...
>> +**	ret
>> +*/
>> +
>> +int16x8_t foo2(int16_t x) 
>> +{
>> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
>> +  return v;
>> +}
  
Prathamesh Kulkarni Dec. 6, 2022, 1:31 a.m. UTC | #5
On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> Hi,
> >> For the following test-case:
> >>
> >> int16x8_t foo(int16_t x, int16_t y)
> >> {
> >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> }
> >>
> >> Code gen at -O3:
> >> foo:
> >>         dup    v0.8h, w0
> >>         ins     v0.h[1], w1
> >>         ins     v0.h[3], w1
> >>         ins     v0.h[5], w1
> >>         ins     v0.h[7], w1
> >>         ret
> >>
> >> For 16 elements, it results in 8 ins instructions which might not be
> >> optimal perhaps.
> >> I guess, the above code-gen would be equivalent to the following ?
> >> dup v0.8h, w0
> >> dup v1.8h, w1
> >> zip1 v0.8h, v0.8h, v1.8h
> >>
> >> I have attached patch to do the same, if number of elements >= 8,
> >> which should be possibly better compared to current code-gen ?
> >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> Does the patch look OK ?
> >>
> >> Thanks,
> >> Prathamesh
> >>
> >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> index c91df6f5006..e5dea70e363 100644
> >> --- a/gcc/config/aarch64/aarch64.cc
> >> +++ b/gcc/config/aarch64/aarch64.cc
> >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >>        return;
> >>      }
> >>
> >> +  /* Check for interleaving case.
> >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> +     Generate following code:
> >> +     dup v0.h, x
> >> +     dup v1.h, y
> >> +     zip1 v0.h, v0.h, v1.h
> >> +     for "large enough" initializer.  */
> >> +
> >> +  if (n_elts >= 8)
> >> +    {
> >> +      int i;
> >> +      for (i = 2; i < n_elts; i++)
> >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> +      break;
> >> +
> >> +      if (i == n_elts)
> >> +    {
> >> +      machine_mode mode = GET_MODE (target);
> >> +      rtx dest[2];
> >> +
> >> +      for (int i = 0; i < 2; i++)
> >> +        {
> >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> >
> > Formatting nit: long line.
> >
> >> +          dest[i] = gen_reg_rtx (mode);
> >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> +        }
> >
> > This could probably be written:
> >
> >         for (int i = 0; i < 2; i++)
> >           {
> >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
>
> Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
Thanks, I have pushed the change in
769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
bootstrap+test on aarch64-linux-gnu.

Thanks,
Prathamesh
>
> >           }
> >
> > which avoids forcing constant elements into a register before the duplication.
> > OK with that change if it works.
> >
> > Thanks,
> > Richard
> >
> >> +
> >> +      rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> +      emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> +      return;
> >> +    }
> >> +    }
> >> +
> >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >>    gcc_assert (icode != CODE_FOR_nothing);
> >>
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> new file mode 100644
> >> index 00000000000..ee775048589
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> @@ -0,0 +1,37 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-O3" } */
> >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> +
> >> +#include <arm_neon.h>
> >> +
> >> +/*
> >> +** foo:
> >> +**  ...
> >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +**  ...
> >> +**  ret
> >> +*/
> >> +
> >> +int16x8_t foo(int16_t x, int y)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> >> +  return v;
> >> +}
> >> +
> >> +/*
> >> +** foo2:
> >> +**  ...
> >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> +**  movi    v[0-9]+\.8h, 0x1
> >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> +**  ...
> >> +**  ret
> >> +*/
> >> +
> >> +int16x8_t foo2(int16_t x)
> >> +{
> >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> >> +  return v;
> >> +}
  
Prathamesh Kulkarni Dec. 26, 2022, 4:22 a.m. UTC | #6
On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > >> Hi,
> > >> For the following test-case:
> > >>
> > >> int16x8_t foo(int16_t x, int16_t y)
> > >> {
> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> > >> }
> > >>
> > >> Code gen at -O3:
> > >> foo:
> > >>         dup    v0.8h, w0
> > >>         ins     v0.h[1], w1
> > >>         ins     v0.h[3], w1
> > >>         ins     v0.h[5], w1
> > >>         ins     v0.h[7], w1
> > >>         ret
> > >>
> > >> For 16 elements, it results in 8 ins instructions which might not be
> > >> optimal perhaps.
> > >> I guess, the above code-gen would be equivalent to the following ?
> > >> dup v0.8h, w0
> > >> dup v1.8h, w1
> > >> zip1 v0.8h, v0.8h, v1.8h
> > >>
> > >> I have attached patch to do the same, if number of elements >= 8,
> > >> which should be possibly better compared to current code-gen ?
> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> > >> Does the patch look OK ?
> > >>
> > >> Thanks,
> > >> Prathamesh
> > >>
> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > >> index c91df6f5006..e5dea70e363 100644
> > >> --- a/gcc/config/aarch64/aarch64.cc
> > >> +++ b/gcc/config/aarch64/aarch64.cc
> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> > >>        return;
> > >>      }
> > >>
> > >> +  /* Check for interleaving case.
> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > >> +     Generate following code:
> > >> +     dup v0.h, x
> > >> +     dup v1.h, y
> > >> +     zip1 v0.h, v0.h, v1.h
> > >> +     for "large enough" initializer.  */
> > >> +
> > >> +  if (n_elts >= 8)
> > >> +    {
> > >> +      int i;
> > >> +      for (i = 2; i < n_elts; i++)
> > >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > >> +      break;
> > >> +
> > >> +      if (i == n_elts)
> > >> +    {
> > >> +      machine_mode mode = GET_MODE (target);
> > >> +      rtx dest[2];
> > >> +
> > >> +      for (int i = 0; i < 2; i++)
> > >> +        {
> > >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> > >
> > > Formatting nit: long line.
> > >
> > >> +          dest[i] = gen_reg_rtx (mode);
> > >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> > >> +        }
> > >
> > > This could probably be written:
> > >
> > >         for (int i = 0; i < 2; i++)
> > >           {
> > >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >
> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> Thanks, I have pushed the change in
> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> bootstrap+test on aarch64-linux-gnu.
Hi Richard,
I have attached a patch that extends the transform to the case where one
half is a dup and the other is a set of constants.
For eg:
int8x16_t f(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
}

code-gen trunk:
f:
        adrp    x1, .LC0
        ldr     q0, [x1, #:lo12:.LC0]
        ins     v0.b[0], w0
        ins     v0.b[2], w0
        ins     v0.b[4], w0
        ins     v0.b[6], w0
        ins     v0.b[8], w0
        ins     v0.b[10], w0
        ins     v0.b[12], w0
        ins     v0.b[14], w0
        ret

code-gen with patch:
f:
        dup     v0.16b, w0
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        zip1    v0.16b, v0.16b, v1.16b
        ret
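
which corresponds roughly to this intrinsics sequence (illustration only;
the local constant vector stands in for the .LC0 literal-pool entry):

#include <arm_neon.h>

int8x16_t f_zip(int8_t x)
{
  /* Only the low 8 lanes of c matter; zip1 ignores the upper half.  */
  const int8x16_t c = { 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0 };
  return vzip1q_s8(vdupq_n_s8(x), c);
}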

Bootstrapped+tested on aarch64-linux-gnu.
Does it look OK ?

Thanks,
Prathamesh
>

> Thanks,
> Prathamesh
> >
> > >           }
> > >
> > > which avoids forcing constant elements into a register before the duplication.
> > > OK with that change if it works.
> > >
> > > Thanks,
> > > Richard
> > >
> > >> +
> > >> +      rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > >> +      emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > >> +      return;
> > >> +    }
> > >> +    }
> > >> +
> > >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
> > >>    gcc_assert (icode != CODE_FOR_nothing);
> > >>
> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> new file mode 100644
> > >> index 00000000000..ee775048589
> > >> --- /dev/null
> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > >> @@ -0,0 +1,37 @@
> > >> +/* { dg-do compile } */
> > >> +/* { dg-options "-O3" } */
> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> > >> +
> > >> +#include <arm_neon.h>
> > >> +
> > >> +/*
> > >> +** foo:
> > >> +**  ...
> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +**  ...
> > >> +**  ret
> > >> +*/
> > >> +
> > >> +int16x8_t foo(int16_t x, int y)
> > >> +{
> > >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > >> +  return v;
> > >> +}
> > >> +
> > >> +/*
> > >> +** foo2:
> > >> +**  ...
> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> > >> +**  movi    v[0-9]+\.8h, 0x1
> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > >> +**  ...
> > >> +**  ret
> > >> +*/
> > >> +
> > >> +int16x8_t foo2(int16_t x)
> > >> +{
> > >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > >> +  return v;
> > >> +}
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9a79a9e7928..411e85f52a4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21984,6 +21984,54 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* Subroutine of aarch64_expand_vector_init.
+   Check if VALS has same element at every alternate position
+   from START_POS.  */
+
+static
+bool aarch64_init_interleaving_dup_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos + 2; i < XVECLEN (vals, 0); i += 2)
+    if (!rtx_equal_p (XVECEXP (vals, 0, start_pos), XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Check if every alternate element in VALS starting from START_POS
+   is a constant.  */
+
+static
+bool aarch64_init_interleaving_const_p (rtx vals, int start_pos)
+{
+  for (int i = start_pos; i < XVECLEN (vals, 0); i += 2)
+    if (!CONSTANT_P (XVECEXP (vals, 0, i)))
+      return false;
+  return true;
+}
+
+/* Subroutine of aarch64_expand_vector_init.
+   Copy all odd-numbered or even-numbered elements from VALS
+   depending on CONST_EVEN.
+   For eg if VALS is { x, 1, x, 2, x, 3, x, 4 }
+   return {1, 2, 3, 4, 1, 1, 1, 1}.
+   We are only interested in the first half {0 ... n_elts/2} since
+   that will be used by zip1 for merging. Fill the second half
+   with an arbitrary value since it will be discarded.  */
+
+static
+rtx aarch64_init_interleaving_shift_init (rtx vals, bool const_even)
+{
+  int n_elts = XVECLEN (vals, 0);
+  rtvec vec = rtvec_alloc (n_elts);
+  int i;
+  for (i = 0; i < n_elts / 2; i++)
+    RTVEC_ELT (vec, i) = XVECEXP (vals, 0, (const_even) ? 2 * i : 2 * i + 1);
+  for (; i < n_elts; i++)
+    RTVEC_ELT (vec, i) = RTVEC_ELT (vec, 0);
+  return gen_rtx_CONST_VECTOR (GET_MODE (vals), vec);
+}
+
 /* Expand a vector initialisation sequence, such that TARGET is
    initialised to contain VALS.  */
 
@@ -22048,22 +22096,55 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
+  /* Check for interleaving case for "large enough" initializer.
+     Currently we handle following cases:
+     (a) Even part is dup and odd part is const.
+     (b) Odd part is dup and even part is const.
+     (c) Both even and odd parts are dup.  */
 
   if (n_elts >= 8)
     {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
+      bool even_dup = false, even_const = false;
+      bool odd_dup = false, odd_const = false;
+
+      even_dup = aarch64_init_interleaving_dup_p (vals, 0);
+      if (!even_dup)
+	even_const = aarch64_init_interleaving_const_p (vals, 0);
+
+      odd_dup = aarch64_init_interleaving_dup_p (vals, 1);
+      if (!odd_dup)
+	odd_const = aarch64_init_interleaving_const_p (vals, 1);
+
+      /* This case should already be handled above when all elements are constants.  */
+      gcc_assert (!(even_const && odd_const));
 
-      if (i == n_elts)
+      if (even_dup && odd_const)
+	{
+	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 0));
+	  dup_reg = force_reg (mode, dup_reg);
+
+	  rtx const_reg = gen_reg_rtx (mode);
+	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, false);
+	  aarch64_expand_vector_init (const_reg, const_vector);
+
+	  rtvec v = gen_rtvec (2, dup_reg, const_reg);
+	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+	  return;
+	}
+      else if (odd_dup && even_const)
+	{
+	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 1));
+	  dup_reg = force_reg (mode, dup_reg);
+
+	  rtx const_reg = gen_reg_rtx (mode);
+	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, true);
+	  aarch64_expand_vector_init (const_reg, const_vector);
+
+	  rtvec v = gen_rtvec (2, const_reg, dup_reg);
+	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+	  return;
+	}
+      else if (even_dup && odd_dup)
 	{
 	  machine_mode mode = GET_MODE (target);
 	  rtx dest[2];
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
new file mode 100644
index 00000000000..3ad06c00451
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**foo:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo(int16_t x)
+{
+  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 }; 
+}
+
+
+/*
+**foo2:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+int16x8_t foo2(int16_t x)
+{
+  return (int16x8_t) { 1, x, 2, x, 3, x, 4, x };
+}
+
+/*
+**foo3:
+**	...
+**	dup	v[0-9]+\.8h, v[0-9]+\.h\[0\]
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+*/
+
+float16x8_t foo3(float16_t x)
+{
+  return (float16x8_t) { x, 1.0, x, 2.0, x, 3.0, x, 4.0 };
+}
  
Richard Sandiford Jan. 12, 2023, 3:51 p.m. UTC | #7
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
>>
>> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>> >
>> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > >> Hi,
>> > >> For the following test-case:
>> > >>
>> > >> int16x8_t foo(int16_t x, int16_t y)
>> > >> {
>> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> > >> }
>> > >>
>> > >> Code gen at -O3:
>> > >> foo:
>> > >>         dup    v0.8h, w0
>> > >>         ins     v0.h[1], w1
>> > >>         ins     v0.h[3], w1
>> > >>         ins     v0.h[5], w1
>> > >>         ins     v0.h[7], w1
>> > >>         ret
>> > >>
>> > >> For 16 elements, it results in 8 ins instructions which might not be
>> > >> optimal perhaps.
>> > >> I guess, the above code-gen would be equivalent to the following ?
>> > >> dup v0.8h, w0
>> > >> dup v1.8h, w1
>> > >> zip1 v0.8h, v0.8h, v1.8h
>> > >>
>> > >> I have attached patch to do the same, if number of elements >= 8,
>> > >> which should be possibly better compared to current code-gen ?
>> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
>> > >> Does the patch look OK ?
>> > >>
>> > >> Thanks,
>> > >> Prathamesh
>> > >>
>> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > >> index c91df6f5006..e5dea70e363 100644
>> > >> --- a/gcc/config/aarch64/aarch64.cc
>> > >> +++ b/gcc/config/aarch64/aarch64.cc
>> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>> > >>        return;
>> > >>      }
>> > >>
>> > >> +  /* Check for interleaving case.
>> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> > >> +     Generate following code:
>> > >> +     dup v0.h, x
>> > >> +     dup v1.h, y
>> > >> +     zip1 v0.h, v0.h, v1.h
>> > >> +     for "large enough" initializer.  */
>> > >> +
>> > >> +  if (n_elts >= 8)
>> > >> +    {
>> > >> +      int i;
>> > >> +      for (i = 2; i < n_elts; i++)
>> > >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> > >> +      break;
>> > >> +
>> > >> +      if (i == n_elts)
>> > >> +    {
>> > >> +      machine_mode mode = GET_MODE (target);
>> > >> +      rtx dest[2];
>> > >> +
>> > >> +      for (int i = 0; i < 2; i++)
>> > >> +        {
>> > >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
>> > >
>> > > Formatting nit: long line.
>> > >
>> > >> +          dest[i] = gen_reg_rtx (mode);
>> > >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> > >> +        }
>> > >
>> > > This could probably be written:
>> > >
>> > >         for (int i = 0; i < 2; i++)
>> > >           {
>> > >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
>> > >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
>> >
>> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
>> Thanks, I have pushed the change in
>> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
>> bootstrap+test on aarch64-linux-gnu.
> Hi Richard,
> I have attached a patch that extends the transform if one half is dup
> and other is set of constants.
> For eg:
> int8x16_t f(int8_t x)
> {
>   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> }
>
> code-gen trunk:
> f:
>         adrp    x1, .LC0
>         ldr     q0, [x1, #:lo12:.LC0]
>         ins     v0.b[0], w0
>         ins     v0.b[2], w0
>         ins     v0.b[4], w0
>         ins     v0.b[6], w0
>         ins     v0.b[8], w0
>         ins     v0.b[10], w0
>         ins     v0.b[12], w0
>         ins     v0.b[14], w0
>         ret
>
> code-gen with patch:
> f:
>         dup     v0.16b, w0
>         adrp    x0, .LC0
>         ldr     q1, [x0, #:lo12:.LC0]
>         zip1    v0.16b, v0.16b, v1.16b
>         ret
>
> Bootstrapped+tested on aarch64-linux-gnu.
> Does it look OK ?

Looks like a nice improvement.  It'll need to wait for GCC 14 now though.

However, rather than handle this case specially, I think we should instead
take a divide-and-conquer approach: split the initialiser into even and
odd elements, find the best way of loading each part, then compare the
cost of these sequences + ZIP with the cost of the fallback code (the code
later in aarch64_expand_vector_init).

For example, doing that would allow:

  { x, y, 0, y, 0, y, 0, y, 0, y }

to be loaded more easily, even though the even elements aren't wholly
constant.
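
A rough sketch of that shape (purely illustrative, not the final
implementation; it assumes the existing expansion is factored out into a
helper such as aarch64_expand_vector_init_fallback):

static void
aarch64_expand_vector_init (rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  int n_elts = XVECLEN (vals, 0);

  /* Split VALS into even and odd elements.  zip1 only reads the low half
     of each operand, so pad the upper lanes with arbitrary elements.  */
  rtvec even = rtvec_alloc (n_elts);
  rtvec odd = rtvec_alloc (n_elts);
  for (int i = 0; i < n_elts / 2; i++)
    {
      RTVEC_ELT (even, i) = XVECEXP (vals, 0, 2 * i);
      RTVEC_ELT (odd, i) = XVECEXP (vals, 0, 2 * i + 1);
    }
  for (int i = n_elts / 2; i < n_elts; i++)
    {
      RTVEC_ELT (even, i) = XVECEXP (vals, 0, 0);
      RTVEC_ELT (odd, i) = XVECEXP (vals, 0, 1);
    }

  /* Build the zip1-based sequence without emitting it.  */
  start_sequence ();
  rtx dest[2] = { gen_reg_rtx (mode), gen_reg_rtx (mode) };
  aarch64_expand_vector_init_fallback (dest[0], gen_rtx_PARALLEL (mode, even));
  aarch64_expand_vector_init_fallback (dest[1], gen_rtx_PARALLEL (mode, odd));
  emit_set_insn (target, gen_rtx_UNSPEC (mode, gen_rtvec (2, dest[0], dest[1]),
                                         UNSPEC_ZIP1));
  rtx_insn *zip_seq = get_insns ();
  end_sequence ();

  /* Build the fallback sequence the same way.  */
  start_sequence ();
  aarch64_expand_vector_init_fallback (target, vals);
  rtx_insn *fallback_seq = get_insns ();
  end_sequence ();

  /* Emit whichever sequence is cheaper.  */
  bool speed = !optimize_function_for_size_p (cfun);
  emit_insn (seq_cost (zip_seq, speed) <= seq_cost (fallback_seq, speed)
             ? zip_seq : fallback_seq);
}

Recursing on the halves (with some depth limit) rather than just calling the
fallback on them would be a further refinement.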

Thanks,
Richard

>
> Thanks,
> Prathamesh
>>
>
>> Thanks,
>> Prathamesh
>> >
>> > >           }
>> > >
>> > > which avoids forcing constant elements into a register before the duplication.
>> > > OK with that change if it works.
>> > >
>> > > Thanks,
>> > > Richard
>> > >
>> > >> +
>> > >> +      rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> > >> +      emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> > >> +      return;
>> > >> +    }
>> > >> +    }
>> > >> +
>> > >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>> > >>    gcc_assert (icode != CODE_FOR_nothing);
>> > >>
>> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> > >> new file mode 100644
>> > >> index 00000000000..ee775048589
>> > >> --- /dev/null
>> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> > >> @@ -0,0 +1,37 @@
>> > >> +/* { dg-do compile } */
>> > >> +/* { dg-options "-O3" } */
>> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > >> +
>> > >> +#include <arm_neon.h>
>> > >> +
>> > >> +/*
>> > >> +** foo:
>> > >> +**  ...
>> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
>> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
>> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> > >> +**  ...
>> > >> +**  ret
>> > >> +*/
>> > >> +
>> > >> +int16x8_t foo(int16_t x, int y)
>> > >> +{
>> > >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
>> > >> +  return v;
>> > >> +}
>> > >> +
>> > >> +/*
>> > >> +** foo2:
>> > >> +**  ...
>> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
>> > >> +**  movi    v[0-9]+\.8h, 0x1
>> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> > >> +**  ...
>> > >> +**  ret
>> > >> +*/
>> > >> +
>> > >> +int16x8_t foo2(int16_t x)
>> > >> +{
>> > >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
>> > >> +  return v;
>> > >> +}
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 9a79a9e7928..411e85f52a4 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -21984,6 +21984,54 @@ aarch64_simd_make_constant (rtx vals)
>      return NULL_RTX;
>  }
>  
> +/* Subroutine of aarch64_expand_vector_init.
> +   Check if VALS has same element at every alternate position
> +   from START_POS.  */
> +
> +static
> +bool aarch64_init_interleaving_dup_p (rtx vals, int start_pos)
> +{
> +  for (int i = start_pos + 2; i < XVECLEN (vals, 0); i += 2)
> +    if (!rtx_equal_p (XVECEXP (vals, 0, start_pos), XVECEXP (vals, 0, i)))
> +      return false;
> +  return true;
> +}
> +
> +/* Subroutine of aarch64_expand_vector_init.
> +   Check if every alternate element in VALS starting from START_POS
> +   is a constant.  */
> +
> +static
> +bool aarch64_init_interleaving_const_p (rtx vals, int start_pos)
> +{
> +  for (int i = start_pos; i < XVECLEN (vals, 0); i += 2)
> +    if (!CONSTANT_P (XVECEXP (vals, 0, i)))
> +      return false;
> +  return true;
> +}
> +
> +/* Subroutine of aarch64_expand_vector_init.
> +   Copy all odd-numbered or even-numbered elements from VALS
> +   depending on CONST_EVEN.
> +   For eg if VALS is { x, 1, x, 2, x, 3, x, 4 }
> +   return {1, 2, 3, 4, 1, 1, 1, 1}.
> +   We are only interested in the first half {0 ... n_elts/2} since
> +   that will be used by zip1 for merging. Fill the second half
> +   with an arbitrary value since it will be discarded.  */
> +
> +static
> +rtx aarch64_init_interleaving_shift_init (rtx vals, bool const_even)
> +{
> +  int n_elts = XVECLEN (vals, 0);
> +  rtvec vec = rtvec_alloc (n_elts);
> +  int i;
> +  for (i = 0; i < n_elts / 2; i++)
> +    RTVEC_ELT (vec, i) = XVECEXP (vals, 0, (const_even) ? 2 * i : 2 * i + 1);
> +  for (; i < n_elts; i++)
> +    RTVEC_ELT (vec, i) = RTVEC_ELT (vec, 0);
> +  return gen_rtx_CONST_VECTOR (GET_MODE (vals), vec);
> +}
> +
>  /* Expand a vector initialisation sequence, such that TARGET is
>     initialised to contain VALS.  */
>  
> @@ -22048,22 +22096,55 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>  
> -  /* Check for interleaving case.
> -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> -     Generate following code:
> -     dup v0.h, x
> -     dup v1.h, y
> -     zip1 v0.h, v0.h, v1.h
> -     for "large enough" initializer.  */
> +  /* Check for interleaving case for "large enough" initializer.
> +     Currently we handle following cases:
> +     (a) Even part is dup and odd part is const.
> +     (b) Odd part is dup and even part is const.
> +     (c) Both even and odd parts are dup.  */
>  
>    if (n_elts >= 8)
>      {
> -      int i;
> -      for (i = 2; i < n_elts; i++)
> -	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -	  break;
> +      bool even_dup = false, even_const = false;
> +      bool odd_dup = false, odd_const = false;
> +
> +      even_dup = aarch64_init_interleaving_dup_p (vals, 0);
> +      if (!even_dup)
> +	even_const = aarch64_init_interleaving_const_p (vals, 0);
> +
> +      odd_dup = aarch64_init_interleaving_dup_p (vals, 1);
> +      if (!odd_dup)
> +	odd_const = aarch64_init_interleaving_const_p (vals, 1);
> +
> +      /* This case should already be handled above when all elements are constants.  */
> +      gcc_assert (!(even_const && odd_const));
>  
> -      if (i == n_elts)
> +      if (even_dup && odd_const)
> +	{
> +	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 0));
> +	  dup_reg = force_reg (mode, dup_reg);
> +
> +	  rtx const_reg = gen_reg_rtx (mode);
> +	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, false);
> +	  aarch64_expand_vector_init (const_reg, const_vector);
> +
> +	  rtvec v = gen_rtvec (2, dup_reg, const_reg);
> +	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +	  return;
> +	}
> +      else if (odd_dup && even_const)
> +	{
> +	  rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 1));
> +	  dup_reg = force_reg (mode, dup_reg);
> +
> +	  rtx const_reg = gen_reg_rtx (mode);
> +	  rtx const_vector = aarch64_init_interleaving_shift_init (vals, true);
> +	  aarch64_expand_vector_init (const_reg, const_vector);
> +
> +	  rtvec v = gen_rtvec (2, const_reg, dup_reg);
> +	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +	  return;
> +	}
> +      else if (even_dup && odd_dup)
>  	{
>  	  machine_mode mode = GET_MODE (target);
>  	  rtx dest[2];
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
> new file mode 100644
> index 00000000000..3ad06c00451
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +**foo:
> +**	...
> +**	dup	v[0-9]+\.8h, w[0-9]+
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +*/
> +
> +int16x8_t foo(int16_t x)
> +{
> +  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 }; 
> +}
> +
> +
> +/*
> +**foo2:
> +**	...
> +**	dup	v[0-9]+\.8h, w[0-9]+
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +*/
> +
> +int16x8_t foo2(int16_t x)
> +{
> +  return (int16x8_t) { 1, x, 2, x, 3, x, 4, x };
> +}
> +
> +/*
> +**foo3:
> +**	...
> +**	dup	v[0-9]+\.8h, v[0-9]+\.h\[0\]
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +*/
> +
> +float16x8_t foo3(float16_t x)
> +{
> +  return (float16x8_t) { x, 1.0, x, 2.0, x, 3.0, x, 4.0 };
> +}
  
Prathamesh Kulkarni Feb. 1, 2023, 9:36 a.m. UTC | #8
On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> >>
> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >> >
> >> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > >> Hi,
> >> > >> For the following test-case:
> >> > >>
> >> > >> int16x8_t foo(int16_t x, int16_t y)
> >> > >> {
> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> > >> }
> >> > >>
> >> > >> Code gen at -O3:
> >> > >> foo:
> >> > >>         dup    v0.8h, w0
> >> > >>         ins     v0.h[1], w1
> >> > >>         ins     v0.h[3], w1
> >> > >>         ins     v0.h[5], w1
> >> > >>         ins     v0.h[7], w1
> >> > >>         ret
> >> > >>
> >> > >> For 16 elements, it results in 8 ins instructions which might not be
> >> > >> optimal perhaps.
> >> > >> I guess, the above code-gen would be equivalent to the following ?
> >> > >> dup v0.8h, w0
> >> > >> dup v1.8h, w1
> >> > >> zip1 v0.8h, v0.8h, v1.8h
> >> > >>
> >> > >> I have attached patch to do the same, if number of elements >= 8,
> >> > >> which should be possibly better compared to current code-gen ?
> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> > >> Does the patch look OK ?
> >> > >>
> >> > >> Thanks,
> >> > >> Prathamesh
> >> > >>
> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> > >> index c91df6f5006..e5dea70e363 100644
> >> > >> --- a/gcc/config/aarch64/aarch64.cc
> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> > >>        return;
> >> > >>      }
> >> > >>
> >> > >> +  /* Check for interleaving case.
> >> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> > >> +     Generate following code:
> >> > >> +     dup v0.h, x
> >> > >> +     dup v1.h, y
> >> > >> +     zip1 v0.h, v0.h, v1.h
> >> > >> +     for "large enough" initializer.  */
> >> > >> +
> >> > >> +  if (n_elts >= 8)
> >> > >> +    {
> >> > >> +      int i;
> >> > >> +      for (i = 2; i < n_elts; i++)
> >> > >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> > >> +      break;
> >> > >> +
> >> > >> +      if (i == n_elts)
> >> > >> +    {
> >> > >> +      machine_mode mode = GET_MODE (target);
> >> > >> +      rtx dest[2];
> >> > >> +
> >> > >> +      for (int i = 0; i < 2; i++)
> >> > >> +        {
> >> > >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> >> > >
> >> > > Formatting nit: long line.
> >> > >
> >> > >> +          dest[i] = gen_reg_rtx (mode);
> >> > >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> > >> +        }
> >> > >
> >> > > This could probably be written:
> >> > >
> >> > >         for (int i = 0; i < 2; i++)
> >> > >           {
> >> > >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> >> > >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >> >
> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> >> Thanks, I have pushed the change in
> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> >> bootstrap+test on aarch64-linux-gnu.
> > Hi Richard,
> > I have attached a patch that extends the transform if one half is dup
> > and other is set of constants.
> > For eg:
> > int8x16_t f(int8_t x)
> > {
> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> > }
> >
> > code-gen trunk:
> > f:
> >         adrp    x1, .LC0
> >         ldr     q0, [x1, #:lo12:.LC0]
> >         ins     v0.b[0], w0
> >         ins     v0.b[2], w0
> >         ins     v0.b[4], w0
> >         ins     v0.b[6], w0
> >         ins     v0.b[8], w0
> >         ins     v0.b[10], w0
> >         ins     v0.b[12], w0
> >         ins     v0.b[14], w0
> >         ret
> >
> > code-gen with patch:
> > f:
> >         dup     v0.16b, w0
> >         adrp    x0, .LC0
> >         ldr     q1, [x0, #:lo12:.LC0]
> >         zip1    v0.16b, v0.16b, v1.16b
> >         ret
> >
> > Bootstrapped+tested on aarch64-linux-gnu.
> > Does it look OK ?
>
> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>
> However, rather than handle this case specially, I think we should instead
> take a divide-and-conquer approach: split the initialiser into even and
> odd elements, find the best way of loading each part, then compare the
> cost of these sequences + ZIP with the cost of the fallback code (the code
> later in aarch64_expand_vector_init).
>
> For example, doing that would allow:
>
>   { x, y, 0, y, 0, y, 0, y, 0, y }
>
> to be loaded more easily, even though the even elements aren't wholly
> constant.
Hi Richard,
I have attached a prototype patch based on the above approach.
It subsumes the special-casing of the above {x, y, x, y, x, y, x, y} case by
generating the same sequence, so I removed that hunk. It also improves the
following cases:

(a)
int8x16_t f_s16(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
                       x, 5, x, 6, x, 7, x, 8 };
}

code-gen trunk:
f_s16:
        adrp    x1, .LC0
        ldr     q0, [x1, #:lo12:.LC0]
        ins     v0.b[0], w0
        ins     v0.b[2], w0
        ins     v0.b[4], w0
        ins     v0.b[6], w0
        ins     v0.b[8], w0
        ins     v0.b[10], w0
        ins     v0.b[12], w0
        ins     v0.b[14], w0
        ret

code-gen with patch:
f_s16:
        dup     v0.16b, w0
        adrp    x0, .LC0
        ldr     q1, [x0, #:lo12:.LC0]
        zip1    v0.16b, v0.16b, v1.16b
        ret

(b)
int8x16_t f_s16(int8_t x, int8_t y)
{
  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
                       4, y, 5, y, 6, y, 7, y };
}

code-gen trunk:
f_s16:
        adrp    x2, .LC0
        ldr     q0, [x2, #:lo12:.LC0]
        ins     v0.b[0], w0
        ins     v0.b[1], w1
        ins     v0.b[3], w1
        ins     v0.b[5], w1
        ins     v0.b[7], w1
        ins     v0.b[9], w1
        ins     v0.b[11], w1
        ins     v0.b[13], w1
        ins     v0.b[15], w1
        ret

code-gen patch:
f_s16:
        adrp    x2, .LC0
        dup     v1.16b, w1
        ldr     q0, [x2, #:lo12:.LC0]
        ins     v0.b[0], w0
        zip1    v0.16b, v0.16b, v1.16b
        ret
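
which is roughly the following intrinsics sequence (illustration only; the
local 'evens' vector stands in for the .LC0 constant):

#include <arm_neon.h>

int8x16_t f_zip2(int8_t x, int8_t y)
{
  /* Low half holds the even-indexed elements; zip1 ignores the upper half.  */
  const int8x16_t evens = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 };
  int8x16_t a = vsetq_lane_s8(x, evens, 0);   /* ins  v0.b[0], w0 */
  int8x16_t b = vdupq_n_s8(y);                /* dup  v1.16b, w1  */
  return vzip1q_s8(a, b);                     /* zip1             */
}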

There are a couple of issues I have come across:
(1) Choosing the element to pad the vector with.
For example, if we are initializing a vector, say { x, y, 0, y, 1, y, 2, y }
with mode V8HI,
we split it into { x, 0, 1, 2 } and { y, y, y, y }.
However, since the mode is V8HI, we need to pad each of the split vectors
with 4 more elements to match the vector length.
For { x, 0, 1, 2 } using any constant is the obvious choice, while for
{ y, y, y, y } using 'y' is the obvious choice, thus making them:
{ x, 0, 1, 2, 0, 0, 0, 0 } and { y, y, y, y, y, y, y, y }
These are then merged using zip1, which only reads the lower half of each
vector, so the padding in the upper halves is discarded.
Currently I encoded the above two heuristics in
aarch64_expand_vector_init_get_padded_elem:
(a) If the split portion contains a constant, use that constant to pad the
vector.
(b) If the split portion contains only variables, use the most frequently
repeating variable to pad the vector.
I suppose this could be improved, though?

(2) Setting the cost for zip1:
Currently it returns 4 as the cost for the following zip1 insn:
(set (reg:V8HI 102)
    (unspec:V8HI [
            (reg:V8HI 103)
            (reg:V8HI 108)
        ] UNSPEC_ZIP1))
I am not sure if that's correct, or if not, what cost to use for zip1 in
this case?
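
One illustrative option (assuming zip1 should be costed as a single
instruction; I am not sure that is the right model) would be something
along these lines in the UNSPEC case of aarch64_rtx_costs:

      if (XINT (x, 1) == UNSPEC_ZIP1)
        {
          *cost = COSTS_N_INSNS (1);
          return true;
        }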

Thanks,
Prathamesh
>
> Thanks,
> Richard
>
> >
> > Thanks,
> > Prathamesh
> >>
> >
> >> Thanks,
> >> Prathamesh
> >> >
> >> > >           }
> >> > >
> >> > > which avoids forcing constant elements into a register before the duplication.
> >> > > OK with that change if it works.
> >> > >
> >> > > Thanks,
> >> > > Richard
> >> > >
> >> > >> +
> >> > >> +      rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> > >> +      emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> > >> +      return;
> >> > >> +    }
> >> > >> +    }
> >> > >> +
> >> > >>    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >> > >>    gcc_assert (icode != CODE_FOR_nothing);
> >> > >>
> >> > >> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> > >> new file mode 100644
> >> > >> index 00000000000..ee775048589
> >> > >> --- /dev/null
> >> > >> +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> > >> @@ -0,0 +1,37 @@
> >> > >> +/* { dg-do compile } */
> >> > >> +/* { dg-options "-O3" } */
> >> > >> +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > >> +
> >> > >> +#include <arm_neon.h>
> >> > >> +
> >> > >> +/*
> >> > >> +** foo:
> >> > >> +**  ...
> >> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> > >> +**  ...
> >> > >> +**  ret
> >> > >> +*/
> >> > >> +
> >> > >> +int16x8_t foo(int16_t x, int y)
> >> > >> +{
> >> > >> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> >> > >> +  return v;
> >> > >> +}
> >> > >> +
> >> > >> +/*
> >> > >> +** foo2:
> >> > >> +**  ...
> >> > >> +**  dup     v[0-9]+\.8h, w[0-9]+
> >> > >> +**  movi    v[0-9]+\.8h, 0x1
> >> > >> +**  zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> > >> +**  ...
> >> > >> +**  ret
> >> > >> +*/
> >> > >> +
> >> > >> +int16x8_t foo2(int16_t x)
> >> > >> +{
> >> > >> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> >> > >> +  return v;
> >> > >> +}
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 9a79a9e7928..411e85f52a4 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -21984,6 +21984,54 @@ aarch64_simd_make_constant (rtx vals)
> >      return NULL_RTX;
> >  }
> >
> > +/* Subroutine of aarch64_expand_vector_init.
> > +   Check if VALS has same element at every alternate position
> > +   from START_POS.  */
> > +
> > +static
> > +bool aarch64_init_interleaving_dup_p (rtx vals, int start_pos)
> > +{
> > +  for (int i = start_pos + 2; i < XVECLEN (vals, 0); i += 2)
> > +    if (!rtx_equal_p (XVECEXP (vals, 0, start_pos), XVECEXP (vals, 0, i)))
> > +      return false;
> > +  return true;
> > +}
> > +
> > +/* Subroutine of aarch64_expand_vector_init.
> > +   Check if every alternate element in VALS starting from START_POS
> > +   is a constant.  */
> > +
> > +static
> > +bool aarch64_init_interleaving_const_p (rtx vals, int start_pos)
> > +{
> > +  for (int i = start_pos; i < XVECLEN (vals, 0); i += 2)
> > +    if (!CONSTANT_P (XVECEXP (vals, 0, i)))
> > +      return false;
> > +  return true;
> > +}
> > +
> > +/* Subroutine of aarch64_expand_vector_init.
> > +   Copy all odd-numbered or even-numbered elements from VALS
> > +   depending on CONST_EVEN.
> > +   For eg if VALS is { x, 1, x, 2, x, 3, x, 4 }
> > +   return {1, 2, 3, 4, 1, 1, 1, 1}.
> > +   We are only interested in the first half {0 ... n_elts/2} since
> > +   that will be used by zip1 for merging. Fill the second half
> > +   with an arbitrary value since it will be discarded.  */
> > +
> > +static
> > +rtx aarch64_init_interleaving_shift_init (rtx vals, bool const_even)
> > +{
> > +  int n_elts = XVECLEN (vals, 0);
> > +  rtvec vec = rtvec_alloc (n_elts);
> > +  int i;
> > +  for (i = 0; i < n_elts / 2; i++)
> > +    RTVEC_ELT (vec, i) = XVECEXP (vals, 0, (const_even) ? 2 * i : 2 * i + 1);
> > +  for (; i < n_elts; i++)
> > +    RTVEC_ELT (vec, i) = RTVEC_ELT (vec, 0);
> > +  return gen_rtx_CONST_VECTOR (GET_MODE (vals), vec);
> > +}
> > +
> >  /* Expand a vector initialisation sequence, such that TARGET is
> >     initialised to contain VALS.  */
> >
> > @@ -22048,22 +22096,55 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >        return;
> >      }
> >
> > -  /* Check for interleaving case.
> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > -     Generate following code:
> > -     dup v0.h, x
> > -     dup v1.h, y
> > -     zip1 v0.h, v0.h, v1.h
> > -     for "large enough" initializer.  */
> > +  /* Check for interleaving case for "large enough" initializer.
> > +     Currently we handle following cases:
> > +     (a) Even part is dup and odd part is const.
> > +     (b) Odd part is dup and even part is const.
> > +     (c) Both even and odd parts are dup.  */
> >
> >    if (n_elts >= 8)
> >      {
> > -      int i;
> > -      for (i = 2; i < n_elts; i++)
> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -       break;
> > +      bool even_dup = false, even_const = false;
> > +      bool odd_dup = false, odd_const = false;
> > +
> > +      even_dup = aarch64_init_interleaving_dup_p (vals, 0);
> > +      if (!even_dup)
> > +     even_const = aarch64_init_interleaving_const_p (vals, 0);
> > +
> > +      odd_dup = aarch64_init_interleaving_dup_p (vals, 1);
> > +      if (!odd_dup)
> > +     odd_const = aarch64_init_interleaving_const_p (vals, 1);
> > +
> > +      /* This case should already be handled above when all elements are constants.  */
> > +      gcc_assert (!(even_const && odd_const));
> >
> > -      if (i == n_elts)
> > +      if (even_dup && odd_const)
> > +     {
> > +       rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 0));
> > +       dup_reg = force_reg (mode, dup_reg);
> > +
> > +       rtx const_reg = gen_reg_rtx (mode);
> > +       rtx const_vector = aarch64_init_interleaving_shift_init (vals, false);
> > +       aarch64_expand_vector_init (const_reg, const_vector);
> > +
> > +       rtvec v = gen_rtvec (2, dup_reg, const_reg);
> > +       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > +       return;
> > +     }
> > +      else if (odd_dup && even_const)
> > +     {
> > +       rtx dup_reg = expand_vector_broadcast (mode, XVECEXP (vals, 0, 1));
> > +       dup_reg = force_reg (mode, dup_reg);
> > +
> > +       rtx const_reg = gen_reg_rtx (mode);
> > +       rtx const_vector = aarch64_init_interleaving_shift_init (vals, true);
> > +       aarch64_expand_vector_init (const_reg, const_vector);
> > +
> > +       rtvec v = gen_rtvec (2, const_reg, dup_reg);
> > +       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > +       return;
> > +     }
> > +      else if (even_dup && odd_dup)
> >       {
> >         machine_mode mode = GET_MODE (target);
> >         rtx dest[2];
> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
> > new file mode 100644
> > index 00000000000..3ad06c00451
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-2.c
> > @@ -0,0 +1,51 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include "arm_neon.h"
> > +
> > +/*
> > +**foo:
> > +**   ...
> > +**   dup     v[0-9]+\.8h, w[0-9]+
> > +**   adrp    x[0-9]+, .LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +*/
> > +
> > +int16x8_t foo(int16_t x)
> > +{
> > +  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
> > +}
> > +
> > +
> > +/*
> > +**foo2:
> > +**   ...
> > +**   dup     v[0-9]+\.8h, w[0-9]+
> > +**   adrp    x[0-9]+, .LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +*/
> > +
> > +int16x8_t foo2(int16_t x)
> > +{
> > +  return (int16x8_t) { 1, x, 2, x, 3, x, 4, x };
> > +}
> > +
> > +/*
> > +**foo3:
> > +**   ...
> > +**   dup     v[0-9]+\.8h, v[0-9]+\.h\[0\]
> > +**   adrp    x[0-9]+, .LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +*/
> > +
> > +float16x8_t foo3(float16_t x)
> > +{
> > +  return (float16x8_t) { x, 1.0, x, 2.0, x, 3.0, x, 4.0 };
> > +}
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 17c1e23e5b5..0090fb47d98 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15065,6 +15065,11 @@ cost_plus:
       return false;
 
     case UNSPEC:
+      /* FIXME: What cost to use for zip1 ?
+	 Currently using default cost.  */
+      if (XINT (x, 1) == UNSPEC_ZIP1)
+	break;
+
       /* The floating point round to integer frint* instructions.  */
       if (aarch64_frint_unspec_p (XINT (x, 1)))
         {
@@ -21972,11 +21977,44 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* The algorithm will fill matches[*][0] with the earliest matching element,
+   and matches[X][1] with the count of duplicate elements (if X is the
+   earliest element which has duplicates).  */
+
+static void
+aarch64_expand_vector_init_get_most_repeating_elem (rtx vals, int n,
+						    int (*matches)[2],
+						    int &maxv, int &maxelement)
+{
+  memset (matches, 0, 16 * 2 * sizeof (int));
+  for (int i = 0; i < n; i++)
+    {
+      for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+	    {
+	      matches[i][0] = j;
+	      matches[j][1]++;
+	      break;
+	    }
+	}
+    }
+  
+  maxelement = 0;
+  maxv = 0;
+  for (int i = 0; i < n; i++)
+    if (matches[i][1] > maxv)
+      {
+	maxelement = i;
+	maxv = matches[i][1];
+      }
+}
+
 /* Expand a vector initialisation sequence, such that TARGET is
    initialised to contain VALS.  */
 
-void
-aarch64_expand_vector_init (rtx target, rtx vals)
+static void
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22036,38 +22074,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
-
-  if (n_elts >= 8)
-    {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
-
-      if (i == n_elts)
-	{
-	  machine_mode mode = GET_MODE (target);
-	  rtx dest[2];
-
-	  for (int i = 0; i < 2; i++)
-	    {
-	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
-	      dest[i] = force_reg (mode, x);
-	    }
-
-	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
-	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
-	  return;
-	}
-    }
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22075,33 +22081,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
      the insertion using dup for the most common element
      followed by insertions.  */
 
-  /* The algorithm will fill matches[*][0] with the earliest matching element,
-     and matches[X][1] with the count of duplicate elements (if X is the
-     earliest element which has duplicates).  */
 
   if (n_var == n_elts && n_elts <= 16)
     {
-      int matches[16][2] = {0};
-      for (int i = 0; i < n_elts; i++)
-	{
-	  for (int j = 0; j <= i; j++)
-	    {
-	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
-		{
-		  matches[i][0] = j;
-		  matches[j][1]++;
-		  break;
-		}
-	    }
-	}
-      int maxelement = 0;
-      int maxv = 0;
-      for (int i = 0; i < n_elts; i++)
-	if (matches[i][1] > maxv)
-	  {
-	    maxelement = i;
-	    maxv = matches[i][1];
-	  }
+      int matches[16][2];
+      int maxelement, maxv;
+      aarch64_expand_vector_init_get_most_repeating_elem (vals, n_elts,
+      							  matches,
+							  maxv,
+							  maxelement);
 
       /* Create a duplicate of the most common element, unless all elements
 	 are equally useless to us, in which case just immediately set the
@@ -22189,7 +22177,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22203,6 +22191,126 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+/* Function to pad elements in VALS as described in the comment
+   for aarch64_expand_vector_init_split_vals.  */
+
+static rtx
+aarch64_expand_vector_init_get_padded_elem (rtx vals, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      rtx elem = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (elem) || CONST_DOUBLE_P (elem))
+	return elem;
+    }
+
+  int matches[16][2];
+  int maxv, maxelement;
+  aarch64_expand_vector_init_get_most_repeating_elem (vals, n, matches, maxv, maxelement);
+  return XVECEXP (vals, 0, maxelement);
+}
+
+/*
+Split VALS into its even or odd half; however, since the mode remains
+the same, we have to pad with extra elements to fill the vector length.
+The function uses a couple of heuristics for padding:
+(1) If the split portion contains a constant, pad the vector with
+    that constant element.
+    For example, if the split portion is {x, 1, 2, 3} and the mode is V8HI,
+    then the result is {x, 1, 2, 3, 1, 1, 1, 1}.
+(2) If the split portion consists entirely of variables, then use the
+    most frequently repeating variable as the padding element.
+    For example, if the split portion is {x, x, x, y} and the mode is V8HI,
+    then the result is {x, x, x, y, x, x, x, x}.
+    We use the most frequently repeating variable so that dup will
+    initialize most of the vector and then use ins to insert the
+    remaining ones, which is done in aarch64_expand_vector_init_fallback.
+*/
+
+static rtx
+aarch64_expand_vector_init_split_vals (rtx vals, bool even_p)
+{
+  rtx new_vals = copy_rtx (vals);
+  int n = XVECLEN (vals, 0);
+  int i;
+  for (i = 0; i < n / 2; i++)
+    XVECEXP (new_vals, 0, i)
+      = XVECEXP (new_vals, 0, (even_p) ? 2 * i : 2 * i + 1);
+
+  rtx padded_val
+    = aarch64_expand_vector_init_get_padded_elem (new_vals, n / 2); 
+  for (; i < n; i++)
+    XVECEXP (new_vals, 0, i) = padded_val;
+  return new_vals;
+}
+
+DEBUG_FUNCTION
+static void
+aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
+{
+  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
+  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
+    {
+      debug_rtx (PATTERN (i));
+      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
+    }
+}
+
+/*
+The function does the following:
+(a) Generate a code sequence by splitting VALS into even and odd halves,
+    recursively calling itself to initialize them, and then merging them
+    using zip1.
+(b) Generate a code sequence directly using aarch64_expand_vector_init_fallback.
+(c) Compare the costs of the code sequences generated by (a) and (b), and
+    choose the more efficient one.
+*/
+
+void
+aarch64_expand_vector_init_1 (rtx target, rtx vals, int n_elts)
+{
+  if (n_elts < 8)
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  machine_mode mode = GET_MODE (target);
+
+  start_sequence ();
+  rtx dest[2];
+  for (int i = 0; i < 2; i++)
+    {
+      dest[i] = gen_reg_rtx (mode);
+      rtx new_vals
+	= aarch64_expand_vector_init_split_vals (vals, (i % 2) == 0);
+      aarch64_expand_vector_init_1 (dest[i], new_vals, n_elts / 2);
+    }
+
+  rtvec v = gen_rtvec (2, dest[0], dest[1]);
+  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  
+  rtx_insn *seq = get_insns (); 
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns (); 
+  end_sequence ();
+
+  emit_insn (seq_cost (seq, !optimize_size)
+	     < seq_cost (fallback_seq, !optimize_size)
+	     ? seq : fallback_seq);
+}
+
+/* Wrapper around aarch64_expand_vector_init_1.  */
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  aarch64_expand_vector_init_1 (target, vals, XVECLEN (vals, 0));
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 100%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..d204c7e1f8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.16b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..c2c97469940
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.16b, w[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22.c
new file mode 100644
index 00000000000..e5016a47a3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford Feb. 1, 2023, 4:26 p.m. UTC | #9
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
>> > <prathamesh.kulkarni@linaro.org> wrote:
>> >>
>> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
>> >> <richard.sandiford@arm.com> wrote:
>> >> >
>> >> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>> >> > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > >> Hi,
>> >> > >> For the following test-case:
>> >> > >>
>> >> > >> int16x8_t foo(int16_t x, int16_t y)
>> >> > >> {
>> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
>> >> > >> }
>> >> > >>
>> >> > >> Code gen at -O3:
>> >> > >> foo:
>> >> > >>         dup    v0.8h, w0
>> >> > >>         ins     v0.h[1], w1
>> >> > >>         ins     v0.h[3], w1
>> >> > >>         ins     v0.h[5], w1
>> >> > >>         ins     v0.h[7], w1
>> >> > >>         ret
>> >> > >>
>> >> > >> For 16 elements, it results in 8 ins instructions which might not be
>> >> > >> optimal perhaps.
>> >> > >> I guess, the above code-gen would be equivalent to the following ?
>> >> > >> dup v0.8h, w0
>> >> > >> dup v1.8h, w1
>> >> > >> zip1 v0.8h, v0.8h, v1.8h
>> >> > >>
>> >> > >> I have attached patch to do the same, if number of elements >= 8,
>> >> > >> which should be possibly better compared to current code-gen ?
>> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
>> >> > >> Does the patch look OK ?
>> >> > >>
>> >> > >> Thanks,
>> >> > >> Prathamesh
>> >> > >>
>> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> >> > >> index c91df6f5006..e5dea70e363 100644
>> >> > >> --- a/gcc/config/aarch64/aarch64.cc
>> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
>> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>> >> > >>        return;
>> >> > >>      }
>> >> > >>
>> >> > >> +  /* Check for interleaving case.
>> >> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> >> > >> +     Generate following code:
>> >> > >> +     dup v0.h, x
>> >> > >> +     dup v1.h, y
>> >> > >> +     zip1 v0.h, v0.h, v1.h
>> >> > >> +     for "large enough" initializer.  */
>> >> > >> +
>> >> > >> +  if (n_elts >= 8)
>> >> > >> +    {
>> >> > >> +      int i;
>> >> > >> +      for (i = 2; i < n_elts; i++)
>> >> > >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> >> > >> +      break;
>> >> > >> +
>> >> > >> +      if (i == n_elts)
>> >> > >> +    {
>> >> > >> +      machine_mode mode = GET_MODE (target);
>> >> > >> +      rtx dest[2];
>> >> > >> +
>> >> > >> +      for (int i = 0; i < 2; i++)
>> >> > >> +        {
>> >> > >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
>> >> > >
>> >> > > Formatting nit: long line.
>> >> > >
>> >> > >> +          dest[i] = gen_reg_rtx (mode);
>> >> > >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
>> >> > >> +        }
>> >> > >
>> >> > > This could probably be written:
>> >> > >
>> >> > >         for (int i = 0; i < 2; i++)
>> >> > >           {
>> >> > >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
>> >> > >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
>> >> >
>> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
>> >> Thanks, I have pushed the change in
>> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
>> >> bootstrap+test on aarch64-linux-gnu.
>> > Hi Richard,
>> > I have attached a patch that extends the transform if one half is dup
>> > and other is set of constants.
>> > For eg:
>> > int8x16_t f(int8_t x)
>> > {
>> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> > }
>> >
>> > code-gen trunk:
>> > f:
>> >         adrp    x1, .LC0
>> >         ldr     q0, [x1, #:lo12:.LC0]
>> >         ins     v0.b[0], w0
>> >         ins     v0.b[2], w0
>> >         ins     v0.b[4], w0
>> >         ins     v0.b[6], w0
>> >         ins     v0.b[8], w0
>> >         ins     v0.b[10], w0
>> >         ins     v0.b[12], w0
>> >         ins     v0.b[14], w0
>> >         ret
>> >
>> > code-gen with patch:
>> > f:
>> >         dup     v0.16b, w0
>> >         adrp    x0, .LC0
>> >         ldr     q1, [x0, #:lo12:.LC0]
>> >         zip1    v0.16b, v0.16b, v1.16b
>> >         ret
>> >
>> > Bootstrapped+tested on aarch64-linux-gnu.
>> > Does it look OK ?
>>
>> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>>
>> However, rather than handle this case specially, I think we should instead
>> take a divide-and-conquer approach: split the initialiser into even and
>> odd elements, find the best way of loading each part, then compare the
>> cost of these sequences + ZIP with the cost of the fallback code (the code
>> later in aarch64_expand_vector_init).
>>
>> For example, doing that would allow:
>>
>>   { x, y, 0, y, 0, y, 0, y, 0, y }
>>
>> to be loaded more easily, even though the even elements aren't wholly
>> constant.
> Hi Richard,
> I have attached a prototype patch based on the above approach.
> It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> same sequence, thus I removed that hunk, and improves the following cases:
>
> (a)
> int8x16_t f_s16(int8_t x)
> {
>   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>                                  x, 5, x, 6, x, 7, x, 8 };
> }
>
> code-gen trunk:
> f_s16:
>         adrp    x1, .LC0
>         ldr     q0, [x1, #:lo12:.LC0]
>         ins     v0.b[0], w0
>         ins     v0.b[2], w0
>         ins     v0.b[4], w0
>         ins     v0.b[6], w0
>         ins     v0.b[8], w0
>         ins     v0.b[10], w0
>         ins     v0.b[12], w0
>         ins     v0.b[14], w0
>         ret
>
> code-gen with patch:
> f_s16:
>         dup     v0.16b, w0
>         adrp    x0, .LC0
>         ldr     q1, [x0, #:lo12:.LC0]
>         zip1    v0.16b, v0.16b, v1.16b
>         ret
>
> (b)
> int8x16_t f_s16(int8_t x, int8_t y)
> {
>   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>                                 4, y, 5, y, 6, y, 7, y };
> }
>
> code-gen trunk:
> f_s16:
>         adrp    x2, .LC0
>         ldr     q0, [x2, #:lo12:.LC0]
>         ins     v0.b[0], w0
>         ins     v0.b[1], w1
>         ins     v0.b[3], w1
>         ins     v0.b[5], w1
>         ins     v0.b[7], w1
>         ins     v0.b[9], w1
>         ins     v0.b[11], w1
>         ins     v0.b[13], w1
>         ins     v0.b[15], w1
>         ret
>
> code-gen patch:
> f_s16:
>         adrp    x2, .LC0
>         dup     v1.16b, w1
>         ldr     q0, [x2, #:lo12:.LC0]
>         ins     v0.b[0], w0
>         zip1    v0.16b, v0.16b, v1.16b
>         ret

Nice.

> There are a couple of issues I have come across:
> (1) Choosing element to pad vector.
> For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> with mode V8HI.
> We split it into { x, 0, 1, 2 } and { y, y, y, y}
> However since the mode is V8HI, we would need to pad the above split vectors
> with 4 more elements to match up to vector length.
> For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> using 'y' is the obvious choice thus making them:
> {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> These would be then merged using zip1 which would discard the lower half
> of both vectors.
> Currently I encoded the above two heuristics in
> aarch64_expand_vector_init_get_padded_elem:
> (a) If split portion contains a constant, use the constant to pad the vector.
> (b) If split portion only contains variables, then use the most
> frequently repeating variable
> to pad the vector.
> I suppose tho this could be improved ?

I think we should just build two 64-bit vectors (V4HIs) and use a subreg
to fill the upper elements with undefined values.
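
Untested sketch of the kind of thing I mean, reusing routines that
already exist in aarch64.cc (half_vals here is just a stand-in for the
4-element PARALLEL holding the even or odd elements):

  /* Build one half in the 64-bit container mode...  */
  machine_mode half_mode
    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
  rtx half_reg = gen_reg_rtx (half_mode);
  aarch64_expand_vector_init (half_reg, half_vals);
  /* ...and widen it with a subreg.  The upper 64 bits are simply
     undefined, which is fine because zip1 only reads the low halves.  */
  rtx widened = gen_rtx_SUBREG (mode, half_reg, 0);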

I suppose in principle we would have the same problem when splitting
a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
on that for now.  Eventually it would be worth adding full support for
32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
but it's quite a big task.  The 128-bit to 64-bit split is the one that
matters most.

> (2) Setting cost for zip1:
> Currently it returns 4 as cost for following zip1 insn:
> (set (reg:V8HI 102)
>     (unspec:V8HI [
>             (reg:V8HI 103)
>             (reg:V8HI 108)
>         ] UNSPEC_ZIP1))
> I am not sure if that's correct, or if not, what cost to use in this case
> for zip1 ?

TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
inserts are probably underestimated to the same extent, so hopefully
things work out.

So it's probably best to accept the costs as they're currently given.
Changing them would need extensive testing.

However, one of the advantages of the split is that it allows the
subvectors to be built in parallel.  When optimising for speed,
it might make sense to take the maximum of the subsequence costs
and add the cost of the zip to that.
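
In rough pseudo-code (costs[0]/costs[1] being the costs of the even and
odd subsequences and zip1_insn the final zip -- illustrative names only):

  /* The two halves can be built in parallel when optimising for speed,
     so charge the more expensive half plus the zip rather than the sum.  */
  unsigned total_cost
    = optimize_size ? costs[0] + costs[1] : std::max (costs[0], costs[1]);
  total_cost += insn_cost (zip1_insn, !optimize_size);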

Thanks,
Richard
  
Prathamesh Kulkarni Feb. 2, 2023, 2:51 p.m. UTC | #10
On Wed, 1 Feb 2023 at 21:56, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Thu, 12 Jan 2023 at 21:21, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > On Tue, 6 Dec 2022 at 07:01, Prathamesh Kulkarni
> >> > <prathamesh.kulkarni@linaro.org> wrote:
> >> >>
> >> >> On Mon, 5 Dec 2022 at 16:50, Richard Sandiford
> >> >> <richard.sandiford@arm.com> wrote:
> >> >> >
> >> >> > Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> >> > > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> > >> Hi,
> >> >> > >> For the following test-case:
> >> >> > >>
> >> >> > >> int16x8_t foo(int16_t x, int16_t y)
> >> >> > >> {
> >> >> > >>   return (int16x8_t) { x, y, x, y, x, y, x, y };
> >> >> > >> }
> >> >> > >>
> >> >> > >> Code gen at -O3:
> >> >> > >> foo:
> >> >> > >>         dup    v0.8h, w0
> >> >> > >>         ins     v0.h[1], w1
> >> >> > >>         ins     v0.h[3], w1
> >> >> > >>         ins     v0.h[5], w1
> >> >> > >>         ins     v0.h[7], w1
> >> >> > >>         ret
> >> >> > >>
> >> >> > >> For 16 elements, it results in 8 ins instructions which might not be
> >> >> > >> optimal perhaps.
> >> >> > >> I guess, the above code-gen would be equivalent to the following ?
> >> >> > >> dup v0.8h, w0
> >> >> > >> dup v1.8h, w1
> >> >> > >> zip1 v0.8h, v0.8h, v1.8h
> >> >> > >>
> >> >> > >> I have attached patch to do the same, if number of elements >= 8,
> >> >> > >> which should be possibly better compared to current code-gen ?
> >> >> > >> Patch passes bootstrap+test on aarch64-linux-gnu.
> >> >> > >> Does the patch look OK ?
> >> >> > >>
> >> >> > >> Thanks,
> >> >> > >> Prathamesh
> >> >> > >>
> >> >> > >> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> >> > >> index c91df6f5006..e5dea70e363 100644
> >> >> > >> --- a/gcc/config/aarch64/aarch64.cc
> >> >> > >> +++ b/gcc/config/aarch64/aarch64.cc
> >> >> > >> @@ -22028,6 +22028,39 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> >> > >>        return;
> >> >> > >>      }
> >> >> > >>
> >> >> > >> +  /* Check for interleaving case.
> >> >> > >> +     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> >> > >> +     Generate following code:
> >> >> > >> +     dup v0.h, x
> >> >> > >> +     dup v1.h, y
> >> >> > >> +     zip1 v0.h, v0.h, v1.h
> >> >> > >> +     for "large enough" initializer.  */
> >> >> > >> +
> >> >> > >> +  if (n_elts >= 8)
> >> >> > >> +    {
> >> >> > >> +      int i;
> >> >> > >> +      for (i = 2; i < n_elts; i++)
> >> >> > >> +    if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> >> > >> +      break;
> >> >> > >> +
> >> >> > >> +      if (i == n_elts)
> >> >> > >> +    {
> >> >> > >> +      machine_mode mode = GET_MODE (target);
> >> >> > >> +      rtx dest[2];
> >> >> > >> +
> >> >> > >> +      for (int i = 0; i < 2; i++)
> >> >> > >> +        {
> >> >> > >> +          rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> >> >> > >
> >> >> > > Formatting nit: long line.
> >> >> > >
> >> >> > >> +          dest[i] = gen_reg_rtx (mode);
> >> >> > >> +          aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
> >> >> > >> +        }
> >> >> > >
> >> >> > > This could probably be written:
> >> >> > >
> >> >> > >         for (int i = 0; i < 2; i++)
> >> >> > >           {
> >> >> > >             rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> >> >> > >             dest[i] = force_reg (GET_MODE_INNER (mode), x);
> >> >> >
> >> >> > Oops, I meant "mode" rather than "GET_MODE_INNER (mode)", sorry.
> >> >> Thanks, I have pushed the change in
> >> >> 769370f3e2e04823c8a621d8ffa756dd83ebf21e after running
> >> >> bootstrap+test on aarch64-linux-gnu.
> >> > Hi Richard,
> >> > I have attached a patch that extends the transform if one half is dup
> >> > and other is set of constants.
> >> > For eg:
> >> > int8x16_t f(int8_t x)
> >> > {
> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f:
> >> >         adrp    x1, .LC0
> >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> >         ins     v0.b[0], w0
> >> >         ins     v0.b[2], w0
> >> >         ins     v0.b[4], w0
> >> >         ins     v0.b[6], w0
> >> >         ins     v0.b[8], w0
> >> >         ins     v0.b[10], w0
> >> >         ins     v0.b[12], w0
> >> >         ins     v0.b[14], w0
> >> >         ret
> >> >
> >> > code-gen with patch:
> >> > f:
> >> >         dup     v0.16b, w0
> >> >         adrp    x0, .LC0
> >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >         ret
> >> >
> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> > Does it look OK ?
> >>
> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> >>
> >> However, rather than handle this case specially, I think we should instead
> >> take a divide-and-conquer approach: split the initialiser into even and
> >> odd elements, find the best way of loading each part, then compare the
> >> cost of these sequences + ZIP with the cost of the fallback code (the code
> >> later in aarch64_expand_vector_init).
> >>
> >> For example, doing that would allow:
> >>
> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >>
> >> to be loaded more easily, even though the even elements aren't wholly
> >> constant.
> > Hi Richard,
> > I have attached a prototype patch based on the above approach.
> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> > same sequence, thus I removed that hunk, and improves the following cases:
> >
> > (a)
> > int8x16_t f_s16(int8_t x)
> > {
> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >                                  x, 5, x, 6, x, 7, x, 8 };
> > }
> >
> > code-gen trunk:
> > f_s16:
> >         adrp    x1, .LC0
> >         ldr     q0, [x1, #:lo12:.LC0]
> >         ins     v0.b[0], w0
> >         ins     v0.b[2], w0
> >         ins     v0.b[4], w0
> >         ins     v0.b[6], w0
> >         ins     v0.b[8], w0
> >         ins     v0.b[10], w0
> >         ins     v0.b[12], w0
> >         ins     v0.b[14], w0
> >         ret
> >
> > code-gen with patch:
> > f_s16:
> >         dup     v0.16b, w0
> >         adrp    x0, .LC0
> >         ldr     q1, [x0, #:lo12:.LC0]
> >         zip1    v0.16b, v0.16b, v1.16b
> >         ret
> >
> > (b)
> > int8x16_t f_s16(int8_t x, int8_t y)
> > {
> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >                                 4, y, 5, y, 6, y, 7, y };
> > }
> >
> > code-gen trunk:
> > f_s16:
> >         adrp    x2, .LC0
> >         ldr     q0, [x2, #:lo12:.LC0]
> >         ins     v0.b[0], w0
> >         ins     v0.b[1], w1
> >         ins     v0.b[3], w1
> >         ins     v0.b[5], w1
> >         ins     v0.b[7], w1
> >         ins     v0.b[9], w1
> >         ins     v0.b[11], w1
> >         ins     v0.b[13], w1
> >         ins     v0.b[15], w1
> >         ret
> >
> > code-gen patch:
> > f_s16:
> >         adrp    x2, .LC0
> >         dup     v1.16b, w1
> >         ldr     q0, [x2, #:lo12:.LC0]
> >         ins     v0.b[0], w0
> >         zip1    v0.16b, v0.16b, v1.16b
> >         ret
>
> Nice.
>
> > There are a couple of issues I have come across:
> > (1) Choosing element to pad vector.
> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> > with mode V8HI.
> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> > However since the mode is V8HI, we would need to pad the above split vectors
> > with 4 more elements to match up to vector length.
> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> > using 'y' is the obvious choice thus making them:
> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> > These would be then merged using zip1 which would discard the lower half
> > of both vectors.
> > Currently I encoded the above two heuristics in
> > aarch64_expand_vector_init_get_padded_elem:
> > (a) If split portion contains a constant, use the constant to pad the vector.
> > (b) If split portion only contains variables, then use the most
> > frequently repeating variable
> > to pad the vector.
> > I suppose tho this could be improved ?
>
> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
> to fill the upper elements with undefined values.
>
> I suppose in principle we would have the same problem when splitting
> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
> on that for now.  Eventually it would be worth adding full support for
> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
> but it's quite a big task.  The 128-bit to 64-bit split is the one that
> matters most.
>
> > (2) Setting cost for zip1:
> > Currently it returns 4 as cost for following zip1 insn:
> > (set (reg:V8HI 102)
> >     (unspec:V8HI [
> >             (reg:V8HI 103)
> >             (reg:V8HI 108)
> >         ] UNSPEC_ZIP1))
> > I am not sure if that's correct, or if not, what cost to use in this case
> > for zip1 ?
>
> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
> inserts are probably underestimated to the same extent, so hopefully
> things work out.
>
> So it's probably best to accept the costs as they're currently given.
> Changing them would need extensive testing.
>
> However, one of the advantages of the split is that it allows the
> subvectors to be built in parallel.  When optimising for speed,
> it might make sense to take the maximum of the subsequence costs
> and add the cost of the zip to that.
Hi Richard,
Thanks for the suggestions.
In the attached patch, it recurses only if nelts == 16, to punt on the
64-bit -> 32-bit split, and uses
std::max(even_init, odd_init) + insn_cost (zip1_insn) to compute the
total cost of the sequence.

So, for following case:
int8x16_t f_s8(int8_t x)
{
  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
                                x, 5, x, 6, x, 7, x, 8 };
}

it now generates:
f_s16:
        dup     v0.8b, w0
        adrp    x0, .LC0
        ldr       d1, [x0, #:lo12:.LC0]
        zip1    v0.16b, v0.16b, v1.16b
        ret

Which I assume is correct, since zip1 will merge the lower halves of
two vectors while leaving the upper halves undefined ?

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acc0cfe5f94..a527c48e916 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
    initialised to contain VALS.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22189,7 +22189,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22203,6 +22203,89 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+DEBUG_FUNCTION
+static void
+aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
+{
+  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
+  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
+    {
+      debug_rtx (PATTERN (i));
+      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
+    }
+}
+
+static rtx
+aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n / 2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/*
+The function does the following:
+(a) Generate a code sequence by splitting VALS into even and odd halves,
+    recursively calling itself to initialize them, and then merging them
+    using zip1.
+(b) Generate a code sequence directly using aarch64_expand_vector_init_fallback.
+(c) Compare the costs of the code sequences generated by (a) and (b), and
+    choose the more efficient one.
+*/
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 16)
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx dest[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      dest[i] = gen_reg_rtx (mode);
+      rtx new_vals
+	= aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, dest[0], dest[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = std::max (costs[0], costs[1]) + insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  end_sequence ();
+
+  emit_insn (seq_total_cost < seq_cost (fallback_seq, !optimize_size)
+	     ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 100%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..e28fdcda29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..9366ca349b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22.c
new file mode 100644
index 00000000000..e5016a47a3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford Feb. 2, 2023, 3:20 p.m. UTC | #11
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > I have attached a patch that extends the transform if one half is dup
>> >> > and other is set of constants.
>> >> > For eg:
>> >> > int8x16_t f(int8_t x)
>> >> > {
>> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> >> > }
>> >> >
>> >> > code-gen trunk:
>> >> > f:
>> >> >         adrp    x1, .LC0
>> >> >         ldr     q0, [x1, #:lo12:.LC0]
>> >> >         ins     v0.b[0], w0
>> >> >         ins     v0.b[2], w0
>> >> >         ins     v0.b[4], w0
>> >> >         ins     v0.b[6], w0
>> >> >         ins     v0.b[8], w0
>> >> >         ins     v0.b[10], w0
>> >> >         ins     v0.b[12], w0
>> >> >         ins     v0.b[14], w0
>> >> >         ret
>> >> >
>> >> > code-gen with patch:
>> >> > f:
>> >> >         dup     v0.16b, w0
>> >> >         adrp    x0, .LC0
>> >> >         ldr     q1, [x0, #:lo12:.LC0]
>> >> >         zip1    v0.16b, v0.16b, v1.16b
>> >> >         ret
>> >> >
>> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> >> > Does it look OK ?
>> >>
>> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>> >>
>> >> However, rather than handle this case specially, I think we should instead
>> >> take a divide-and-conquer approach: split the initialiser into even and
>> >> odd elements, find the best way of loading each part, then compare the
>> >> cost of these sequences + ZIP with the cost of the fallback code (the code
>> >> later in aarch64_expand_vector_init).
>> >>
>> >> For example, doing that would allow:
>> >>
>> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> >>
>> >> to be loaded more easily, even though the even elements aren't wholly
>> >> constant.
>> > Hi Richard,
>> > I have attached a prototype patch based on the above approach.
>> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
>> > same sequence, thus I removed that hunk, and improves the following cases:
>> >
>> > (a)
>> > int8x16_t f_s16(int8_t x)
>> > {
>> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> >                                  x, 5, x, 6, x, 7, x, 8 };
>> > }
>> >
>> > code-gen trunk:
>> > f_s16:
>> >         adrp    x1, .LC0
>> >         ldr     q0, [x1, #:lo12:.LC0]
>> >         ins     v0.b[0], w0
>> >         ins     v0.b[2], w0
>> >         ins     v0.b[4], w0
>> >         ins     v0.b[6], w0
>> >         ins     v0.b[8], w0
>> >         ins     v0.b[10], w0
>> >         ins     v0.b[12], w0
>> >         ins     v0.b[14], w0
>> >         ret
>> >
>> > code-gen with patch:
>> > f_s16:
>> >         dup     v0.16b, w0
>> >         adrp    x0, .LC0
>> >         ldr     q1, [x0, #:lo12:.LC0]
>> >         zip1    v0.16b, v0.16b, v1.16b
>> >         ret
>> >
>> > (b)
>> > int8x16_t f_s16(int8_t x, int8_t y)
>> > {
>> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> >                                 4, y, 5, y, 6, y, 7, y };
>> > }
>> >
>> > code-gen trunk:
>> > f_s16:
>> >         adrp    x2, .LC0
>> >         ldr     q0, [x2, #:lo12:.LC0]
>> >         ins     v0.b[0], w0
>> >         ins     v0.b[1], w1
>> >         ins     v0.b[3], w1
>> >         ins     v0.b[5], w1
>> >         ins     v0.b[7], w1
>> >         ins     v0.b[9], w1
>> >         ins     v0.b[11], w1
>> >         ins     v0.b[13], w1
>> >         ins     v0.b[15], w1
>> >         ret
>> >
>> > code-gen patch:
>> > f_s16:
>> >         adrp    x2, .LC0
>> >         dup     v1.16b, w1
>> >         ldr     q0, [x2, #:lo12:.LC0]
>> >         ins     v0.b[0], w0
>> >         zip1    v0.16b, v0.16b, v1.16b
>> >         ret
>>
>> Nice.
>>
>> > There are a couple of issues I have come across:
>> > (1) Choosing element to pad vector.
>> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
>> > with mode V8HI.
>> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
>> > However since the mode is V8HI, we would need to pad the above split vectors
>> > with 4 more elements to match up to vector length.
>> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
>> > using 'y' is the obvious choice thus making them:
>> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
>> > These would be then merged using zip1 which would discard the lower half
>> > of both vectors.
>> > Currently I encoded the above two heuristics in
>> > aarch64_expand_vector_init_get_padded_elem:
>> > (a) If split portion contains a constant, use the constant to pad the vector.
>> > (b) If split portion only contains variables, then use the most
>> > frequently repeating variable
>> > to pad the vector.
>> > I suppose tho this could be improved ?
>>
>> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
>> to fill the upper elements with undefined values.
>>
>> I suppose in principle we would have the same problem when splitting
>> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
>> on that for now.  Eventually it would be worth adding full support for
>> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
>> but it's quite a big task.  The 128-bit to 64-bit split is the one that
>> matters most.
>>
>> > (2) Setting cost for zip1:
>> > Currently it returns 4 as cost for following zip1 insn:
>> > (set (reg:V8HI 102)
>> >     (unspec:V8HI [
>> >             (reg:V8HI 103)
>> >             (reg:V8HI 108)
>> >         ] UNSPEC_ZIP1))
>> > I am not sure if that's correct, or if not, what cost to use in this case
>> > for zip1 ?
>>
>> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
>> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
>> inserts are probably underestimated to the same extent, so hopefully
>> things work out.
>>
>> So it's probably best to accept the costs as they're currently given.
>> Changing them would need extensive testing.
>>
>> However, one of the advantages of the split is that it allows the
>> subvectors to be built in parallel.  When optimising for speed,
>> it might make sense to take the maximum of the subsequence costs
>> and add the cost of the zip to that.
> Hi Richard,
> Thanks for the suggestions.
> In the attached patch, it recurses only if nelts == 16 to punt for 64
> -> 32 bit split,

It should be based on the size rather than the number of elements.
The example we talked about above involved building V8HIs from two
V4HIs, which is also valid.

> and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
> computing total cost of the sequence.
>
> So, for following case:
> int8x16_t f_s8(int8_t x)
> {
>   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>                                 x, 5, x, 6, x, 7, x, 8 };
> }
>
> it now generates:
> f_s16:
>         dup     v0.8b, w0
>         adrp    x0, .LC0
>         ldr       d1, [x0, #:lo12:.LC0]
>         zip1    v0.16b, v0.16b, v1.16b
>         ret
>
> Which I assume is correct, since zip1 will merge the lower halves of
> two vectors while leaving the upper halves undefined ?

Yeah, it looks valid, but I would say that zip1 ignores the upper halves
(rather than leaving them undefined).

Thanks,
Richard
  
Prathamesh Kulkarni Feb. 3, 2023, 1:40 a.m. UTC | #12
On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> > I have attached a patch that extends the transform if one half is dup
> >> >> > and other is set of constants.
> >> >> > For eg:
> >> >> > int8x16_t f(int8_t x)
> >> >> > {
> >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> >> >> > }
> >> >> >
> >> >> > code-gen trunk:
> >> >> > f:
> >> >> >         adrp    x1, .LC0
> >> >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> >> >         ins     v0.b[0], w0
> >> >> >         ins     v0.b[2], w0
> >> >> >         ins     v0.b[4], w0
> >> >> >         ins     v0.b[6], w0
> >> >> >         ins     v0.b[8], w0
> >> >> >         ins     v0.b[10], w0
> >> >> >         ins     v0.b[12], w0
> >> >> >         ins     v0.b[14], w0
> >> >> >         ret
> >> >> >
> >> >> > code-gen with patch:
> >> >> > f:
> >> >> >         dup     v0.16b, w0
> >> >> >         adrp    x0, .LC0
> >> >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >> >         ret
> >> >> >
> >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> >> > Does it look OK ?
> >> >>
> >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> >> >>
> >> >> However, rather than handle this case specially, I think we should instead
> >> >> take a divide-and-conquer approach: split the initialiser into even and
> >> >> odd elements, find the best way of loading each part, then compare the
> >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
> >> >> later in aarch64_expand_vector_init).
> >> >>
> >> >> For example, doing that would allow:
> >> >>
> >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> >>
> >> >> to be loaded more easily, even though the even elements aren't wholly
> >> >> constant.
> >> > Hi Richard,
> >> > I have attached a prototype patch based on the above approach.
> >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> >> > same sequence, thus I removed that hunk, and improves the following cases:
> >> >
> >> > (a)
> >> > int8x16_t f_s16(int8_t x)
> >> > {
> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> >                                  x, 5, x, 6, x, 7, x, 8 };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f_s16:
> >> >         adrp    x1, .LC0
> >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> >         ins     v0.b[0], w0
> >> >         ins     v0.b[2], w0
> >> >         ins     v0.b[4], w0
> >> >         ins     v0.b[6], w0
> >> >         ins     v0.b[8], w0
> >> >         ins     v0.b[10], w0
> >> >         ins     v0.b[12], w0
> >> >         ins     v0.b[14], w0
> >> >         ret
> >> >
> >> > code-gen with patch:
> >> > f_s16:
> >> >         dup     v0.16b, w0
> >> >         adrp    x0, .LC0
> >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >         ret
> >> >
> >> > (b)
> >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> > {
> >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> >                                 4, y, 5, y, 6, y, 7, y };
> >> > }
> >> >
> >> > code-gen trunk:
> >> > f_s16:
> >> >         adrp    x2, .LC0
> >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> >         ins     v0.b[0], w0
> >> >         ins     v0.b[1], w1
> >> >         ins     v0.b[3], w1
> >> >         ins     v0.b[5], w1
> >> >         ins     v0.b[7], w1
> >> >         ins     v0.b[9], w1
> >> >         ins     v0.b[11], w1
> >> >         ins     v0.b[13], w1
> >> >         ins     v0.b[15], w1
> >> >         ret
> >> >
> >> > code-gen patch:
> >> > f_s16:
> >> >         adrp    x2, .LC0
> >> >         dup     v1.16b, w1
> >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> >         ins     v0.b[0], w0
> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >         ret
> >>
> >> Nice.
> >>
> >> > There are a couple of issues I have come across:
> >> > (1) Choosing element to pad vector.
> >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> >> > with mode V8HI.
> >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> >> > However since the mode is V8HI, we would need to pad the above split vectors
> >> > with 4 more elements to match up to vector length.
> >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> >> > using 'y' is the obvious choice thus making them:
> >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> >> > These would be then merged using zip1 which would discard the lower half
> >> > of both vectors.
> >> > Currently I encoded the above two heuristics in
> >> > aarch64_expand_vector_init_get_padded_elem:
> >> > (a) If split portion contains a constant, use the constant to pad the vector.
> >> > (b) If split portion only contains variables, then use the most
> >> > frequently repeating variable
> >> > to pad the vector.
> >> > I suppose tho this could be improved ?
> >>
> >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
> >> to fill the upper elements with undefined values.
> >>
> >> I suppose in principle we would have the same problem when splitting
> >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
> >> on that for now.  Eventually it would be worth adding full support for
> >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
> >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
> >> matters most.
> >>
> >> > (2) Setting cost for zip1:
> >> > Currently it returns 4 as cost for following zip1 insn:
> >> > (set (reg:V8HI 102)
> >> >     (unspec:V8HI [
> >> >             (reg:V8HI 103)
> >> >             (reg:V8HI 108)
> >> >         ] UNSPEC_ZIP1))
> >> > I am not sure if that's correct, or if not, what cost to use in this case
> >> > for zip1 ?
> >>
> >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
> >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
> >> inserts are probably underestimated to the same extent, so hopefully
> >> things work out.
> >>
> >> So it's probably best to accept the costs as they're currently given.
> >> Changing them would need extensive testing.
> >>
> >> However, one of the advantages of the split is that it allows the
> >> subvectors to be built in parallel.  When optimising for speed,
> >> it might make sense to take the maximum of the subsequence costs
> >> and add the cost of the zip to that.
> > Hi Richard,
> > Thanks for the suggestions.
> > In the attached patch, it recurses only if nelts == 16 to punt for 64
> > -> 32 bit split,
>
> It should be based on the size rather than the number of elements.
> The example we talked about above involved building V8HIs from two
> V4HIs, which is also valid.
Right, sorry, I got mixed up. The attached patch punts if the vector size
is 64 bits by resorting to the fallback, which lets it handle V8HI cases.
For eg:
int16x8_t f(int16_t x)
{
  return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
}

code-gen with patch:
f:
        dup     v0.4h, w0
        adrp    x0, .LC0
        ldr       d1, [x0, #:lo12:.LC0]
        zip1    v0.8h, v0.8h, v1.8h
        ret

Just to clarify: we punt on a 64-bit vector size because there is no
32-bit vector mode available with which to build two 32-bit vectors for
the even and odd halves and then "extend" them with a subreg ?

It also punts if n_elts < 8, because I am not sure
if it's profitable to do recursion+merging for 4 or fewer elements.
Does it look OK ?
>
> > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
> > computing total cost of the sequence.
> >
> > So, for following case:
> > int8x16_t f_s8(int8_t x)
> > {
> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >                                 x, 5, x, 6, x, 7, x, 8 };
> > }
> >
> > it now generates:
> > f_s16:
> >         dup     v0.8b, w0
> >         adrp    x0, .LC0
> >         ldr       d1, [x0, #:lo12:.LC0]
> >         zip1    v0.16b, v0.16b, v1.16b
> >         ret
> >
> > Which I assume is correct, since zip1 will merge the lower halves of
> > two vectors while leaving the upper halves undefined ?
>
> Yeah, it looks valid, but I would say that zip1 ignores the upper halves
> (rather than leaving them undefined).
Yes, sorry for mis-phrasing.

For the following test:
int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
                          int16_t x4, int16_t x5, int16_t x6, int16_t x7)
{
  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
}

it chose to go down the recursive+zip1 route, since we take
max (cost (odd_init), cost (even_init)), add the cost of the zip1 insn,
and that turns out to be less than the cost of the fallback:

f_s16:
        sxth    w0, w0
        sxth    w1, w1
        fmov    d0, x0
        fmov    d1, x1
        ins     v0.h[1], w2
        ins     v1.h[1], w3
        ins     v0.h[2], w4
        ins     v1.h[2], w5
        ins     v0.h[3], w6
        ins     v1.h[3], w7
        zip1    v0.8h, v0.8h, v1.8h
        ret

I assume that's OK, since it has fewer dependencies compared to the
fallback code-gen, even if it's longer ?
With -Os the cost of the sequence is taken as cost(odd_init) +
cost(even_init) + cost(zip1_insn),
which turns out to be the same as the cost of the fallback sequence, so
it generates the fallback code-sequence:

f_s16:
        sxth    w0, w0
        fmov    s0, w0
        ins     v0.h[1], w1
        ins     v0.h[2], w2
        ins     v0.h[3], w3
        ins     v0.h[4], w4
        ins     v0.h[5], w5
        ins     v0.h[6], w6
        ins     v0.h[7], w7
        ret

Thanks,
Prathamesh
>
> Thanks,
> Richard
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acc0cfe5f94..4383e4e1d0c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
    initialised to contain VALS.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22189,7 +22189,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22203,6 +22203,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+DEBUG_FUNCTION
+static void
+aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
+{
+  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
+  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
+    {
+      debug_rtx (PATTERN (i));
+      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
+    }
+}
+
+static rtx
+aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n / 2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/*
+The function does the following:
+(a) Generate a code sequence by splitting VALS into even and odd halves,
+    recursively calling itself to initialize them, and then merging them
+    using zip1.
+(b) Generate a code sequence directly using aarch64_expand_vector_init_fallback.
+(c) Compare the costs of the code sequences generated by (a) and (b), and
+    choose the more efficient one.
+*/
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 8
+      || known_eq (GET_MODE_BITSIZE (mode), 64))
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx dest[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      dest[i] = gen_reg_rtx (mode);
+      rtx new_vals
+	= aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, dest[0], dest[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+  end_sequence ();
+
+  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 82%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index ee775048589..e812d3946de 100644
--- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -7,8 +7,8 @@
 /*
 ** foo:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	dup	v[0-9]+\.8h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
@@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
 /*
 ** foo2:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	movi	v[0-9]+\.8h, 0x1
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..e28fdcda29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..9366ca349b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
new file mode 100644
index 00000000000..8f35854c008
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
new file mode 100644
index 00000000000..172d56ffdf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that we recursively generate code for even and odd halves
+   instead of the fallback code.  This is preferred despite the longer
+   code-gen because it has fewer dependencies and thus a lower cost.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	sxth	w1, w1
+**	fmov	d0, x0
+**	fmov	d1, x1
+**	ins	v[0-9]+\.h\[1\], w2
+**	ins	v[0-9]+\.h\[1\], w3
+**	ins	v[0-9]+\.h\[2\], w4
+**	ins	v[0-9]+\.h\[2\], w5
+**	ins	v[0-9]+\.h\[3\], w6
+**	ins	v[0-9]+\.h\[3\], w7
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
new file mode 100644
index 00000000000..15b889d4097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
@@ -0,0 +1,7 @@
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Prathamesh Kulkarni Feb. 3, 2023, 3:02 a.m. UTC | #13
On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > >> >> > I have attached a patch that extends the transform if one half is dup
> > >> >> > and other is set of constants.
> > >> >> > For eg:
> > >> >> > int8x16_t f(int8_t x)
> > >> >> > {
> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> > >> >> > }
> > >> >> >
> > >> >> > code-gen trunk:
> > >> >> > f:
> > >> >> >         adrp    x1, .LC0
> > >> >> >         ldr     q0, [x1, #:lo12:.LC0]
> > >> >> >         ins     v0.b[0], w0
> > >> >> >         ins     v0.b[2], w0
> > >> >> >         ins     v0.b[4], w0
> > >> >> >         ins     v0.b[6], w0
> > >> >> >         ins     v0.b[8], w0
> > >> >> >         ins     v0.b[10], w0
> > >> >> >         ins     v0.b[12], w0
> > >> >> >         ins     v0.b[14], w0
> > >> >> >         ret
> > >> >> >
> > >> >> > code-gen with patch:
> > >> >> > f:
> > >> >> >         dup     v0.16b, w0
> > >> >> >         adrp    x0, .LC0
> > >> >> >         ldr     q1, [x0, #:lo12:.LC0]
> > >> >> >         zip1    v0.16b, v0.16b, v1.16b
> > >> >> >         ret
> > >> >> >
> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> > >> >> > Does it look OK ?
> > >> >>
> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> > >> >>
> > >> >> However, rather than handle this case specially, I think we should instead
> > >> >> take a divide-and-conquer approach: split the initialiser into even and
> > >> >> odd elements, find the best way of loading each part, then compare the
> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
> > >> >> later in aarch64_expand_vector_init).
> > >> >>
> > >> >> For example, doing that would allow:
> > >> >>
> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> > >> >>
> > >> >> to be loaded more easily, even though the even elements aren't wholly
> > >> >> constant.
> > >> > Hi Richard,
> > >> > I have attached a prototype patch based on the above approach.
> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> > >> > same sequence, thus I removed that hunk, and improves the following cases:
> > >> >
> > >> > (a)
> > >> > int8x16_t f_s16(int8_t x)
> > >> > {
> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > >> >                                  x, 5, x, 6, x, 7, x, 8 };
> > >> > }
> > >> >
> > >> > code-gen trunk:
> > >> > f_s16:
> > >> >         adrp    x1, .LC0
> > >> >         ldr     q0, [x1, #:lo12:.LC0]
> > >> >         ins     v0.b[0], w0
> > >> >         ins     v0.b[2], w0
> > >> >         ins     v0.b[4], w0
> > >> >         ins     v0.b[6], w0
> > >> >         ins     v0.b[8], w0
> > >> >         ins     v0.b[10], w0
> > >> >         ins     v0.b[12], w0
> > >> >         ins     v0.b[14], w0
> > >> >         ret
> > >> >
> > >> > code-gen with patch:
> > >> > f_s16:
> > >> >         dup     v0.16b, w0
> > >> >         adrp    x0, .LC0
> > >> >         ldr     q1, [x0, #:lo12:.LC0]
> > >> >         zip1    v0.16b, v0.16b, v1.16b
> > >> >         ret
> > >> >
> > >> > (b)
> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> > >> > {
> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> > >> >                                 4, y, 5, y, 6, y, 7, y };
> > >> > }
> > >> >
> > >> > code-gen trunk:
> > >> > f_s16:
> > >> >         adrp    x2, .LC0
> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> > >> >         ins     v0.b[0], w0
> > >> >         ins     v0.b[1], w1
> > >> >         ins     v0.b[3], w1
> > >> >         ins     v0.b[5], w1
> > >> >         ins     v0.b[7], w1
> > >> >         ins     v0.b[9], w1
> > >> >         ins     v0.b[11], w1
> > >> >         ins     v0.b[13], w1
> > >> >         ins     v0.b[15], w1
> > >> >         ret
> > >> >
> > >> > code-gen patch:
> > >> > f_s16:
> > >> >         adrp    x2, .LC0
> > >> >         dup     v1.16b, w1
> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> > >> >         ins     v0.b[0], w0
> > >> >         zip1    v0.16b, v0.16b, v1.16b
> > >> >         ret
> > >>
> > >> Nice.
> > >>
> > >> > There are a couple of issues I have come across:
> > >> > (1) Choosing element to pad vector.
> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> > >> > with mode V8HI.
> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> > >> > However since the mode is V8HI, we would need to pad the above split vectors
> > >> > with 4 more elements to match up to vector length.
> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> > >> > using 'y' is the obvious choice thus making them:
> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> > >> > These would be then merged using zip1 which would discard the lower half
> > >> > of both vectors.
> > >> > Currently I encoded the above two heuristics in
> > >> > aarch64_expand_vector_init_get_padded_elem:
> > >> > (a) If split portion contains a constant, use the constant to pad the vector.
> > >> > (b) If split portion only contains variables, then use the most
> > >> > frequently repeating variable
> > >> > to pad the vector.
> > >> > I suppose tho this could be improved ?
> > >>
> > >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
> > >> to fill the upper elements with undefined values.
> > >>
> > >> I suppose in principle we would have the same problem when splitting
> > >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
> > >> on that for now.  Eventually it would be worth adding full support for
> > >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
> > >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
> > >> matters most.
> > >>
> > >> > (2) Setting cost for zip1:
> > >> > Currently it returns 4 as cost for following zip1 insn:
> > >> > (set (reg:V8HI 102)
> > >> >     (unspec:V8HI [
> > >> >             (reg:V8HI 103)
> > >> >             (reg:V8HI 108)
> > >> >         ] UNSPEC_ZIP1))
> > >> > I am not sure if that's correct, or if not, what cost to use in this case
> > >> > for zip1 ?
> > >>
> > >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
> > >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
> > >> inserts are probably underestimated to the same extent, so hopefully
> > >> things work out.
> > >>
> > >> So it's probably best to accept the costs as they're currently given.
> > >> Changing them would need extensive testing.
> > >>
> > >> However, one of the advantages of the split is that it allows the
> > >> subvectors to be built in parallel.  When optimising for speed,
> > >> it might make sense to take the maximum of the subsequence costs
> > >> and add the cost of the zip to that.
> > > Hi Richard,
> > > Thanks for the suggestions.
> > > In the attached patch, it recurses only if nelts == 16 to punt for 64
> > > -> 32 bit split,
> >
> > It should be based on the size rather than the number of elements.
> > The example we talked about above involved building V8HIs from two
> > V4HIs, which is also valid.
> Right, sorry got mixed up. The attached patch punts if vector_size == 64 by
> resorting to fallback, which handles V8HI cases.
> For eg:
> int16x8_t f(int16_t x)
> {
>   return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
> }
>
> code-gen with patch:
> f:
>         dup     v0.4h, w0
>         adrp    x0, .LC0
>         ldr       d1, [x0, #:lo12:.LC0]
>         zip1    v0.8h, v0.8h, v1.8h
>         ret
>
> Just to clarify, we punt on 64 bit vector size, because there is no
> 32-bit vector available,
> to build 2 32-bit vectors for even and odd halves, and then "extend"
> them with subreg ?
>
> It also punts if n_elts < 8, because I am not sure
> if it's profitable to do recursion+merging for 4 or lesser elements.
> Does it look OK ?
> >
> > > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
> > > computing total cost of the sequence.
> > >
> > > So, for following case:
> > > int8x16_t f_s8(int8_t x)
> > > {
> > >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > >                                 x, 5, x, 6, x, 7, x, 8 };
> > > }
> > >
> > > it now generates:
> > > f_s16:
> > >         dup     v0.8b, w0
> > >         adrp    x0, .LC0
> > >         ldr       d1, [x0, #:lo12:.LC0]
> > >         zip1    v0.16b, v0.16b, v1.16b
> > >         ret
> > >
> > > Which I assume is correct, since zip1 will merge the lower halves of
> > > two vectors while leaving the upper halves undefined ?
> >
> > Yeah, it looks valid, but I would say that zip1 ignores the upper halves
> > (rather than leaving them undefined).
> Yes, sorry for mis-phrasing.
>
> For the following test:
> int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
>                           int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> {
>   return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> }
>
> it chose to go recursive+zip1 route since we take max (cost
> (odd_init), cost (even_init)) and add
> cost of zip1 insn which turns out to be lesser than cost of fallback:
>
> f_s16:
>         sxth    w0, w0
>         sxth    w1, w1
>         fmov    d0, x0
>         fmov    d1, x1
>         ins     v0.h[1], w2
>         ins     v1.h[1], w3
>         ins     v0.h[2], w4
>         ins     v1.h[2], w5
>         ins     v0.h[3], w6
>         ins     v1.h[3], w7
>         zip1    v0.8h, v0.8h, v1.8h
>         ret
>
> I assume that's OK since it has fewer dependencies compared to
> fallback code-gen even if it's longer ?
> With -Os the cost for sequence is taken as cost(odd_init) +
> cost(even_init) + cost(zip1_insn)
> which turns out to be same as cost for fallback sequence and it
> generates the fallback code-sequence:
>
> f_s16:
>         sxth    w0, w0
>         fmov    s0, w0
>         ins     v0.h[1], w1
>         ins     v0.h[2], w2
>         ins     v0.h[3], w3
>         ins     v0.h[4], w4
>         ins     v0.h[5], w5
>         ins     v0.h[6], w6
>         ins     v0.h[7], w7
>         ret
>
Forgot to remove the hunk handling interleaving case, done in the
attached patch.

Thanks,
Prathamesh
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acc0cfe5f94..dd2a64d2e4e 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
    initialised to contain VALS.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22036,38 +22036,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
-
-  if (n_elts >= 8)
-    {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
-
-      if (i == n_elts)
-	{
-	  machine_mode mode = GET_MODE (target);
-	  rtx dest[2];
-
-	  for (int i = 0; i < 2; i++)
-	    {
-	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
-	      dest[i] = force_reg (mode, x);
-	    }
-
-	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
-	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
-	  return;
-	}
-    }
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22189,7 +22157,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22203,6 +22171,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+DEBUG_FUNCTION
+static void
+aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
+{
+  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
+  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
+    {
+      debug_rtx (PATTERN (i));
+      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
+    }
+}
+
+static rtx
+aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n / 2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/*
+The function does the following:
+(a) Generates code sequence by splitting VALS into even and odd halves,
+    and recursively calling itself to initialize them and then merge using
+    zip1.
+(b) Generate code sequence directly using aarch64_expand_vector_init_fallback.
+(c) Compare the cost of code sequences generated by (a) and (b), and choose
+    the more efficient one.
+*/
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 8
+      || known_eq (GET_MODE_BITSIZE (mode), 64))
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx dest[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      dest[i] = gen_reg_rtx (mode);
+      rtx new_vals
+	= aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, dest[0], dest[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+  end_sequence ();
+
+  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 82%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index ee775048589..e812d3946de 100644
--- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -7,8 +7,8 @@
 /*
 ** foo:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	dup	v[0-9]+\.8h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
@@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
 /*
 ** foo2:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	movi	v[0-9]+\.8h, 0x1
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..e28fdcda29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..9366ca349b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
new file mode 100644
index 00000000000..8f35854c008
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
new file mode 100644
index 00000000000..172d56ffdf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that we recursively generate code for even and odd halves
+   instead of the fallback code.  This is preferred despite the longer
+   code-gen because it has fewer dependencies and thus a lower cost.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	sxth	w1, w1
+**	fmov	d0, x0
+**	fmov	d1, x1
+**	ins	v[0-9]+\.h\[1\], w2
+**	ins	v[0-9]+\.h\[1\], w3
+**	ins	v[0-9]+\.h\[2\], w4
+**	ins	v[0-9]+\.h\[2\], w5
+**	ins	v[0-9]+\.h\[3\], w6
+**	ins	v[0-9]+\.h\[3\], w7
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
new file mode 100644
index 00000000000..15b889d4097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
@@ -0,0 +1,7 @@
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford Feb. 3, 2023, 3:17 p.m. UTC | #14
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
>>
>> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>> >
>> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > >> >> > I have attached a patch that extends the transform if one half is dup
>> > >> >> > and other is set of constants.
>> > >> >> > For eg:
>> > >> >> > int8x16_t f(int8_t x)
>> > >> >> > {
>> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> > >> >> > }
>> > >> >> >
>> > >> >> > code-gen trunk:
>> > >> >> > f:
>> > >> >> >         adrp    x1, .LC0
>> > >> >> >         ldr     q0, [x1, #:lo12:.LC0]
>> > >> >> >         ins     v0.b[0], w0
>> > >> >> >         ins     v0.b[2], w0
>> > >> >> >         ins     v0.b[4], w0
>> > >> >> >         ins     v0.b[6], w0
>> > >> >> >         ins     v0.b[8], w0
>> > >> >> >         ins     v0.b[10], w0
>> > >> >> >         ins     v0.b[12], w0
>> > >> >> >         ins     v0.b[14], w0
>> > >> >> >         ret
>> > >> >> >
>> > >> >> > code-gen with patch:
>> > >> >> > f:
>> > >> >> >         dup     v0.16b, w0
>> > >> >> >         adrp    x0, .LC0
>> > >> >> >         ldr     q1, [x0, #:lo12:.LC0]
>> > >> >> >         zip1    v0.16b, v0.16b, v1.16b
>> > >> >> >         ret
>> > >> >> >
>> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> > >> >> > Does it look OK ?
>> > >> >>
>> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>> > >> >>
>> > >> >> However, rather than handle this case specially, I think we should instead
>> > >> >> take a divide-and-conquer approach: split the initialiser into even and
>> > >> >> odd elements, find the best way of loading each part, then compare the
>> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
>> > >> >> later in aarch64_expand_vector_init).
>> > >> >>
>> > >> >> For example, doing that would allow:
>> > >> >>
>> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> > >> >>
>> > >> >> to be loaded more easily, even though the even elements aren't wholly
>> > >> >> constant.
>> > >> > Hi Richard,
>> > >> > I have attached a prototype patch based on the above approach.
>> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
>> > >> > same sequence, thus I removed that hunk, and improves the following cases:
>> > >> >
>> > >> > (a)
>> > >> > int8x16_t f_s16(int8_t x)
>> > >> > {
>> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> > >> >                                  x, 5, x, 6, x, 7, x, 8 };
>> > >> > }
>> > >> >
>> > >> > code-gen trunk:
>> > >> > f_s16:
>> > >> >         adrp    x1, .LC0
>> > >> >         ldr     q0, [x1, #:lo12:.LC0]
>> > >> >         ins     v0.b[0], w0
>> > >> >         ins     v0.b[2], w0
>> > >> >         ins     v0.b[4], w0
>> > >> >         ins     v0.b[6], w0
>> > >> >         ins     v0.b[8], w0
>> > >> >         ins     v0.b[10], w0
>> > >> >         ins     v0.b[12], w0
>> > >> >         ins     v0.b[14], w0
>> > >> >         ret
>> > >> >
>> > >> > code-gen with patch:
>> > >> > f_s16:
>> > >> >         dup     v0.16b, w0
>> > >> >         adrp    x0, .LC0
>> > >> >         ldr     q1, [x0, #:lo12:.LC0]
>> > >> >         zip1    v0.16b, v0.16b, v1.16b
>> > >> >         ret
>> > >> >
>> > >> > (b)
>> > >> > int8x16_t f_s16(int8_t x, int8_t y)
>> > >> > {
>> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> > >> >                                 4, y, 5, y, 6, y, 7, y };
>> > >> > }
>> > >> >
>> > >> > code-gen trunk:
>> > >> > f_s16:
>> > >> >         adrp    x2, .LC0
>> > >> >         ldr     q0, [x2, #:lo12:.LC0]
>> > >> >         ins     v0.b[0], w0
>> > >> >         ins     v0.b[1], w1
>> > >> >         ins     v0.b[3], w1
>> > >> >         ins     v0.b[5], w1
>> > >> >         ins     v0.b[7], w1
>> > >> >         ins     v0.b[9], w1
>> > >> >         ins     v0.b[11], w1
>> > >> >         ins     v0.b[13], w1
>> > >> >         ins     v0.b[15], w1
>> > >> >         ret
>> > >> >
>> > >> > code-gen patch:
>> > >> > f_s16:
>> > >> >         adrp    x2, .LC0
>> > >> >         dup     v1.16b, w1
>> > >> >         ldr     q0, [x2, #:lo12:.LC0]
>> > >> >         ins     v0.b[0], w0
>> > >> >         zip1    v0.16b, v0.16b, v1.16b
>> > >> >         ret
>> > >>
>> > >> Nice.
>> > >>
>> > >> > There are a couple of issues I have come across:
>> > >> > (1) Choosing element to pad vector.
>> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
>> > >> > with mode V8HI.
>> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
>> > >> > However since the mode is V8HI, we would need to pad the above split vectors
>> > >> > with 4 more elements to match up to vector length.
>> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
>> > >> > using 'y' is the obvious choice thus making them:
>> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
>> > >> > These would be then merged using zip1 which would discard the lower half
>> > >> > of both vectors.
>> > >> > Currently I encoded the above two heuristics in
>> > >> > aarch64_expand_vector_init_get_padded_elem:
>> > >> > (a) If split portion contains a constant, use the constant to pad the vector.
>> > >> > (b) If split portion only contains variables, then use the most
>> > >> > frequently repeating variable
>> > >> > to pad the vector.
>> > >> > I suppose tho this could be improved ?
>> > >>
>> > >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
>> > >> to fill the upper elements with undefined values.
>> > >>
>> > >> I suppose in principle we would have the same problem when splitting
>> > >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
>> > >> on that for now.  Eventually it would be worth adding full support for
>> > >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
>> > >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
>> > >> matters most.
>> > >>
>> > >> > (2) Setting cost for zip1:
>> > >> > Currently it returns 4 as cost for following zip1 insn:
>> > >> > (set (reg:V8HI 102)
>> > >> >     (unspec:V8HI [
>> > >> >             (reg:V8HI 103)
>> > >> >             (reg:V8HI 108)
>> > >> >         ] UNSPEC_ZIP1))
>> > >> > I am not sure if that's correct, or if not, what cost to use in this case
>> > >> > for zip1 ?
>> > >>
>> > >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
>> > >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
>> > >> inserts are probably underestimated to the same extent, so hopefully
>> > >> things work out.
>> > >>
>> > >> So it's probably best to accept the costs as they're currently given.
>> > >> Changing them would need extensive testing.
>> > >>
>> > >> However, one of the advantages of the split is that it allows the
>> > >> subvectors to be built in parallel.  When optimising for speed,
>> > >> it might make sense to take the maximum of the subsequence costs
>> > >> and add the cost of the zip to that.
>> > > Hi Richard,
>> > > Thanks for the suggestions.
>> > > In the attached patch, it recurses only if nelts == 16 to punt for 64
>> > > -> 32 bit split,
>> >
>> > It should be based on the size rather than the number of elements.
>> > The example we talked about above involved building V8HIs from two
>> > V4HIs, which is also valid.
>> Right, sorry got mixed up. The attached patch punts if vector_size == 64 by
>> resorting to fallback, which handles V8HI cases.
>> For eg:
>> int16x8_t f(int16_t x)
>> {
>>   return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
>> }
>>
>> code-gen with patch:
>> f:
>>         dup     v0.4h, w0
>>         adrp    x0, .LC0
>>         ldr       d1, [x0, #:lo12:.LC0]
>>         zip1    v0.8h, v0.8h, v1.8h
>>         ret
>>
>> Just to clarify, we punt on 64 bit vector size, because there is no
>> 32-bit vector available,
>> to build 2 32-bit vectors for even and odd halves, and then "extend"
>> them with subreg ?

Right.  And if we want to fix that, I think the starting point would
be to add (general) 32-bit vector support first.

>> It also punts if n_elts < 8, because I am not sure
>> if it's profitable to do recursion+merging for 4 or lesser elements.
>> Does it look OK ?

Splitting { x, y, x, y } should at least be a size win over 4 individual
moves/inserts.  Possibly a speed win too if x and y are in general
registers.

So I think n_elts < 4 might be better.  If the costs get a case wrong,
we should fix the costs.
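
For example, a 128-bit instance of that pattern would then also be
considered for the split (illustrative test only, not from the patch):

#include <arm_neon.h>

int32x4_t f_s32 (int32_t x, int32_t y)
{
  /* Even elements are { x, x } and odd elements are { y, y }, so two
     dups plus a zip1 could replace the chain of inserts.  */
  return (int32x4_t) { x, y, x, y };
}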

>> > > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
>> > > computing total cost of the sequence.
>> > >
>> > > So, for following case:
>> > > int8x16_t f_s8(int8_t x)
>> > > {
>> > >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> > >                                 x, 5, x, 6, x, 7, x, 8 };
>> > > }
>> > >
>> > > it now generates:
>> > > f_s16:
>> > >         dup     v0.8b, w0
>> > >         adrp    x0, .LC0
>> > >         ldr       d1, [x0, #:lo12:.LC0]
>> > >         zip1    v0.16b, v0.16b, v1.16b
>> > >         ret
>> > >
>> > > Which I assume is correct, since zip1 will merge the lower halves of
>> > > two vectors while leaving the upper halves undefined ?
>> >
>> > Yeah, it looks valid, but I would say that zip1 ignores the upper halves
>> > (rather than leaving them undefined).
>> Yes, sorry for mis-phrasing.
>>
>> For the following test:
>> int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
>>                           int16_t x4, int16_t x5, int16_t x6, int16_t x7)
>> {
>>   return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
>> }
>>
>> it chose to go recursive+zip1 route since we take max (cost
>> (odd_init), cost (even_init)) and add
>> cost of zip1 insn which turns out to be lesser than cost of fallback:
>>
>> f_s16:
>>         sxth    w0, w0
>>         sxth    w1, w1
>>         fmov    d0, x0
>>         fmov    d1, x1
>>         ins     v0.h[1], w2
>>         ins     v1.h[1], w3
>>         ins     v0.h[2], w4
>>         ins     v1.h[2], w5
>>         ins     v0.h[3], w6
>>         ins     v1.h[3], w7
>>         zip1    v0.8h, v0.8h, v1.8h
>>         ret
>>
>> I assume that's OK since it has fewer dependencies compared to
>> fallback code-gen even if it's longer ?
>> With -Os the cost for sequence is taken as cost(odd_init) +
>> cost(even_init) + cost(zip1_insn)
>> which turns out to be same as cost for fallback sequence and it
>> generates the fallback code-sequence:
>>
>> f_s16:
>>         sxth    w0, w0
>>         fmov    s0, w0
>>         ins     v0.h[1], w1
>>         ins     v0.h[2], w2
>>         ins     v0.h[3], w3
>>         ins     v0.h[4], w4
>>         ins     v0.h[5], w5
>>         ins     v0.h[6], w6
>>         ins     v0.h[7], w7
>>         ret
>>
> Forgot to remove the hunk handling interleaving case, done in the
> attached patch.
>
> Thanks,
> Prathamesh
>> Thanks,
>> Prathamesh
>> >
>> > Thanks,
>> > Richard
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index acc0cfe5f94..dd2a64d2e4e 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
>     initialised to contain VALS.  */
>  
>  void
> -aarch64_expand_vector_init (rtx target, rtx vals)
> +aarch64_expand_vector_init_fallback (rtx target, rtx vals)

The comment needs to be updated.  Maybe:

/* A subroutine of aarch64_expand_vector_init, with the same interface.
   The caller has already tried a divide-and-conquer approach, so do
   not consider that case here.  */

>  {
>    machine_mode mode = GET_MODE (target);
>    scalar_mode inner_mode = GET_MODE_INNER (mode);
> @@ -22036,38 +22036,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>  
> -  /* Check for interleaving case.
> -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> -     Generate following code:
> -     dup v0.h, x
> -     dup v1.h, y
> -     zip1 v0.h, v0.h, v1.h
> -     for "large enough" initializer.  */
> -
> -  if (n_elts >= 8)
> -    {
> -      int i;
> -      for (i = 2; i < n_elts; i++)
> -	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -	  break;
> -
> -      if (i == n_elts)
> -	{
> -	  machine_mode mode = GET_MODE (target);
> -	  rtx dest[2];
> -
> -	  for (int i = 0; i < 2; i++)
> -	    {
> -	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> -	      dest[i] = force_reg (mode, x);
> -	    }
> -
> -	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> -	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> -	  return;
> -	}
> -    }
> -
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>  
> @@ -22189,7 +22157,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>  	    }
>  	  XVECEXP (copy, 0, i) = subst;
>  	}
> -      aarch64_expand_vector_init (target, copy);
> +      aarch64_expand_vector_init_fallback (target, copy);
>      }
>  
>    /* Insert the variable lanes directly.  */
> @@ -22203,6 +22171,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>      }
>  }
>  
> +DEBUG_FUNCTION
> +static void
> +aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
> +{
> +  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
> +  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
> +    {
> +      debug_rtx (PATTERN (i));
> +      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
> +    }
> +}

I'm not sure we should commit this to the tree.

> +
> +static rtx
> +aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)

How about calling this aarch64_unzip_vector_init?  It needs a function
comment.
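E.g. something like this, purely as a sketch of the wording:

/* Return a PARALLEL containing either the even elements of VALS
   (if EVEN_P) or the odd elements (if !EVEN_P), using the 64-bit
   vector mode with the same element type as MODE.  */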

> +{
> +  int n = XVECLEN (vals, 0);
> +  machine_mode new_mode
> +    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);

IMO it would be better to use "GET_MODE_BITSIZE (mode).to_constant () / 2"
or "GET_MODE_UNIT_BITSIZE (mode) * n / 2" for the second argument.
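I.e. roughly (untested sketch of the first option):

  machine_mode new_mode
    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
                                   GET_MODE_BITSIZE (mode).to_constant () / 2);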

> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n / 2; i++)
> +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +				  : XVECEXP (vals, 0, 2 * i + 1);
> +  return gen_rtx_PARALLEL (new_mode, vec);
> +}
> +
> +/*
> +The function does the following:
> +(a) Generates code sequence by splitting VALS into even and odd halves,
> +    and recursively calling itself to initialize them and then merge using
> +    zip1.
> +(b) Generate code sequence directly using aarch64_expand_vector_init_fallback.
> +(c) Compare the cost of code sequences generated by (a) and (b), and choose
> +    the more efficient one.
> +*/

I think we should keep the current description of the interface,
before describing the implementation:

/* Expand a vector initialization sequence, such that TARGET is
   initialized to contain VALS.  */

(includes an s/s/z/).

And it's probably better to describe the implementation inside
the function.

Most comments are written in imperative style, so how about:

  /* Try decomposing the initializer into even and odd halves and
     then ZIP them together.  Use the resulting sequence if it is
     strictly cheaper than loading VALS directly.

     Prefer the fallback sequence in the event of a tie, since it
     will tend to use fewer registers.  */

> +
> +void
> +aarch64_expand_vector_init (rtx target, rtx vals)
> +{
> +  machine_mode mode = GET_MODE (target);
> +  int n_elts = XVECLEN (vals, 0);
> +
> +  if (n_elts < 8
> +      || known_eq (GET_MODE_BITSIZE (mode), 64))

Might be more robust to test maybe_ne (GET_MODE_BITSIZE (mode), 128)
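I.e. something like (sketch, also folding in the n_elts < 4 suggestion
above):

  if (n_elts < 4
      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
    {
      aarch64_expand_vector_init_fallback (target, vals);
      return;
    }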

> +    {
> +      aarch64_expand_vector_init_fallback (target, vals);
> +      return;
> +    }
> +
> +  start_sequence ();
> +  rtx dest[2];
> +  unsigned costs[2];
> +  for (int i = 0; i < 2; i++)
> +    {
> +      start_sequence ();
> +      dest[i] = gen_reg_rtx (mode);
> +      rtx new_vals
> +	= aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
> +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> +      aarch64_expand_vector_init (tmp_reg, new_vals);
> +      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);

Maybe "src" or "halves" would be a better name than "dest", given that
the rtx isn't actually the destination of the subsequence.

> +      rtx_insn *rec_seq = get_insns ();
> +      end_sequence ();
> +      costs[i] = seq_cost (rec_seq, !optimize_size);
> +      emit_insn (rec_seq);
> +    }
> +
> +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> +  rtx_insn *zip1_insn
> +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +  unsigned seq_total_cost
> +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];

This is the wrong way round: max should be for speed and addition
for size.

Thanks,
Richard

> +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> +
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  start_sequence ();
> +  aarch64_expand_vector_init_fallback (target, vals);
> +  rtx_insn *fallback_seq = get_insns ();
> +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> +  end_sequence ();
> +
> +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> +}
> +
>  /* Emit RTL corresponding to:
>     insr TARGET, ELEM.  */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> similarity index 82%
> rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> index ee775048589..e812d3946de 100644
> --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -7,8 +7,8 @@
>  /*
>  ** foo:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	dup	v[0-9]+\.8h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
>  /*
>  ** foo2:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	movi	v[0-9]+\.8h, 0x1
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	movi	v[0-9]+\.4h, 0x1
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> new file mode 100644
> index 00000000000..e28fdcda29d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x)
> +{
> +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> +                       x, 5, x, 6, x, 7, x, 8 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> new file mode 100644
> index 00000000000..9366ca349b6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> +                       4, y, 5, y, 6, y, 7, y };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> new file mode 100644
> index 00000000000..e16459486d7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	ins	v0\.b\[1\], w1
> +**	...
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> +                       7, 8, 9, 10, 11, 12, 13, 14 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> new file mode 100644
> index 00000000000..8f35854c008
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that fallback code-sequence is chosen over
> +   recursively generated code-sequence merged with zip1.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	fmov	s0, w0
> +**	ins	v0\.h\[1\], w1
> +**	ins	v0\.h\[2\], w2
> +**	ins	v0\.h\[3\], w3
> +**	ins	v0\.h\[4\], w4
> +**	ins	v0\.h\[5\], w5
> +**	ins	v0\.h\[6\], w6
> +**	ins	v0\.h\[7\], w7
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> new file mode 100644
> index 00000000000..172d56ffdf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that we recursively generate code for even and odd halves
> +   instead of the fallback code.  This is preferred despite the longer
> +   code-gen because it has fewer dependencies and thus a lower cost.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	sxth	w1, w1
> +**	fmov	d0, x0
> +**	fmov	d1, x1
> +**	ins	v[0-9]+\.h\[1\], w2
> +**	ins	v[0-9]+\.h\[1\], w3
> +**	ins	v[0-9]+\.h\[2\], w4
> +**	ins	v[0-9]+\.h\[2\], w5
> +**	ins	v[0-9]+\.h\[3\], w6
> +**	ins	v[0-9]+\.h\[3\], w7
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> new file mode 100644
> index 00000000000..15b889d4097
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> @@ -0,0 +1,7 @@
> +#include <arm_neon.h>
> +
> +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> +{
> +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> +}
  
Prathamesh Kulkarni Feb. 4, 2023, 6:49 a.m. UTC | #15
On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> >>
> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >> >
> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > >> >> > I have attached a patch that extends the transform if one half is dup
> >> > >> >> > and other is set of constants.
> >> > >> >> > For eg:
> >> > >> >> > int8x16_t f(int8_t x)
> >> > >> >> > {
> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> >> > >> >> > }
> >> > >> >> >
> >> > >> >> > code-gen trunk:
> >> > >> >> > f:
> >> > >> >> >         adrp    x1, .LC0
> >> > >> >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> > >> >> >         ins     v0.b[0], w0
> >> > >> >> >         ins     v0.b[2], w0
> >> > >> >> >         ins     v0.b[4], w0
> >> > >> >> >         ins     v0.b[6], w0
> >> > >> >> >         ins     v0.b[8], w0
> >> > >> >> >         ins     v0.b[10], w0
> >> > >> >> >         ins     v0.b[12], w0
> >> > >> >> >         ins     v0.b[14], w0
> >> > >> >> >         ret
> >> > >> >> >
> >> > >> >> > code-gen with patch:
> >> > >> >> > f:
> >> > >> >> >         dup     v0.16b, w0
> >> > >> >> >         adrp    x0, .LC0
> >> > >> >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> > >> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> > >> >> >         ret
> >> > >> >> >
> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> > >> >> > Does it look OK ?
> >> > >> >>
> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> >> > >> >>
> >> > >> >> However, rather than handle this case specially, I think we should instead
> >> > >> >> take a divide-and-conquer approach: split the initialiser into even and
> >> > >> >> odd elements, find the best way of loading each part, then compare the
> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
> >> > >> >> later in aarch64_expand_vector_init).
> >> > >> >>
> >> > >> >> For example, doing that would allow:
> >> > >> >>
> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> > >> >>
> >> > >> >> to be loaded more easily, even though the even elements aren't wholly
> >> > >> >> constant.
> >> > >> > Hi Richard,
> >> > >> > I have attached a prototype patch based on the above approach.
> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> >> > >> > same sequence, thus I removed that hunk, and improves the following cases:
> >> > >> >
> >> > >> > (a)
> >> > >> > int8x16_t f_s16(int8_t x)
> >> > >> > {
> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> > >> >                                  x, 5, x, 6, x, 7, x, 8 };
> >> > >> > }
> >> > >> >
> >> > >> > code-gen trunk:
> >> > >> > f_s16:
> >> > >> >         adrp    x1, .LC0
> >> > >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> > >> >         ins     v0.b[0], w0
> >> > >> >         ins     v0.b[2], w0
> >> > >> >         ins     v0.b[4], w0
> >> > >> >         ins     v0.b[6], w0
> >> > >> >         ins     v0.b[8], w0
> >> > >> >         ins     v0.b[10], w0
> >> > >> >         ins     v0.b[12], w0
> >> > >> >         ins     v0.b[14], w0
> >> > >> >         ret
> >> > >> >
> >> > >> > code-gen with patch:
> >> > >> > f_s16:
> >> > >> >         dup     v0.16b, w0
> >> > >> >         adrp    x0, .LC0
> >> > >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
> >> > >> >         ret
> >> > >> >
> >> > >> > (b)
> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> > >> > {
> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> > >> >                                 4, y, 5, y, 6, y, 7, y };
> >> > >> > }
> >> > >> >
> >> > >> > code-gen trunk:
> >> > >> > f_s16:
> >> > >> >         adrp    x2, .LC0
> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> > >> >         ins     v0.b[0], w0
> >> > >> >         ins     v0.b[1], w1
> >> > >> >         ins     v0.b[3], w1
> >> > >> >         ins     v0.b[5], w1
> >> > >> >         ins     v0.b[7], w1
> >> > >> >         ins     v0.b[9], w1
> >> > >> >         ins     v0.b[11], w1
> >> > >> >         ins     v0.b[13], w1
> >> > >> >         ins     v0.b[15], w1
> >> > >> >         ret
> >> > >> >
> >> > >> > code-gen patch:
> >> > >> > f_s16:
> >> > >> >         adrp    x2, .LC0
> >> > >> >         dup     v1.16b, w1
> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> > >> >         ins     v0.b[0], w0
> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
> >> > >> >         ret
> >> > >>
> >> > >> Nice.
> >> > >>
> >> > >> > There are a couple of issues I have come across:
> >> > >> > (1) Choosing element to pad vector.
> >> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> >> > >> > with mode V8HI.
> >> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> >> > >> > However since the mode is V8HI, we would need to pad the above split vectors
> >> > >> > with 4 more elements to match up to vector length.
> >> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> >> > >> > using 'y' is the obvious choice thus making them:
> >> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> >> > >> > These would be then merged using zip1 which would discard the lower half
> >> > >> > of both vectors.
> >> > >> > Currently I encoded the above two heuristics in
> >> > >> > aarch64_expand_vector_init_get_padded_elem:
> >> > >> > (a) If split portion contains a constant, use the constant to pad the vector.
> >> > >> > (b) If split portion only contains variables, then use the most
> >> > >> > frequently repeating variable
> >> > >> > to pad the vector.
> >> > >> > I suppose tho this could be improved ?
> >> > >>
> >> > >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
> >> > >> to fill the upper elements with undefined values.
> >> > >>
> >> > >> I suppose in principle we would have the same problem when splitting
> >> > >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
> >> > >> on that for now.  Eventually it would be worth adding full support for
> >> > >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
> >> > >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
> >> > >> matters most.
> >> > >>
> >> > >> > (2) Setting cost for zip1:
> >> > >> > Currently it returns 4 as cost for following zip1 insn:
> >> > >> > (set (reg:V8HI 102)
> >> > >> >     (unspec:V8HI [
> >> > >> >             (reg:V8HI 103)
> >> > >> >             (reg:V8HI 108)
> >> > >> >         ] UNSPEC_ZIP1))
> >> > >> > I am not sure if that's correct, or if not, what cost to use in this case
> >> > >> > for zip1 ?
> >> > >>
> >> > >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
> >> > >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
> >> > >> inserts are probably underestimated to the same extent, so hopefully
> >> > >> things work out.
> >> > >>
> >> > >> So it's probably best to accept the costs as they're currently given.
> >> > >> Changing them would need extensive testing.
> >> > >>
> >> > >> However, one of the advantages of the split is that it allows the
> >> > >> subvectors to be built in parallel.  When optimising for speed,
> >> > >> it might make sense to take the maximum of the subsequence costs
> >> > >> and add the cost of the zip to that.
> >> > > Hi Richard,
> >> > > Thanks for the suggestions.
> >> > > In the attached patch, it recurses only if nelts == 16 to punt for 64
> >> > > -> 32 bit split,
> >> >
> >> > It should be based on the size rather than the number of elements.
> >> > The example we talked about above involved building V8HIs from two
> >> > V4HIs, which is also valid.
> >> Right, sorry got mixed up. The attached patch punts if vector_size == 64 by
> >> resorting to fallback, which handles V8HI cases.
> >> For eg:
> >> int16x8_t f(int16_t x)
> >> {
> >>   return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
> >> }
> >>
> >> code-gen with patch:
> >> f:
> >>         dup     v0.4h, w0
> >>         adrp    x0, .LC0
> >>         ldr       d1, [x0, #:lo12:.LC0]
> >>         zip1    v0.8h, v0.8h, v1.8h
> >>         ret
> >>
> >> Just to clarify, we punt on 64 bit vector size, because there is no
> >> 32-bit vector available,
> >> to build 2 32-bit vectors for even and odd halves, and then "extend"
> >> them with subreg ?
>
> Right.  And if we want to fix that, I think the starting point would
> be to add (general) 32-bit vector support first.
>
> >> It also punts if n_elts < 8, because I am not sure
> >> if it's profitable to do recursion+merging for 4 or lesser elements.
> >> Does it look OK ?
>
> Splitting { x, y, x, y } should at least be a size win over 4 individual
> moves/inserts.  Possibly a speed win too if x and y are in general
> registers.
>
> So I think n_elts < 4 might be better.  If the costs get a case wrong,
> we should fix the costs.
>
> >> > > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
> >> > > computing total cost of the sequence.
> >> > >
> >> > > So, for following case:
> >> > > int8x16_t f_s8(int8_t x)
> >> > > {
> >> > >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> > >                                 x, 5, x, 6, x, 7, x, 8 };
> >> > > }
> >> > >
> >> > > it now generates:
> >> > > f_s16:
> >> > >         dup     v0.8b, w0
> >> > >         adrp    x0, .LC0
> >> > >         ldr       d1, [x0, #:lo12:.LC0]
> >> > >         zip1    v0.16b, v0.16b, v1.16b
> >> > >         ret
> >> > >
> >> > > Which I assume is correct, since zip1 will merge the lower halves of
> >> > > two vectors while leaving the upper halves undefined ?
> >> >
> >> > Yeah, it looks valid, but I would say that zip1 ignores the upper halves
> >> > (rather than leaving them undefined).
> >> Yes, sorry for mis-phrasing.
> >>
> >> For the following test:
> >> int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> >>                           int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> >> {
> >>   return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> >> }
> >>
> >> it chose to go recursive+zip1 route since we take max (cost
> >> (odd_init), cost (even_init)) and add
> >> cost of zip1 insn which turns out to be lesser than cost of fallback:
> >>
> >> f_s16:
> >>         sxth    w0, w0
> >>         sxth    w1, w1
> >>         fmov    d0, x0
> >>         fmov    d1, x1
> >>         ins     v0.h[1], w2
> >>         ins     v1.h[1], w3
> >>         ins     v0.h[2], w4
> >>         ins     v1.h[2], w5
> >>         ins     v0.h[3], w6
> >>         ins     v1.h[3], w7
> >>         zip1    v0.8h, v0.8h, v1.8h
> >>         ret
> >>
> >> I assume that's OK since it has fewer dependencies compared to
> >> fallback code-gen even if it's longer ?
> >> With -Os the cost for sequence is taken as cost(odd_init) +
> >> cost(even_init) + cost(zip1_insn)
> >> which turns out to be same as cost for fallback sequence and it
> >> generates the fallback code-sequence:
> >>
> >> f_s16:
> >>         sxth    w0, w0
> >>         fmov    s0, w0
> >>         ins     v0.h[1], w1
> >>         ins     v0.h[2], w2
> >>         ins     v0.h[3], w3
> >>         ins     v0.h[4], w4
> >>         ins     v0.h[5], w5
> >>         ins     v0.h[6], w6
> >>         ins     v0.h[7], w7
> >>         ret
> >>
> > Forgot to remove the hunk handling interleaving case, done in the
> > attached patch.
> >
> > Thanks,
> > Prathamesh
> >> Thanks,
> >> Prathamesh
> >> >
> >> > Thanks,
> >> > Richard
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index acc0cfe5f94..dd2a64d2e4e 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
> >     initialised to contain VALS.  */
> >
> >  void
> > -aarch64_expand_vector_init (rtx target, rtx vals)
> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>
> The comment needs to be updated.  Maybe:
>
> /* A subroutine of aarch64_expand_vector_init, with the same interface.
>    The caller has already tried a divide-and-conquer approach, so do
>    not consider that case here.  */
>
> >  {
> >    machine_mode mode = GET_MODE (target);
> >    scalar_mode inner_mode = GET_MODE_INNER (mode);
> > @@ -22036,38 +22036,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >        return;
> >      }
> >
> > -  /* Check for interleaving case.
> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > -     Generate following code:
> > -     dup v0.h, x
> > -     dup v1.h, y
> > -     zip1 v0.h, v0.h, v1.h
> > -     for "large enough" initializer.  */
> > -
> > -  if (n_elts >= 8)
> > -    {
> > -      int i;
> > -      for (i = 2; i < n_elts; i++)
> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -       break;
> > -
> > -      if (i == n_elts)
> > -     {
> > -       machine_mode mode = GET_MODE (target);
> > -       rtx dest[2];
> > -
> > -       for (int i = 0; i < 2; i++)
> > -         {
> > -           rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > -           dest[i] = force_reg (mode, x);
> > -         }
> > -
> > -       rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > -       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > -       return;
> > -     }
> > -    }
> > -
> >    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >    gcc_assert (icode != CODE_FOR_nothing);
> >
> > @@ -22189,7 +22157,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >           }
> >         XVECEXP (copy, 0, i) = subst;
> >       }
> > -      aarch64_expand_vector_init (target, copy);
> > +      aarch64_expand_vector_init_fallback (target, copy);
> >      }
> >
> >    /* Insert the variable lanes directly.  */
> > @@ -22203,6 +22171,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >      }
> >  }
> >
> > +DEBUG_FUNCTION
> > +static void
> > +aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
> > +{
> > +  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
> > +  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
> > +    {
> > +      debug_rtx (PATTERN (i));
> > +      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
> > +    }
> > +}
>
> I'm not sure we should commit this to the tree.
>
> > +
> > +static rtx
> > +aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
>
> How about calling this aarch64_unzip_vector_init?  It needs a function
> comment.
>
> > +{
> > +  int n = XVECLEN (vals, 0);
> > +  machine_mode new_mode
> > +    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
>
> IMO it would be better to use "GET_MODE_BITSIZE (mode).to_constant () / 2"
> or "GET_MODE_UNIT_BITSIZE (mode) * n / 2" for the second argument.
>
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n; i++)
> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > +                               : XVECEXP (vals, 0, 2 * i + 1);
> > +  return gen_rtx_PARALLEL (new_mode, vec);
> > +}
> > +
> > +/*
> > +The function does the following:
> > +(a) Generates code sequence by splitting VALS into even and odd halves,
> > +    and recursively calling itself to initialize them and then merge using
> > +    zip1.
> > +(b) Generate code sequence directly using aarch64_expand_vector_init_fallback.
> > +(c) Compare the cost of code sequences generated by (a) and (b), and choose
> > +    the more efficient one.
> > +*/
>
> I think we should keep the current description of the interface,
> before the describing the implementation:
>
> /* Expand a vector initialization sequence, such that TARGET is
>    initialized to contain VALS.  */
>
> (includes an s/s/z/).
>
> And it's probably better to describe the implementation inside
> the function.
>
> Most comments are written in imperative style, so how about:
>
>   /* Try decomposing the initializer into even and odd halves and
>      then ZIP them together.  Use the resulting sequence if it is
>      strictly cheaper than loading VALS directly.
>
>      Prefer the fallback sequence in the event of a tie, since it
>      will tend to use fewer registers.  */
>
> > +
> > +void
> > +aarch64_expand_vector_init (rtx target, rtx vals)
> > +{
> > +  machine_mode mode = GET_MODE (target);
> > +  int n_elts = XVECLEN (vals, 0);
> > +
> > +  if (n_elts < 8
> > +      || known_eq (GET_MODE_BITSIZE (mode), 64))
>
> Might be more robust to test maybe_ne (GET_MODE_BITSIZE (mode), 128)
>
> > +    {
> > +      aarch64_expand_vector_init_fallback (target, vals);
> > +      return;
> > +    }
> > +
> > +  start_sequence ();
> > +  rtx dest[2];
> > +  unsigned costs[2];
> > +  for (int i = 0; i < 2; i++)
> > +    {
> > +      start_sequence ();
> > +      dest[i] = gen_reg_rtx (mode);
> > +      rtx new_vals
> > +     = aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
> > +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> > +      aarch64_expand_vector_init (tmp_reg, new_vals);
> > +      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
>
> Maybe "src" or "halves" would be a better name than "dest", given that
> the rtx isn't actually the destination of the subsequence.
>
> > +      rtx_insn *rec_seq = get_insns ();
> > +      end_sequence ();
> > +      costs[i] = seq_cost (rec_seq, !optimize_size);
> > +      emit_insn (rec_seq);
> > +    }
> > +
> > +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > +  rtx_insn *zip1_insn
> > +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > +  unsigned seq_total_cost
> > +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
>
> This is the wrong way round: max should be for speed and addition
> for size.
I assumed !optimize_size meant optimizing for speed ?
So (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1]
would imply taking the max of the two for speed and addition for size, or
am I misunderstanding ?
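
To spell out my reading as a sketch (equivalent to what the patch does):

  if (!optimize_size)
    /* Speed: the two halves can be built in parallel, so take the max.  */
    seq_total_cost = std::max (costs[0], costs[1]) + insn_cost (zip1_insn, true);
  else
    /* Size: every insn counts, so add the costs up.  */
    seq_total_cost = costs[0] + costs[1] + insn_cost (zip1_insn, false);
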
I have done the rest of the changes in the attached patch.

Thanks,
Prathamesh
>
> Thanks,
> Richard
>
> > +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> > +
> > +  rtx_insn *seq = get_insns ();
> > +  end_sequence ();
> > +
> > +  start_sequence ();
> > +  aarch64_expand_vector_init_fallback (target, vals);
> > +  rtx_insn *fallback_seq = get_insns ();
> > +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> > +  end_sequence ();
> > +
> > +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> > +}
> > +
> >  /* Emit RTL corresponding to:
> >     insr TARGET, ELEM.  */
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > similarity index 82%
> > rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > index ee775048589..e812d3946de 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -7,8 +7,8 @@
> >  /*
> >  ** foo:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
> >  /*
> >  ** foo2:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   movi    v[0-9]+\.8h, 0x1
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   movi    v[0-9]+\.4h, 0x1
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > new file mode 100644
> > index 00000000000..e28fdcda29d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x)
> > +{
> > +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > +                       x, 5, x, 6, x, 7, x, 8 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > new file mode 100644
> > index 00000000000..9366ca349b6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> > +                       4, y, 5, y, 6, y, 7, y };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > new file mode 100644
> > index 00000000000..e16459486d7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   ins     v0\.b\[1\], w1
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> > +                       7, 8, 9, 10, 11, 12, 13, 14 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > new file mode 100644
> > index 00000000000..8f35854c008
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that fallback code-sequence is chosen over
> > +   recursively generated code-sequence merged with zip1.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   fmov    s0, w0
> > +**   ins     v0\.h\[1\], w1
> > +**   ins     v0\.h\[2\], w2
> > +**   ins     v0\.h\[3\], w3
> > +**   ins     v0\.h\[4\], w4
> > +**   ins     v0\.h\[5\], w5
> > +**   ins     v0\.h\[6\], w6
> > +**   ins     v0\.h\[7\], w7
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > new file mode 100644
> > index 00000000000..172d56ffdf1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that we recursively generate code for even and odd halves
> > +   instead of fallback code. This is so despite the longer code-gen
> > +   because it has fewer dependencies and thus has lesser cost.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   sxth    w1, w1
> > +**   fmov    d0, x0
> > +**   fmov    d1, x1
> > +**   ins     v[0-9]+\.h\[1\], w2
> > +**   ins     v[0-9]+\.h\[1\], w3
> > +**   ins     v[0-9]+\.h\[2\], w4
> > +**   ins     v[0-9]+\.h\[2\], w5
> > +**   ins     v[0-9]+\.h\[3\], w6
> > +**   ins     v[0-9]+\.h\[3\], w7
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > new file mode 100644
> > index 00000000000..15b889d4097
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > @@ -0,0 +1,7 @@
> > +#include <arm_neon.h>
> > +
> > +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> > +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> > +{
> > +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> > +}
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index acc0cfe5f94..94cc4338678 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -21972,11 +21972,12 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
-/* Expand a vector initialisation sequence, such that TARGET is
-   initialised to contain VALS.  */
+/* A subroutine of aarch64_expand_vector_init, with the same interface.
+   The caller has already tried a divide-and-conquer approach, so do
+   not consider that case here.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22036,38 +22037,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
-
-  if (n_elts >= 8)
-    {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
-
-      if (i == n_elts)
-	{
-	  machine_mode mode = GET_MODE (target);
-	  rtx dest[2];
-
-	  for (int i = 0; i < 2; i++)
-	    {
-	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
-	      dest[i] = force_reg (mode, x);
-	    }
-
-	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
-	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
-	  return;
-	}
-    }
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22189,7 +22158,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22203,6 +22172,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+/* Return even or odd half of VALS depending on EVEN_P.  */
+
+static rtx
+aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
+				   GET_MODE_BITSIZE (mode).to_constant () / 2);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n / 2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialized to contain VALS.  */
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  /* Try decomposing the initializer into even and odd halves and
+     then ZIP them together.  Use the resulting sequence if it is
+     strictly cheaper than loading VALS directly.
+
+     Prefer the fallback sequence in the event of a tie, since it
+     will tend to use fewer registers.  */
+
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 4
+      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx halves[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      rtx new_vals
+	= aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, halves[0], halves[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+  end_sequence ();
+
+  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 82%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index ee775048589..e812d3946de 100644
--- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -7,8 +7,8 @@
 /*
 ** foo:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	dup	v[0-9]+\.8h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
@@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
 /*
 ** foo2:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	movi	v[0-9]+\.8h, 0x1
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..e28fdcda29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..9366ca349b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
new file mode 100644
index 00000000000..8f35854c008
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that the fallback code sequence is chosen over the
+   recursively generated code sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
new file mode 100644
index 00000000000..172d56ffdf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that we recursively generate code for even and odd halves
+   instead of the fallback code.  This is chosen despite the longer
+   code-gen because it has fewer dependencies and thus a lower cost.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	sxth	w1, w1
+**	fmov	d0, x0
+**	fmov	d1, x1
+**	ins	v[0-9]+\.h\[1\], w2
+**	ins	v[0-9]+\.h\[1\], w3
+**	ins	v[0-9]+\.h\[2\], w4
+**	ins	v[0-9]+\.h\[2\], w5
+**	ins	v[0-9]+\.h\[3\], w6
+**	ins	v[0-9]+\.h\[3\], w7
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
new file mode 100644
index 00000000000..15b889d4097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
@@ -0,0 +1,7 @@
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford Feb. 6, 2023, 12:13 p.m. UTC | #16
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
>> > <prathamesh.kulkarni@linaro.org> wrote:
>> >>
>> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
>> >> <richard.sandiford@arm.com> wrote:
>> >> >
>> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> >> > >> >> > I have attached a patch that extends the transform if one half is dup
>> >> > >> >> > and other is set of constants.
>> >> > >> >> > For eg:
>> >> > >> >> > int8x16_t f(int8_t x)
>> >> > >> >> > {
>> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
>> >> > >> >> > }
>> >> > >> >> >
>> >> > >> >> > code-gen trunk:
>> >> > >> >> > f:
>> >> > >> >> >         adrp    x1, .LC0
>> >> > >> >> >         ldr     q0, [x1, #:lo12:.LC0]
>> >> > >> >> >         ins     v0.b[0], w0
>> >> > >> >> >         ins     v0.b[2], w0
>> >> > >> >> >         ins     v0.b[4], w0
>> >> > >> >> >         ins     v0.b[6], w0
>> >> > >> >> >         ins     v0.b[8], w0
>> >> > >> >> >         ins     v0.b[10], w0
>> >> > >> >> >         ins     v0.b[12], w0
>> >> > >> >> >         ins     v0.b[14], w0
>> >> > >> >> >         ret
>> >> > >> >> >
>> >> > >> >> > code-gen with patch:
>> >> > >> >> > f:
>> >> > >> >> >         dup     v0.16b, w0
>> >> > >> >> >         adrp    x0, .LC0
>> >> > >> >> >         ldr     q1, [x0, #:lo12:.LC0]
>> >> > >> >> >         zip1    v0.16b, v0.16b, v1.16b
>> >> > >> >> >         ret
>> >> > >> >> >
>> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
>> >> > >> >> > Does it look OK ?
>> >> > >> >>
>> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
>> >> > >> >>
>> >> > >> >> However, rather than handle this case specially, I think we should instead
>> >> > >> >> take a divide-and-conquer approach: split the initialiser into even and
>> >> > >> >> odd elements, find the best way of loading each part, then compare the
>> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
>> >> > >> >> later in aarch64_expand_vector_init).
>> >> > >> >>
>> >> > >> >> For example, doing that would allow:
>> >> > >> >>
>> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
>> >> > >> >>
>> >> > >> >> to be loaded more easily, even though the even elements aren't wholly
>> >> > >> >> constant.
>> >> > >> > Hi Richard,
>> >> > >> > I have attached a prototype patch based on the above approach.
>> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
>> >> > >> > same sequence, thus I removed that hunk, and improves the following cases:
>> >> > >> >
>> >> > >> > (a)
>> >> > >> > int8x16_t f_s16(int8_t x)
>> >> > >> > {
>> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> >> > >> >                                  x, 5, x, 6, x, 7, x, 8 };
>> >> > >> > }
>> >> > >> >
>> >> > >> > code-gen trunk:
>> >> > >> > f_s16:
>> >> > >> >         adrp    x1, .LC0
>> >> > >> >         ldr     q0, [x1, #:lo12:.LC0]
>> >> > >> >         ins     v0.b[0], w0
>> >> > >> >         ins     v0.b[2], w0
>> >> > >> >         ins     v0.b[4], w0
>> >> > >> >         ins     v0.b[6], w0
>> >> > >> >         ins     v0.b[8], w0
>> >> > >> >         ins     v0.b[10], w0
>> >> > >> >         ins     v0.b[12], w0
>> >> > >> >         ins     v0.b[14], w0
>> >> > >> >         ret
>> >> > >> >
>> >> > >> > code-gen with patch:
>> >> > >> > f_s16:
>> >> > >> >         dup     v0.16b, w0
>> >> > >> >         adrp    x0, .LC0
>> >> > >> >         ldr     q1, [x0, #:lo12:.LC0]
>> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
>> >> > >> >         ret
>> >> > >> >
>> >> > >> > (b)
>> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
>> >> > >> > {
>> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> >> > >> >                                 4, y, 5, y, 6, y, 7, y };
>> >> > >> > }
>> >> > >> >
>> >> > >> > code-gen trunk:
>> >> > >> > f_s16:
>> >> > >> >         adrp    x2, .LC0
>> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
>> >> > >> >         ins     v0.b[0], w0
>> >> > >> >         ins     v0.b[1], w1
>> >> > >> >         ins     v0.b[3], w1
>> >> > >> >         ins     v0.b[5], w1
>> >> > >> >         ins     v0.b[7], w1
>> >> > >> >         ins     v0.b[9], w1
>> >> > >> >         ins     v0.b[11], w1
>> >> > >> >         ins     v0.b[13], w1
>> >> > >> >         ins     v0.b[15], w1
>> >> > >> >         ret
>> >> > >> >
>> >> > >> > code-gen patch:
>> >> > >> > f_s16:
>> >> > >> >         adrp    x2, .LC0
>> >> > >> >         dup     v1.16b, w1
>> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
>> >> > >> >         ins     v0.b[0], w0
>> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
>> >> > >> >         ret
>> >> > >>
>> >> > >> Nice.
>> >> > >>
>> >> > >> > There are a couple of issues I have come across:
>> >> > >> > (1) Choosing element to pad vector.
>> >> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
>> >> > >> > with mode V8HI.
>> >> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
>> >> > >> > However since the mode is V8HI, we would need to pad the above split vectors
>> >> > >> > with 4 more elements to match up to vector length.
>> >> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
>> >> > >> > using 'y' is the obvious choice thus making them:
>> >> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
>> >> > >> > These would be then merged using zip1 which would discard the lower half
>> >> > >> > of both vectors.
>> >> > >> > Currently I encoded the above two heuristics in
>> >> > >> > aarch64_expand_vector_init_get_padded_elem:
>> >> > >> > (a) If split portion contains a constant, use the constant to pad the vector.
>> >> > >> > (b) If split portion only contains variables, then use the most
>> >> > >> > frequently repeating variable
>> >> > >> > to pad the vector.
>> >> > >> > I suppose tho this could be improved ?
>> >> > >>
>> >> > >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
>> >> > >> to fill the upper elements with undefined values.
>> >> > >>
>> >> > >> I suppose in principle we would have the same problem when splitting
>> >> > >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
>> >> > >> on that for now.  Eventually it would be worth adding full support for
>> >> > >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
>> >> > >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
>> >> > >> matters most.
>> >> > >>
>> >> > >> > (2) Setting cost for zip1:
>> >> > >> > Currently it returns 4 as cost for following zip1 insn:
>> >> > >> > (set (reg:V8HI 102)
>> >> > >> >     (unspec:V8HI [
>> >> > >> >             (reg:V8HI 103)
>> >> > >> >             (reg:V8HI 108)
>> >> > >> >         ] UNSPEC_ZIP1))
>> >> > >> > I am not sure if that's correct, or if not, what cost to use in this case
>> >> > >> > for zip1 ?
>> >> > >>
>> >> > >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
>> >> > >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
>> >> > >> inserts are probably underestimated to the same extent, so hopefully
>> >> > >> things work out.
>> >> > >>
>> >> > >> So it's probably best to accept the costs as they're currently given.
>> >> > >> Changing them would need extensive testing.
>> >> > >>
>> >> > >> However, one of the advantages of the split is that it allows the
>> >> > >> subvectors to be built in parallel.  When optimising for speed,
>> >> > >> it might make sense to take the maximum of the subsequence costs
>> >> > >> and add the cost of the zip to that.
>> >> > > Hi Richard,
>> >> > > Thanks for the suggestions.
>> >> > > In the attached patch, it recurses only if nelts == 16 to punt for 64
>> >> > > -> 32 bit split,
>> >> >
>> >> > It should be based on the size rather than the number of elements.
>> >> > The example we talked about above involved building V8HIs from two
>> >> > V4HIs, which is also valid.
>> >> Right, sorry got mixed up. The attached patch punts if vector_size == 64 by
>> >> resorting to fallback, which handles V8HI cases.
>> >> For eg:
>> >> int16x8_t f(int16_t x)
>> >> {
>> >>   return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
>> >> }
>> >>
>> >> code-gen with patch:
>> >> f:
>> >>         dup     v0.4h, w0
>> >>         adrp    x0, .LC0
>> >>         ldr       d1, [x0, #:lo12:.LC0]
>> >>         zip1    v0.8h, v0.8h, v1.8h
>> >>         ret
>> >>
>> >> Just to clarify, we punt on 64 bit vector size, because there is no
>> >> 32-bit vector available,
>> >> to build 2 32-bit vectors for even and odd halves, and then "extend"
>> >> them with subreg ?
>>
>> Right.  And if we want to fix that, I think the starting point would
>> be to add (general) 32-bit vector support first.
>>
>> >> It also punts if n_elts < 8, because I am not sure
>> >> if it's profitable to do recursion+merging for 4 or lesser elements.
>> >> Does it look OK ?
>>
>> Splitting { x, y, x, y } should at least be a size win over 4 individual
>> moves/inserts.  Possibly a speed win too if x and y are in general
>> registers.
>>
>> So I think n_elts < 4 might be better.  If the costs get a case wrong,
>> we should fix the costs.
>>
>> >> > > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
>> >> > > computing total cost of the sequence.
>> >> > >
>> >> > > So, for following case:
>> >> > > int8x16_t f_s8(int8_t x)
>> >> > > {
>> >> > >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> >> > >                                 x, 5, x, 6, x, 7, x, 8 };
>> >> > > }
>> >> > >
>> >> > > it now generates:
>> >> > > f_s16:
>> >> > >         dup     v0.8b, w0
>> >> > >         adrp    x0, .LC0
>> >> > >         ldr       d1, [x0, #:lo12:.LC0]
>> >> > >         zip1    v0.16b, v0.16b, v1.16b
>> >> > >         ret
>> >> > >
>> >> > > Which I assume is correct, since zip1 will merge the lower halves of
>> >> > > two vectors while leaving the upper halves undefined ?
>> >> >
>> >> > Yeah, it looks valid, but I would say that zip1 ignores the upper halves
>> >> > (rather than leaving them undefined).
>> >> Yes, sorry for mis-phrasing.
>> >>
>> >> For the following test:
>> >> int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
>> >>                           int16_t x4, int16_t x5, int16_t x6, int16_t x7)
>> >> {
>> >>   return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
>> >> }
>> >>
>> >> it chose to go recursive+zip1 route since we take max (cost
>> >> (odd_init), cost (even_init)) and add
>> >> cost of zip1 insn which turns out to be lesser than cost of fallback:
>> >>
>> >> f_s16:
>> >>         sxth    w0, w0
>> >>         sxth    w1, w1
>> >>         fmov    d0, x0
>> >>         fmov    d1, x1
>> >>         ins     v0.h[1], w2
>> >>         ins     v1.h[1], w3
>> >>         ins     v0.h[2], w4
>> >>         ins     v1.h[2], w5
>> >>         ins     v0.h[3], w6
>> >>         ins     v1.h[3], w7
>> >>         zip1    v0.8h, v0.8h, v1.8h
>> >>         ret
>> >>
>> >> I assume that's OK since it has fewer dependencies compared to
>> >> fallback code-gen even if it's longer ?
>> >> With -Os the cost for sequence is taken as cost(odd_init) +
>> >> cost(even_init) + cost(zip1_insn)
>> >> which turns out to be same as cost for fallback sequence and it
>> >> generates the fallback code-sequence:
>> >>
>> >> f_s16:
>> >>         sxth    w0, w0
>> >>         fmov    s0, w0
>> >>         ins     v0.h[1], w1
>> >>         ins     v0.h[2], w2
>> >>         ins     v0.h[3], w3
>> >>         ins     v0.h[4], w4
>> >>         ins     v0.h[5], w5
>> >>         ins     v0.h[6], w6
>> >>         ins     v0.h[7], w7
>> >>         ret
>> >>
>> > Forgot to remove the hunk handling interleaving case, done in the
>> > attached patch.
>> >
>> > Thanks,
>> > Prathamesh
>> >> Thanks,
>> >> Prathamesh
>> >> >
>> >> > Thanks,
>> >> > Richard
>> >
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index acc0cfe5f94..dd2a64d2e4e 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
>> >     initialised to contain VALS.  */
>> >
>> >  void
>> > -aarch64_expand_vector_init (rtx target, rtx vals)
>> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>>
>> The comment needs to be updated.  Maybe:
>>
>> /* A subroutine of aarch64_expand_vector_init, with the same interface.
>>    The caller has already tried a divide-and-conquer approach, so do
>>    not consider that case here.  */
>>
>> >  {
>> >    machine_mode mode = GET_MODE (target);
>> >    scalar_mode inner_mode = GET_MODE_INNER (mode);
>> > @@ -22036,38 +22036,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>> >        return;
>> >      }
>> >
>> > -  /* Check for interleaving case.
>> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
>> > -     Generate following code:
>> > -     dup v0.h, x
>> > -     dup v1.h, y
>> > -     zip1 v0.h, v0.h, v1.h
>> > -     for "large enough" initializer.  */
>> > -
>> > -  if (n_elts >= 8)
>> > -    {
>> > -      int i;
>> > -      for (i = 2; i < n_elts; i++)
>> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
>> > -       break;
>> > -
>> > -      if (i == n_elts)
>> > -     {
>> > -       machine_mode mode = GET_MODE (target);
>> > -       rtx dest[2];
>> > -
>> > -       for (int i = 0; i < 2; i++)
>> > -         {
>> > -           rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
>> > -           dest[i] = force_reg (mode, x);
>> > -         }
>> > -
>> > -       rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> > -       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> > -       return;
>> > -     }
>> > -    }
>> > -
>> >    enum insn_code icode = optab_handler (vec_set_optab, mode);
>> >    gcc_assert (icode != CODE_FOR_nothing);
>> >
>> > @@ -22189,7 +22157,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>> >           }
>> >         XVECEXP (copy, 0, i) = subst;
>> >       }
>> > -      aarch64_expand_vector_init (target, copy);
>> > +      aarch64_expand_vector_init_fallback (target, copy);
>> >      }
>> >
>> >    /* Insert the variable lanes directly.  */
>> > @@ -22203,6 +22171,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>> >      }
>> >  }
>> >
>> > +DEBUG_FUNCTION
>> > +static void
>> > +aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
>> > +{
>> > +  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
>> > +  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
>> > +    {
>> > +      debug_rtx (PATTERN (i));
>> > +      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
>> > +    }
>> > +}
>>
>> I'm not sure we should commit this to the tree.
>>
>> > +
>> > +static rtx
>> > +aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
>>
>> How about calling this aarch64_unzip_vector_init?  It needs a function
>> comment.
>>
>> > +{
>> > +  int n = XVECLEN (vals, 0);
>> > +  machine_mode new_mode
>> > +    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
>>
>> IMO it would be better to use "GET_MODE_BITSIZE (mode).to_constant () / 2"
>> or "GET_MODE_UNIT_BITSIZE (mode) * n / 2" for the second argument.
>>
>> > +  rtvec vec = rtvec_alloc (n / 2);
>> > +  for (int i = 0; i < n; i++)
>> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
>> > +                               : XVECEXP (vals, 0, 2 * i + 1);
>> > +  return gen_rtx_PARALLEL (new_mode, vec);
>> > +}
>> > +
>> > +/*
>> > +The function does the following:
>> > +(a) Generates code sequence by splitting VALS into even and odd halves,
>> > +    and recursively calling itself to initialize them and then merge using
>> > +    zip1.
>> > +(b) Generate code sequence directly using aarch64_expand_vector_init_fallback.
>> > +(c) Compare the cost of code sequences generated by (a) and (b), and choose
>> > +    the more efficient one.
>> > +*/
>>
>> I think we should keep the current description of the interface,
>> before the describing the implementation:
>>
>> /* Expand a vector initialization sequence, such that TARGET is
>>    initialized to contain VALS.  */
>>
>> (includes an s/s/z/).
>>
>> And it's probably better to describe the implementation inside
>> the function.
>>
>> Most comments are written in imperative style, so how about:
>>
>>   /* Try decomposing the initializer into even and odd halves and
>>      then ZIP them together.  Use the resulting sequence if it is
>>      strictly cheaper than loading VALS directly.
>>
>>      Prefer the fallback sequence in the event of a tie, since it
>>      will tend to use fewer registers.  */
>>
>> > +
>> > +void
>> > +aarch64_expand_vector_init (rtx target, rtx vals)
>> > +{
>> > +  machine_mode mode = GET_MODE (target);
>> > +  int n_elts = XVECLEN (vals, 0);
>> > +
>> > +  if (n_elts < 8
>> > +      || known_eq (GET_MODE_BITSIZE (mode), 64))
>>
>> Might be more robust to test maybe_ne (GET_MODE_BITSIZE (mode), 128)
>>
>> > +    {
>> > +      aarch64_expand_vector_init_fallback (target, vals);
>> > +      return;
>> > +    }
>> > +
>> > +  start_sequence ();
>> > +  rtx dest[2];
>> > +  unsigned costs[2];
>> > +  for (int i = 0; i < 2; i++)
>> > +    {
>> > +      start_sequence ();
>> > +      dest[i] = gen_reg_rtx (mode);
>> > +      rtx new_vals
>> > +     = aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
>> > +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
>> > +      aarch64_expand_vector_init (tmp_reg, new_vals);
>> > +      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
>>
>> Maybe "src" or "halves" would be a better name than "dest", given that
>> the rtx isn't actually the destination of the subsequence.
>>
>> > +      rtx_insn *rec_seq = get_insns ();
>> > +      end_sequence ();
>> > +      costs[i] = seq_cost (rec_seq, !optimize_size);
>> > +      emit_insn (rec_seq);
>> > +    }
>> > +
>> > +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
>> > +  rtx_insn *zip1_insn
>> > +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
>> > +  unsigned seq_total_cost
>> > +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
>>
>> This is the wrong way round: max should be for speed and addition
>> for size.
> I assumed !optimize_size meant optimizing for speed ?
> So (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1]
> would imply taking the max of the two for speed and addition for size, or
> am I misunderstanding ?

Ah, sorry, I misread.  But IMO it would be more natural as:

  optimize_size ? ... : ...;
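
i.e. something like (just the two arms of the ternary in your patch
swapped round):

  unsigned seq_total_cost
    = optimize_size ? costs[0] + costs[1] : std::max (costs[0], costs[1]);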

> I have done the rest of the changes in the attached patch.
>
> Thanks,
> Prathamesh
>>
>> Thanks,
>> Richard
>>
>> > +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
>> > +
>> > +  rtx_insn *seq = get_insns ();
>> > +  end_sequence ();
>> > +
>> > +  start_sequence ();
>> > +  aarch64_expand_vector_init_fallback (target, vals);
>> > +  rtx_insn *fallback_seq = get_insns ();
>> > +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
>> > +  end_sequence ();
>> > +
>> > +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
>> > +}
>> > +
>> >  /* Emit RTL corresponding to:
>> >     insr TARGET, ELEM.  */
>> >
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
>> > similarity index 82%
>> > rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> > rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
>> > index ee775048589..e812d3946de 100644
>> > --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
>> > @@ -7,8 +7,8 @@
>> >  /*
>> >  ** foo:
>> >  **   ...
>> > -**   dup     v[0-9]+\.8h, w[0-9]+
>> > -**   dup     v[0-9]+\.8h, w[0-9]+
>> > +**   dup     v[0-9]+\.4h, w[0-9]+
>> > +**   dup     v[0-9]+\.4h, w[0-9]+
>> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> >  **   ...
>> >  **   ret
>> > @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
>> >  /*
>> >  ** foo2:
>> >  **   ...
>> > -**   dup     v[0-9]+\.8h, w[0-9]+
>> > -**   movi    v[0-9]+\.8h, 0x1
>> > +**   dup     v[0-9]+\.4h, w[0-9]+
>> > +**   movi    v[0-9]+\.4h, 0x1
>> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> >  **   ...
>> >  **   ret
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
>> > new file mode 100644
>> > index 00000000000..e28fdcda29d
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
>> > @@ -0,0 +1,21 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > +
>> > +#include <arm_neon.h>
>> > +
>> > +/*
>> > +** f_s8:
>> > +**   ...
>> > +**   dup     v[0-9]+\.8b, w[0-9]+
>> > +**   adrp    x[0-9]+, \.LC[0-9]+
>> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
>> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
>> > +**   ret
>> > +*/
>> > +
>> > +int8x16_t f_s8(int8_t x)
>> > +{
>> > +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
>> > +                       x, 5, x, 6, x, 7, x, 8 };
>> > +}
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
>> > new file mode 100644
>> > index 00000000000..9366ca349b6
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
>> > @@ -0,0 +1,22 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > +
>> > +#include <arm_neon.h>
>> > +
>> > +/*
>> > +** f_s8:
>> > +**   ...
>> > +**   adrp    x[0-9]+, \.LC[0-9]+
>> > +**   dup     v[0-9]+\.8b, w[0-9]+
>> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
>> > +**   ins     v0\.b\[0\], w0
>> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
>> > +**   ret
>> > +*/
>> > +
>> > +int8x16_t f_s8(int8_t x, int8_t y)
>> > +{
>> > +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
>> > +                       4, y, 5, y, 6, y, 7, y };
>> > +}
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
>> > new file mode 100644
>> > index 00000000000..e16459486d7
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
>> > @@ -0,0 +1,22 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > +
>> > +#include <arm_neon.h>
>> > +
>> > +/*
>> > +** f_s8:
>> > +**   ...
>> > +**   adrp    x[0-9]+, \.LC[0-9]+
>> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
>> > +**   ins     v0\.b\[0\], w0
>> > +**   ins     v0\.b\[1\], w1
>> > +**   ...
>> > +**   ret
>> > +*/
>> > +
>> > +int8x16_t f_s8(int8_t x, int8_t y)
>> > +{
>> > +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
>> > +                       7, 8, 9, 10, 11, 12, 13, 14 };
>> > +}
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
>> > new file mode 100644
>> > index 00000000000..8f35854c008
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
>> > @@ -0,0 +1,24 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-Os" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > +
>> > +/* Verify that fallback code-sequence is chosen over
>> > +   recursively generated code-sequence merged with zip1.  */
>> > +
>> > +/*
>> > +** f_s16:
>> > +**   ...
>> > +**   sxth    w0, w0
>> > +**   fmov    s0, w0
>> > +**   ins     v0\.h\[1\], w1
>> > +**   ins     v0\.h\[2\], w2
>> > +**   ins     v0\.h\[3\], w3
>> > +**   ins     v0\.h\[4\], w4
>> > +**   ins     v0\.h\[5\], w5
>> > +**   ins     v0\.h\[6\], w6
>> > +**   ins     v0\.h\[7\], w7
>> > +**   ...
>> > +**   ret
>> > +*/
>> > +
>> > +#include "vec-init-22.h"
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
>> > new file mode 100644
>> > index 00000000000..172d56ffdf1
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
>> > @@ -0,0 +1,27 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3" } */
>> > +/* { dg-final { check-function-bodies "**" "" "" } } */
>> > +
>> > +/* Verify that we recursively generate code for even and odd halves
>> > +   instead of fallback code. This is so despite the longer code-gen
>> > +   because it has fewer dependencies and thus has lesser cost.  */
>> > +
>> > +/*
>> > +** f_s16:
>> > +**   ...
>> > +**   sxth    w0, w0
>> > +**   sxth    w1, w1
>> > +**   fmov    d0, x0
>> > +**   fmov    d1, x1
>> > +**   ins     v[0-9]+\.h\[1\], w2
>> > +**   ins     v[0-9]+\.h\[1\], w3
>> > +**   ins     v[0-9]+\.h\[2\], w4
>> > +**   ins     v[0-9]+\.h\[2\], w5
>> > +**   ins     v[0-9]+\.h\[3\], w6
>> > +**   ins     v[0-9]+\.h\[3\], w7
>> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>> > +**   ...
>> > +**   ret
>> > +*/
>> > +
>> > +#include "vec-init-22.h"
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
>> > new file mode 100644
>> > index 00000000000..15b889d4097
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
>> > @@ -0,0 +1,7 @@
>> > +#include <arm_neon.h>
>> > +
>> > +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
>> > +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
>> > +{
>> > +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
>> > +}
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index acc0cfe5f94..94cc4338678 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -21972,11 +21972,12 @@ aarch64_simd_make_constant (rtx vals)
>      return NULL_RTX;
>  }
>  
> -/* Expand a vector initialisation sequence, such that TARGET is
> -   initialised to contain VALS.  */
> +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> +   The caller has already tried a divide-and-conquer approach, so do
> +   not consider that case here.  */
>  
>  void
> -aarch64_expand_vector_init (rtx target, rtx vals)
> +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>  {
>    machine_mode mode = GET_MODE (target);
>    scalar_mode inner_mode = GET_MODE_INNER (mode);
> @@ -22036,38 +22037,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>  
> -  /* Check for interleaving case.
> -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> -     Generate following code:
> -     dup v0.h, x
> -     dup v1.h, y
> -     zip1 v0.h, v0.h, v1.h
> -     for "large enough" initializer.  */
> -
> -  if (n_elts >= 8)
> -    {
> -      int i;
> -      for (i = 2; i < n_elts; i++)
> -	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -	  break;
> -
> -      if (i == n_elts)
> -	{
> -	  machine_mode mode = GET_MODE (target);
> -	  rtx dest[2];
> -
> -	  for (int i = 0; i < 2; i++)
> -	    {
> -	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> -	      dest[i] = force_reg (mode, x);
> -	    }
> -
> -	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> -	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> -	  return;
> -	}
> -    }
> -
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>  
> @@ -22189,7 +22158,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>  	    }
>  	  XVECEXP (copy, 0, i) = subst;
>  	}
> -      aarch64_expand_vector_init (target, copy);
> +      aarch64_expand_vector_init_fallback (target, copy);
>      }
>  
>    /* Insert the variable lanes directly.  */
> @@ -22203,6 +22172,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>      }
>  }
>  
> +/* Return even or odd half of VALS depending on EVEN_P.  */
> +
> +static rtx
> +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> +{
> +  int n = XVECLEN (vals, 0);
> +  machine_mode new_mode
> +    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
> +				   GET_MODE_BITSIZE (mode).to_constant () / 2);
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n / 2; i++)
> +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +				  : XVECEXP (vals, 0, 2 * i + 1);
> +  return gen_rtx_PARALLEL (new_mode, vec);
> +}
> +
> +/* Expand a vector initialisation sequence, such that TARGET is

initialization

It would be good to add -fno-schedule-insns -fno-schedule-insns2
to the tests' dg-options (or use -O instead of -O3 for the -O3 tests,
if that works).
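
For the -O3 tests that would mean something like (sketch):

  /* { dg-options "-O3 -fno-schedule-insns -fno-schedule-insns2" } */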

OK for stage 1 with those changes, thanks.

Richard

> +   initialized to contain VALS.  */
> +
> +void
> +aarch64_expand_vector_init (rtx target, rtx vals)
> +{
> +  /* Try decomposing the initializer into even and odd halves and
> +     then ZIP them together.  Use the resulting sequence if it is
> +     strictly cheaper than loading VALS directly.
> +
> +     Prefer the fallback sequence in the event of a tie, since it
> +     will tend to use fewer registers.  */
> +
> +  machine_mode mode = GET_MODE (target);
> +  int n_elts = XVECLEN (vals, 0);
> +
> +  if (n_elts < 4
> +      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> +    {
> +      aarch64_expand_vector_init_fallback (target, vals);
> +      return;
> +    }
> +
> +  start_sequence ();
> +  rtx halves[2];
> +  unsigned costs[2];
> +  for (int i = 0; i < 2; i++)
> +    {
> +      start_sequence ();
> +      rtx new_vals
> +	= aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);
> +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> +      aarch64_expand_vector_init (tmp_reg, new_vals);
> +      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
> +      rtx_insn *rec_seq = get_insns ();
> +      end_sequence ();
> +      costs[i] = seq_cost (rec_seq, !optimize_size);
> +      emit_insn (rec_seq);
> +    }
> +
> +  rtvec v = gen_rtvec (2, halves[0], halves[1]);
> +  rtx_insn *zip1_insn
> +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +  unsigned seq_total_cost
> +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
> +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> +
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  start_sequence ();
> +  aarch64_expand_vector_init_fallback (target, vals);
> +  rtx_insn *fallback_seq = get_insns ();
> +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> +  end_sequence ();
> +
> +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> +}
> +
>  /* Emit RTL corresponding to:
>     insr TARGET, ELEM.  */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> similarity index 82%
> rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> index ee775048589..e812d3946de 100644
> --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -7,8 +7,8 @@
>  /*
>  ** foo:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	dup	v[0-9]+\.8h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
>  /*
>  ** foo2:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	movi	v[0-9]+\.8h, 0x1
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	movi	v[0-9]+\.4h, 0x1
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> new file mode 100644
> index 00000000000..e28fdcda29d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x)
> +{
> +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> +                       x, 5, x, 6, x, 7, x, 8 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> new file mode 100644
> index 00000000000..9366ca349b6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> +                       4, y, 5, y, 6, y, 7, y };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> new file mode 100644
> index 00000000000..e16459486d7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	ins	v0\.b\[1\], w1
> +**	...
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> +                       7, 8, 9, 10, 11, 12, 13, 14 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> new file mode 100644
> index 00000000000..8f35854c008
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that fallback code-sequence is chosen over
> +   recursively generated code-sequence merged with zip1.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	fmov	s0, w0
> +**	ins	v0\.h\[1\], w1
> +**	ins	v0\.h\[2\], w2
> +**	ins	v0\.h\[3\], w3
> +**	ins	v0\.h\[4\], w4
> +**	ins	v0\.h\[5\], w5
> +**	ins	v0\.h\[6\], w6
> +**	ins	v0\.h\[7\], w7
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> new file mode 100644
> index 00000000000..172d56ffdf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that we recursively generate code for even and odd halves
> +   instead of fallback code. This is so despite the longer code-gen
> +   because it has fewer dependencies and thus has lesser cost.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	sxth	w1, w1
> +**	fmov	d0, x0
> +**	fmov	d1, x1
> +**	ins	v[0-9]+\.h\[1\], w2
> +**	ins	v[0-9]+\.h\[1\], w3
> +**	ins	v[0-9]+\.h\[2\], w4
> +**	ins	v[0-9]+\.h\[2\], w5
> +**	ins	v[0-9]+\.h\[3\], w6
> +**	ins	v[0-9]+\.h\[3\], w7
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> new file mode 100644
> index 00000000000..15b889d4097
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> @@ -0,0 +1,7 @@
> +#include <arm_neon.h>
> +
> +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> +{
> +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> +}
  
Prathamesh Kulkarni Feb. 11, 2023, 9:12 a.m. UTC | #17
On Mon, 6 Feb 2023 at 17:43, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Fri, 3 Feb 2023 at 20:47, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > On Fri, 3 Feb 2023 at 07:10, Prathamesh Kulkarni
> >> > <prathamesh.kulkarni@linaro.org> wrote:
> >> >>
> >> >> On Thu, 2 Feb 2023 at 20:50, Richard Sandiford
> >> >> <richard.sandiford@arm.com> wrote:
> >> >> >
> >> >> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> >> > >> >> > I have attached a patch that extends the transform if one half is dup
> >> >> > >> >> > and other is set of constants.
> >> >> > >> >> > For eg:
> >> >> > >> >> > int8x16_t f(int8_t x)
> >> >> > >> >> > {
> >> >> > >> >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x, 8 };
> >> >> > >> >> > }
> >> >> > >> >> >
> >> >> > >> >> > code-gen trunk:
> >> >> > >> >> > f:
> >> >> > >> >> >         adrp    x1, .LC0
> >> >> > >> >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> >> > >> >> >         ins     v0.b[0], w0
> >> >> > >> >> >         ins     v0.b[2], w0
> >> >> > >> >> >         ins     v0.b[4], w0
> >> >> > >> >> >         ins     v0.b[6], w0
> >> >> > >> >> >         ins     v0.b[8], w0
> >> >> > >> >> >         ins     v0.b[10], w0
> >> >> > >> >> >         ins     v0.b[12], w0
> >> >> > >> >> >         ins     v0.b[14], w0
> >> >> > >> >> >         ret
> >> >> > >> >> >
> >> >> > >> >> > code-gen with patch:
> >> >> > >> >> > f:
> >> >> > >> >> >         dup     v0.16b, w0
> >> >> > >> >> >         adrp    x0, .LC0
> >> >> > >> >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> >> > >> >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >> > >> >> >         ret
> >> >> > >> >> >
> >> >> > >> >> > Bootstrapped+tested on aarch64-linux-gnu.
> >> >> > >> >> > Does it look OK ?
> >> >> > >> >>
> >> >> > >> >> Looks like a nice improvement.  It'll need to wait for GCC 14 now though.
> >> >> > >> >>
> >> >> > >> >> However, rather than handle this case specially, I think we should instead
> >> >> > >> >> take a divide-and-conquer approach: split the initialiser into even and
> >> >> > >> >> odd elements, find the best way of loading each part, then compare the
> >> >> > >> >> cost of these sequences + ZIP with the cost of the fallback code (the code
> >> >> > >> >> later in aarch64_expand_vector_init).
> >> >> > >> >>
> >> >> > >> >> For example, doing that would allow:
> >> >> > >> >>
> >> >> > >> >>   { x, y, 0, y, 0, y, 0, y, 0, y }
> >> >> > >> >>
> >> >> > >> >> to be loaded more easily, even though the even elements aren't wholly
> >> >> > >> >> constant.
> >> >> > >> > Hi Richard,
> >> >> > >> > I have attached a prototype patch based on the above approach.
> >> >> > >> > It subsumes specializing for above {x, y, x, y, x, y, x, y} case by generating
> >> >> > >> > same sequence, thus I removed that hunk, and improves the following cases:
> >> >> > >> >
> >> >> > >> > (a)
> >> >> > >> > int8x16_t f_s16(int8_t x)
> >> >> > >> > {
> >> >> > >> >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> >> > >> >                                  x, 5, x, 6, x, 7, x, 8 };
> >> >> > >> > }
> >> >> > >> >
> >> >> > >> > code-gen trunk:
> >> >> > >> > f_s16:
> >> >> > >> >         adrp    x1, .LC0
> >> >> > >> >         ldr     q0, [x1, #:lo12:.LC0]
> >> >> > >> >         ins     v0.b[0], w0
> >> >> > >> >         ins     v0.b[2], w0
> >> >> > >> >         ins     v0.b[4], w0
> >> >> > >> >         ins     v0.b[6], w0
> >> >> > >> >         ins     v0.b[8], w0
> >> >> > >> >         ins     v0.b[10], w0
> >> >> > >> >         ins     v0.b[12], w0
> >> >> > >> >         ins     v0.b[14], w0
> >> >> > >> >         ret
> >> >> > >> >
> >> >> > >> > code-gen with patch:
> >> >> > >> > f_s16:
> >> >> > >> >         dup     v0.16b, w0
> >> >> > >> >         adrp    x0, .LC0
> >> >> > >> >         ldr     q1, [x0, #:lo12:.LC0]
> >> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >> > >> >         ret
> >> >> > >> >
> >> >> > >> > (b)
> >> >> > >> > int8x16_t f_s16(int8_t x, int8_t y)
> >> >> > >> > {
> >> >> > >> >   return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> >> > >> >                                 4, y, 5, y, 6, y, 7, y };
> >> >> > >> > }
> >> >> > >> >
> >> >> > >> > code-gen trunk:
> >> >> > >> > f_s16:
> >> >> > >> >         adrp    x2, .LC0
> >> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> >> > >> >         ins     v0.b[0], w0
> >> >> > >> >         ins     v0.b[1], w1
> >> >> > >> >         ins     v0.b[3], w1
> >> >> > >> >         ins     v0.b[5], w1
> >> >> > >> >         ins     v0.b[7], w1
> >> >> > >> >         ins     v0.b[9], w1
> >> >> > >> >         ins     v0.b[11], w1
> >> >> > >> >         ins     v0.b[13], w1
> >> >> > >> >         ins     v0.b[15], w1
> >> >> > >> >         ret
> >> >> > >> >
> >> >> > >> > code-gen patch:
> >> >> > >> > f_s16:
> >> >> > >> >         adrp    x2, .LC0
> >> >> > >> >         dup     v1.16b, w1
> >> >> > >> >         ldr     q0, [x2, #:lo12:.LC0]
> >> >> > >> >         ins     v0.b[0], w0
> >> >> > >> >         zip1    v0.16b, v0.16b, v1.16b
> >> >> > >> >         ret
> >> >> > >>
> >> >> > >> Nice.
> >> >> > >>
> >> >> > >> > There are a couple of issues I have come across:
> >> >> > >> > (1) Choosing element to pad vector.
> >> >> > >> > For eg, if we are initiailizing a vector say { x, y, 0, y, 1, y, 2, y }
> >> >> > >> > with mode V8HI.
> >> >> > >> > We split it into { x, 0, 1, 2 } and { y, y, y, y}
> >> >> > >> > However since the mode is V8HI, we would need to pad the above split vectors
> >> >> > >> > with 4 more elements to match up to vector length.
> >> >> > >> > For {x, 0, 1, 2} using any constant is the obvious choice while for {y, y, y, y}
> >> >> > >> > using 'y' is the obvious choice thus making them:
> >> >> > >> > {x, 0, 1, 2, 0, 0, 0, 0} and {y, y, y, y, y, y, y, y}
> >> >> > >> > These would be then merged using zip1 which would discard the lower half
> >> >> > >> > of both vectors.
> >> >> > >> > Currently I encoded the above two heuristics in
> >> >> > >> > aarch64_expand_vector_init_get_padded_elem:
> >> >> > >> > (a) If split portion contains a constant, use the constant to pad the vector.
> >> >> > >> > (b) If split portion only contains variables, then use the most
> >> >> > >> > frequently repeating variable
> >> >> > >> > to pad the vector.
> >> >> > >> > I suppose tho this could be improved ?
> >> >> > >>
> >> >> > >> I think we should just build two 64-bit vectors (V4HIs) and use a subreg
> >> >> > >> to fill the upper elements with undefined values.
> >> >> > >>
> >> >> > >> I suppose in principle we would have the same problem when splitting
> >> >> > >> a 64-bit vector into 2 32-bit vectors, but it's probably better to punt
> >> >> > >> on that for now.  Eventually it would be worth adding full support for
> >> >> > >> 32-bit Advanced SIMD modes (with necessary restrictions for FP exceptions)
> >> >> > >> but it's quite a big task.  The 128-bit to 64-bit split is the one that
> >> >> > >> matters most.
> >> >> > >>
> >> >> > >> > (2) Setting cost for zip1:
> >> >> > >> > Currently it returns 4 as cost for following zip1 insn:
> >> >> > >> > (set (reg:V8HI 102)
> >> >> > >> >     (unspec:V8HI [
> >> >> > >> >             (reg:V8HI 103)
> >> >> > >> >             (reg:V8HI 108)
> >> >> > >> >         ] UNSPEC_ZIP1))
> >> >> > >> > I am not sure if that's correct, or if not, what cost to use in this case
> >> >> > >> > for zip1 ?
> >> >> > >>
> >> >> > >> TBH 4 seems a bit optimistic.  It's COSTS_N_INSNS (1), whereas the
> >> >> > >> generic advsimd_vec_cost::permute_cost is 2 insns.  But the costs of
> >> >> > >> inserts are probably underestimated to the same extent, so hopefully
> >> >> > >> things work out.
> >> >> > >>
> >> >> > >> So it's probably best to accept the costs as they're currently given.
> >> >> > >> Changing them would need extensive testing.
> >> >> > >>
> >> >> > >> However, one of the advantages of the split is that it allows the
> >> >> > >> subvectors to be built in parallel.  When optimising for speed,
> >> >> > >> it might make sense to take the maximum of the subsequence costs
> >> >> > >> and add the cost of the zip to that.
> >> >> > > Hi Richard,
> >> >> > > Thanks for the suggestions.
> >> >> > > In the attached patch, it recurses only if nelts == 16 to punt for 64
> >> >> > > -> 32 bit split,
> >> >> >
> >> >> > It should be based on the size rather than the number of elements.
> >> >> > The example we talked about above involved building V8HIs from two
> >> >> > V4HIs, which is also valid.
> >> >> Right, sorry got mixed up. The attached patch punts if vector_size == 64 by
> >> >> resorting to fallback, which handles V8HI cases.
> >> >> For eg:
> >> >> int16x8_t f(int16_t x)
> >> >> {
> >> >>   return (int16x8_t) { x, 1, x, 2, x, 3, x, 4 };
> >> >> }
> >> >>
> >> >> code-gen with patch:
> >> >> f:
> >> >>         dup     v0.4h, w0
> >> >>         adrp    x0, .LC0
> >> >>         ldr       d1, [x0, #:lo12:.LC0]
> >> >>         zip1    v0.8h, v0.8h, v1.8h
> >> >>         ret
> >> >>
> >> >> Just to clarify, we punt on 64 bit vector size, because there is no
> >> >> 32-bit vector available,
> >> >> to build 2 32-bit vectors for even and odd halves, and then "extend"
> >> >> them with subreg ?
> >>
> >> Right.  And if we want to fix that, I think the starting point would
> >> be to add (general) 32-bit vector support first.
> >>
> >> >> It also punts if n_elts < 8, because I am not sure
> >> >> if it's profitable to do recursion+merging for 4 or fewer elements.
> >> >> Does it look OK ?
> >>
> >> Splitting { x, y, x, y } should at least be a size win over 4 individual
> >> moves/inserts.  Possibly a speed win too if x and y are in general
> >> registers.
> >>
> >> So I think n_elts < 4 might be better.  If the costs get a case wrong,
> >> we should fix the costs.
> >>
> >> >> > > and uses std::max(even_init, odd_init) + insn_cost (zip1_insn) for
> >> >> > > computing total cost of the sequence.
> >> >> > >
> >> >> > > So, for following case:
> >> >> > > int8x16_t f_s8(int8_t x)
> >> >> > > {
> >> >> > >   return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> >> > >                                 x, 5, x, 6, x, 7, x, 8 };
> >> >> > > }
> >> >> > >
> >> >> > > it now generates:
> >> >> > > f_s16:
> >> >> > >         dup     v0.8b, w0
> >> >> > >         adrp    x0, .LC0
> >> >> > >         ldr       d1, [x0, #:lo12:.LC0]
> >> >> > >         zip1    v0.16b, v0.16b, v1.16b
> >> >> > >         ret
> >> >> > >
> >> >> > > Which I assume is correct, since zip1 will merge the lower halves of
> >> >> > > two vectors while leaving the upper halves undefined ?
> >> >> >
> >> >> > Yeah, it looks valid, but I would say that zip1 ignores the upper halves
> >> >> > (rather than leaving them undefined).
> >> >> Yes, sorry for mis-phrasing.
> >> >>
> >> >> For the following test:
> >> >> int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> >> >>                           int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> >> >> {
> >> >>   return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> >> >> }
> >> >>
> >> >> it chose to go the recursive+zip1 route, since we take max (cost
> >> >> (odd_init), cost (even_init)) and add
> >> >> the cost of the zip1 insn, which turns out to be less than the cost of the fallback:
> >> >>
> >> >> f_s16:
> >> >>         sxth    w0, w0
> >> >>         sxth    w1, w1
> >> >>         fmov    d0, x0
> >> >>         fmov    d1, x1
> >> >>         ins     v0.h[1], w2
> >> >>         ins     v1.h[1], w3
> >> >>         ins     v0.h[2], w4
> >> >>         ins     v1.h[2], w5
> >> >>         ins     v0.h[3], w6
> >> >>         ins     v1.h[3], w7
> >> >>         zip1    v0.8h, v0.8h, v1.8h
> >> >>         ret
> >> >>
> >> >> I assume that's OK since it has fewer dependencies compared to
> >> >> fallback code-gen even if it's longer ?
> >> >> With -Os the cost for the sequence is taken as cost(odd_init) +
> >> >> cost(even_init) + cost(zip1_insn),
> >> >> which turns out to be the same as the cost of the fallback sequence,
> >> >> and it generates the fallback code-sequence:
> >> >>
> >> >> f_s16:
> >> >>         sxth    w0, w0
> >> >>         fmov    s0, w0
> >> >>         ins     v0.h[1], w1
> >> >>         ins     v0.h[2], w2
> >> >>         ins     v0.h[3], w3
> >> >>         ins     v0.h[4], w4
> >> >>         ins     v0.h[5], w5
> >> >>         ins     v0.h[6], w6
> >> >>         ins     v0.h[7], w7
> >> >>         ret
> >> >>
> >> > Forgot to remove the hunk handling interleaving case, done in the
> >> > attached patch.
> >> >
> >> > Thanks,
> >> > Prathamesh
> >> >> Thanks,
> >> >> Prathamesh
> >> >> >
> >> >> > Thanks,
> >> >> > Richard
> >> >
> >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> >> > index acc0cfe5f94..dd2a64d2e4e 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ b/gcc/config/aarch64/aarch64.cc
> >> > @@ -21976,7 +21976,7 @@ aarch64_simd_make_constant (rtx vals)
> >> >     initialised to contain VALS.  */
> >> >
> >> >  void
> >> > -aarch64_expand_vector_init (rtx target, rtx vals)
> >> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
> >>
> >> The comment needs to be updated.  Maybe:
> >>
> >> /* A subroutine of aarch64_expand_vector_init, with the same interface.
> >>    The caller has already tried a divide-and-conquer approach, so do
> >>    not consider that case here.  */
> >>
> >> >  {
> >> >    machine_mode mode = GET_MODE (target);
> >> >    scalar_mode inner_mode = GET_MODE_INNER (mode);
> >> > @@ -22036,38 +22036,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> >        return;
> >> >      }
> >> >
> >> > -  /* Check for interleaving case.
> >> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> >> > -     Generate following code:
> >> > -     dup v0.h, x
> >> > -     dup v1.h, y
> >> > -     zip1 v0.h, v0.h, v1.h
> >> > -     for "large enough" initializer.  */
> >> > -
> >> > -  if (n_elts >= 8)
> >> > -    {
> >> > -      int i;
> >> > -      for (i = 2; i < n_elts; i++)
> >> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> >> > -       break;
> >> > -
> >> > -      if (i == n_elts)
> >> > -     {
> >> > -       machine_mode mode = GET_MODE (target);
> >> > -       rtx dest[2];
> >> > -
> >> > -       for (int i = 0; i < 2; i++)
> >> > -         {
> >> > -           rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> >> > -           dest[i] = force_reg (mode, x);
> >> > -         }
> >> > -
> >> > -       rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> > -       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> > -       return;
> >> > -     }
> >> > -    }
> >> > -
> >> >    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >> >    gcc_assert (icode != CODE_FOR_nothing);
> >> >
> >> > @@ -22189,7 +22157,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> >           }
> >> >         XVECEXP (copy, 0, i) = subst;
> >> >       }
> >> > -      aarch64_expand_vector_init (target, copy);
> >> > +      aarch64_expand_vector_init_fallback (target, copy);
> >> >      }
> >> >
> >> >    /* Insert the variable lanes directly.  */
> >> > @@ -22203,6 +22171,91 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >> >      }
> >> >  }
> >> >
> >> > +DEBUG_FUNCTION
> >> > +static void
> >> > +aarch64_expand_vector_init_debug_seq (rtx_insn *seq, const char *s)
> >> > +{
> >> > +  fprintf (stderr, "%s: %u\n", s, seq_cost (seq, !optimize_size));
> >> > +  for (rtx_insn *i = seq; i; i = NEXT_INSN (i))
> >> > +    {
> >> > +      debug_rtx (PATTERN (i));
> >> > +      fprintf (stderr, "cost: %d\n", pattern_cost (PATTERN (i), !optimize_size));
> >> > +    }
> >> > +}
> >>
> >> I'm not sure we should commit this to the tree.
> >>
> >> > +
> >> > +static rtx
> >> > +aarch64_expand_vector_init_split_vals (machine_mode mode, rtx vals, bool even_p)
> >>
> >> How about calling this aarch64_unzip_vector_init?  It needs a function
> >> comment.
> >>
> >> > +{
> >> > +  int n = XVECLEN (vals, 0);
> >> > +  machine_mode new_mode
> >> > +    = aarch64_simd_container_mode (GET_MODE_INNER (mode), 64);
> >>
> >> IMO it would be better to use "GET_MODE_BITSIZE (mode).to_constant () / 2"
> >> or "GET_MODE_UNIT_BITSIZE (mode) * n / 2" for the second argument.
> >>
> >> > +  rtvec vec = rtvec_alloc (n / 2);
> >> > +  for (int i = 0; i < n / 2; i++)
> >> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> >> > +                               : XVECEXP (vals, 0, 2 * i + 1);
> >> > +  return gen_rtx_PARALLEL (new_mode, vec);
> >> > +}
> >> > +
> >> > +/*
> >> > +The function does the following:
> >> > +(a) Generates code sequence by splitting VALS into even and odd halves,
> >> > +    and recursively calling itself to initialize them and then merge using
> >> > +    zip1.
> >> > +(b) Generate code sequence directly using aarch64_expand_vector_init_fallback.
> >> > +(c) Compare the cost of code sequences generated by (a) and (b), and choose
> >> > +    the more efficient one.
> >> > +*/
> >>
> >> I think we should keep the current description of the interface,
> >> before the describing the implementation:
> >>
> >> /* Expand a vector initialization sequence, such that TARGET is
> >>    initialized to contain VALS.  */
> >>
> >> (includes an s/s/z/).
> >>
> >> And it's probably better to describe the implementation inside
> >> the function.
> >>
> >> Most comments are written in imperative style, so how about:
> >>
> >>   /* Try decomposing the initializer into even and odd halves and
> >>      then ZIP them together.  Use the resulting sequence if it is
> >>      strictly cheaper than loading VALS directly.
> >>
> >>      Prefer the fallback sequence in the event of a tie, since it
> >>      will tend to use fewer registers.  */
> >>
> >> > +
> >> > +void
> >> > +aarch64_expand_vector_init (rtx target, rtx vals)
> >> > +{
> >> > +  machine_mode mode = GET_MODE (target);
> >> > +  int n_elts = XVECLEN (vals, 0);
> >> > +
> >> > +  if (n_elts < 8
> >> > +      || known_eq (GET_MODE_BITSIZE (mode), 64))
> >>
> >> Might be more robust to test maybe_ne (GET_MODE_BITSIZE (mode), 128)
> >>
> >> > +    {
> >> > +      aarch64_expand_vector_init_fallback (target, vals);
> >> > +      return;
> >> > +    }
> >> > +
> >> > +  start_sequence ();
> >> > +  rtx dest[2];
> >> > +  unsigned costs[2];
> >> > +  for (int i = 0; i < 2; i++)
> >> > +    {
> >> > +      start_sequence ();
> >> > +      dest[i] = gen_reg_rtx (mode);
> >> > +      rtx new_vals
> >> > +     = aarch64_expand_vector_init_split_vals (mode, vals, (i % 2) == 0);
> >> > +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> >> > +      aarch64_expand_vector_init (tmp_reg, new_vals);
> >> > +      dest[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
> >>
> >> Maybe "src" or "halves" would be a better name than "dest", given that
> >> the rtx isn't actually the destination of the subsequence.
> >>
> >> > +      rtx_insn *rec_seq = get_insns ();
> >> > +      end_sequence ();
> >> > +      costs[i] = seq_cost (rec_seq, !optimize_size);
> >> > +      emit_insn (rec_seq);
> >> > +    }
> >> > +
> >> > +  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> >> > +  rtx_insn *zip1_insn
> >> > +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> >> > +  unsigned seq_total_cost
> >> > +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
> >>
> >> This is the wrong way round: max should be for speed and addition
> >> for size.
> > I assumed, !optimize_size meant optimizing for speed ?
> > So (!optimize_size) ? std::max (costs[0] ,costs[1]) : costs[0] + costs[1]
> > would imply taking max of the two for speed and addition for size, or
> > am I misunderstanding ?
>
> Ah, sorry, I misread.  But IMO it would be more natural as:
>
>   optimize_size ? ... : ...;
>
> > I have done rest of the changes in attached patch.
> >
> > Thanks,
> > Prathamesh
> >>
> >> Thanks,
> >> Richard
> >>
> >> > +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> >> > +
> >> > +  rtx_insn *seq = get_insns ();
> >> > +  end_sequence ();
> >> > +
> >> > +  start_sequence ();
> >> > +  aarch64_expand_vector_init_fallback (target, vals);
> >> > +  rtx_insn *fallback_seq = get_insns ();
> >> > +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> >> > +  end_sequence ();
> >> > +
> >> > +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> >> > +}
> >> > +
> >> >  /* Emit RTL corresponding to:
> >> >     insr TARGET, ELEM.  */
> >> >
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> >> > similarity index 82%
> >> > rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> > rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> >> > index ee775048589..e812d3946de 100644
> >> > --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> >> > @@ -7,8 +7,8 @@
> >> >  /*
> >> >  ** foo:
> >> >  **   ...
> >> > -**   dup     v[0-9]+\.8h, w[0-9]+
> >> > -**   dup     v[0-9]+\.8h, w[0-9]+
> >> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> >  **   ...
> >> >  **   ret
> >> > @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
> >> >  /*
> >> >  ** foo2:
> >> >  **   ...
> >> > -**   dup     v[0-9]+\.8h, w[0-9]+
> >> > -**   movi    v[0-9]+\.8h, 0x1
> >> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >> > +**   movi    v[0-9]+\.4h, 0x1
> >> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> >  **   ...
> >> >  **   ret
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> >> > new file mode 100644
> >> > index 00000000000..e28fdcda29d
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> >> > @@ -0,0 +1,21 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > +
> >> > +#include <arm_neon.h>
> >> > +
> >> > +/*
> >> > +** f_s8:
> >> > +**   ...
> >> > +**   dup     v[0-9]+\.8b, w[0-9]+
> >> > +**   adrp    x[0-9]+, \.LC[0-9]+
> >> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> >> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> >> > +**   ret
> >> > +*/
> >> > +
> >> > +int8x16_t f_s8(int8_t x)
> >> > +{
> >> > +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> >> > +                       x, 5, x, 6, x, 7, x, 8 };
> >> > +}
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> >> > new file mode 100644
> >> > index 00000000000..9366ca349b6
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> >> > @@ -0,0 +1,22 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > +
> >> > +#include <arm_neon.h>
> >> > +
> >> > +/*
> >> > +** f_s8:
> >> > +**   ...
> >> > +**   adrp    x[0-9]+, \.LC[0-9]+
> >> > +**   dup     v[0-9]+\.8b, w[0-9]+
> >> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> >> > +**   ins     v0\.b\[0\], w0
> >> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> >> > +**   ret
> >> > +*/
> >> > +
> >> > +int8x16_t f_s8(int8_t x, int8_t y)
> >> > +{
> >> > +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> >> > +                       4, y, 5, y, 6, y, 7, y };
> >> > +}
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> >> > new file mode 100644
> >> > index 00000000000..e16459486d7
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> >> > @@ -0,0 +1,22 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > +
> >> > +#include <arm_neon.h>
> >> > +
> >> > +/*
> >> > +** f_s8:
> >> > +**   ...
> >> > +**   adrp    x[0-9]+, \.LC[0-9]+
> >> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> >> > +**   ins     v0\.b\[0\], w0
> >> > +**   ins     v0\.b\[1\], w1
> >> > +**   ...
> >> > +**   ret
> >> > +*/
> >> > +
> >> > +int8x16_t f_s8(int8_t x, int8_t y)
> >> > +{
> >> > +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> >> > +                       7, 8, 9, 10, 11, 12, 13, 14 };
> >> > +}
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> >> > new file mode 100644
> >> > index 00000000000..8f35854c008
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> >> > @@ -0,0 +1,24 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-Os" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > +
> >> > +/* Verify that fallback code-sequence is chosen over
> >> > +   recursively generated code-sequence merged with zip1.  */
> >> > +
> >> > +/*
> >> > +** f_s16:
> >> > +**   ...
> >> > +**   sxth    w0, w0
> >> > +**   fmov    s0, w0
> >> > +**   ins     v0\.h\[1\], w1
> >> > +**   ins     v0\.h\[2\], w2
> >> > +**   ins     v0\.h\[3\], w3
> >> > +**   ins     v0\.h\[4\], w4
> >> > +**   ins     v0\.h\[5\], w5
> >> > +**   ins     v0\.h\[6\], w6
> >> > +**   ins     v0\.h\[7\], w7
> >> > +**   ...
> >> > +**   ret
> >> > +*/
> >> > +
> >> > +#include "vec-init-22.h"
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> >> > new file mode 100644
> >> > index 00000000000..172d56ffdf1
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> >> > @@ -0,0 +1,27 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3" } */
> >> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> >> > +
> >> > +/* Verify that we recursively generate code for even and odd halves
> >> > +   instead of fallback code. This is so despite the longer code-gen
> >> > +   because it has fewer dependencies and thus has lesser cost.  */
> >> > +
> >> > +/*
> >> > +** f_s16:
> >> > +**   ...
> >> > +**   sxth    w0, w0
> >> > +**   sxth    w1, w1
> >> > +**   fmov    d0, x0
> >> > +**   fmov    d1, x1
> >> > +**   ins     v[0-9]+\.h\[1\], w2
> >> > +**   ins     v[0-9]+\.h\[1\], w3
> >> > +**   ins     v[0-9]+\.h\[2\], w4
> >> > +**   ins     v[0-9]+\.h\[2\], w5
> >> > +**   ins     v[0-9]+\.h\[3\], w6
> >> > +**   ins     v[0-9]+\.h\[3\], w7
> >> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >> > +**   ...
> >> > +**   ret
> >> > +*/
> >> > +
> >> > +#include "vec-init-22.h"
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> >> > new file mode 100644
> >> > index 00000000000..15b889d4097
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> >> > @@ -0,0 +1,7 @@
> >> > +#include <arm_neon.h>
> >> > +
> >> > +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> >> > +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> >> > +{
> >> > +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> >> > +}
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index acc0cfe5f94..94cc4338678 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -21972,11 +21972,12 @@ aarch64_simd_make_constant (rtx vals)
> >      return NULL_RTX;
> >  }
> >
> > -/* Expand a vector initialisation sequence, such that TARGET is
> > -   initialised to contain VALS.  */
> > +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> > +   The caller has already tried a divide-and-conquer approach, so do
> > +   not consider that case here.  */
> >
> >  void
> > -aarch64_expand_vector_init (rtx target, rtx vals)
> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
> >  {
> >    machine_mode mode = GET_MODE (target);
> >    scalar_mode inner_mode = GET_MODE_INNER (mode);
> > @@ -22036,38 +22037,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >        return;
> >      }
> >
> > -  /* Check for interleaving case.
> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > -     Generate following code:
> > -     dup v0.h, x
> > -     dup v1.h, y
> > -     zip1 v0.h, v0.h, v1.h
> > -     for "large enough" initializer.  */
> > -
> > -  if (n_elts >= 8)
> > -    {
> > -      int i;
> > -      for (i = 2; i < n_elts; i++)
> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -       break;
> > -
> > -      if (i == n_elts)
> > -     {
> > -       machine_mode mode = GET_MODE (target);
> > -       rtx dest[2];
> > -
> > -       for (int i = 0; i < 2; i++)
> > -         {
> > -           rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > -           dest[i] = force_reg (mode, x);
> > -         }
> > -
> > -       rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > -       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > -       return;
> > -     }
> > -    }
> > -
> >    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >    gcc_assert (icode != CODE_FOR_nothing);
> >
> > @@ -22189,7 +22158,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >           }
> >         XVECEXP (copy, 0, i) = subst;
> >       }
> > -      aarch64_expand_vector_init (target, copy);
> > +      aarch64_expand_vector_init_fallback (target, copy);
> >      }
> >
> >    /* Insert the variable lanes directly.  */
> > @@ -22203,6 +22172,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >      }
> >  }
> >
> > +/* Return even or odd half of VALS depending on EVEN_P.  */
> > +
> > +static rtx
> > +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> > +{
> > +  int n = XVECLEN (vals, 0);
> > +  machine_mode new_mode
> > +    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
> > +                                GET_MODE_BITSIZE (mode).to_constant () / 2);
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n / 2; i++)
> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > +                               : XVECEXP (vals, 0, 2 * i + 1);
> > +  return gen_rtx_PARALLEL (new_mode, vec);
> > +}
> > +
> > +/* Expand a vector initialisation sequence, such that TARGET is
>
> initialization
>
> It would be good to add -fno-schedule-insns -fno-schedule-insns2
> to the tests' dg-options (or use -O instead of -O3 for the -O3 tests,
> if that works).
>
> OK for stage 1 with those changes, thanks.
Hi Richard,
Thanks for approving the patch!
Unfortunately it regresses code-gen for the following case:

svint32_t f(int32x4_t x)
{
  return svdupq_s32 (x[0], x[1], x[2], x[3]);
}

-O2 code-gen with trunk:
f:
        dup     z0.q, z0.q[0]
        ret

-O2 code-gen with patch:
f:
        dup     s1, v0.s[1]
        mov    v2.8b, v0.8b
        ins     v1.s[1], v0.s[3]
        ins     v2.s[1], v0.s[2]
        zip1    v0.4s, v2.4s, v1.4s
        dup     z0.q, z0.q[0]
        ret

IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
to initialize the "base 128-bit vector" and then use dupq to replicate it.

Without patch, aarch64_expand_vector_init generates fallback code, and then
combine optimizes a sequence of vec_merge/vec_select pairs into an assignment:

(insn 7 3 8 2 (set (reg:SI 99)
        (vec_select:SI (reg/v:V4SI 97 [ x ])
            (parallel [
                    (const_int 1 [0x1])
                ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
     (nil))

(insn 13 9 15 2 (set (reg:V4SI 102)
        (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
            (reg/v:V4SI 97 [ x ])
            (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si}
     (expr_list:REG_DEAD (reg:SI 99)
        (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
            (nil))))

into:
Trying 7 -> 13:
    7: r99:SI=vec_select(r97:V4SI,parallel)
   13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
      REG_DEAD r99:SI
      REG_DEAD r97:V4SI
Successfully matched this instruction:
(set (reg:V4SI 102)
    (reg/v:V4SI 97 [ x ]))

which eventually results into:
(note 2 25 3 2 NOTE_INSN_DELETED)
(note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
(note 7 3 8 2 NOTE_INSN_DELETED)
(note 8 7 9 2 NOTE_INSN_DELETED)
(note 9 8 13 2 NOTE_INSN_DELETED)
(note 13 9 15 2 NOTE_INSN_DELETED)
(note 15 13 17 2 NOTE_INSN_DELETED)
(note 17 15 18 2 NOTE_INSN_DELETED)
(note 18 17 22 2 NOTE_INSN_DELETED)
(insn 22 18 23 2 (parallel [
            (set (reg/i:VNx4SI 32 v0)
                (vec_duplicate:VNx4SI (reg:V4SI 108)))
            (clobber (scratch:VNx16BI))
        ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
     (expr_list:REG_DEAD (reg:V4SI 108)
        (nil)))
(insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
     (nil))

I was wondering if we should add the above special case, of assigning
target = vec in aarch64_expand_vector_init, if initializer is {
vec[0], vec[1], ... } ?
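Something along the following lines is what I had in mind -- just a rough,
untested sketch, and it assumes the vec_selects are still visible in VALS
at this point (which may well not hold in general):

  /* If every element of VALS is lane I of the same 128-bit register,
     just copy that register to TARGET.  */
  rtx src = NULL_RTX;
  int i;
  for (i = 0; i < n_elts; i++)
    {
      rtx elem = XVECEXP (vals, 0, i);
      if (GET_CODE (elem) != VEC_SELECT
          || (src && !rtx_equal_p (XEXP (elem, 0), src))
          || !CONST_INT_P (XVECEXP (XEXP (elem, 1), 0, 0))
          || INTVAL (XVECEXP (XEXP (elem, 1), 0, 0)) != i)
        break;
      src = XEXP (elem, 0);
    }
  if (i == n_elts && src && GET_MODE (src) == mode)
    {
      emit_move_insn (target, src);
      return;
    }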

Thanks,
Prathamesh
>
> Richard
>
> > +   initialized to contain VALS.  */
> > +
> > +void
> > +aarch64_expand_vector_init (rtx target, rtx vals)
> > +{
> > +  /* Try decomposing the initializer into even and odd halves and
> > +     then ZIP them together.  Use the resulting sequence if it is
> > +     strictly cheaper than loading VALS directly.
> > +
> > +     Prefer the fallback sequence in the event of a tie, since it
> > +     will tend to use fewer registers.  */
> > +
> > +  machine_mode mode = GET_MODE (target);
> > +  int n_elts = XVECLEN (vals, 0);
> > +
> > +  if (n_elts < 4
> > +      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> > +    {
> > +      aarch64_expand_vector_init_fallback (target, vals);
> > +      return;
> > +    }
> > +
> > +  start_sequence ();
> > +  rtx halves[2];
> > +  unsigned costs[2];
> > +  for (int i = 0; i < 2; i++)
> > +    {
> > +      start_sequence ();
> > +      rtx new_vals
> > +     = aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);
> > +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> > +      aarch64_expand_vector_init (tmp_reg, new_vals);
> > +      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
> > +      rtx_insn *rec_seq = get_insns ();
> > +      end_sequence ();
> > +      costs[i] = seq_cost (rec_seq, !optimize_size);
> > +      emit_insn (rec_seq);
> > +    }
> > +
> > +  rtvec v = gen_rtvec (2, halves[0], halves[1]);
> > +  rtx_insn *zip1_insn
> > +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > +  unsigned seq_total_cost
> > +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
> > +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> > +
> > +  rtx_insn *seq = get_insns ();
> > +  end_sequence ();
> > +
> > +  start_sequence ();
> > +  aarch64_expand_vector_init_fallback (target, vals);
> > +  rtx_insn *fallback_seq = get_insns ();
> > +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> > +  end_sequence ();
> > +
> > +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> > +}
> > +
> >  /* Emit RTL corresponding to:
> >     insr TARGET, ELEM.  */
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > similarity index 82%
> > rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > index ee775048589..e812d3946de 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -7,8 +7,8 @@
> >  /*
> >  ** foo:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
> >  /*
> >  ** foo2:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   movi    v[0-9]+\.8h, 0x1
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   movi    v[0-9]+\.4h, 0x1
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > new file mode 100644
> > index 00000000000..e28fdcda29d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x)
> > +{
> > +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > +                       x, 5, x, 6, x, 7, x, 8 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > new file mode 100644
> > index 00000000000..9366ca349b6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> > +                       4, y, 5, y, 6, y, 7, y };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > new file mode 100644
> > index 00000000000..e16459486d7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   ins     v0\.b\[1\], w1
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> > +                       7, 8, 9, 10, 11, 12, 13, 14 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > new file mode 100644
> > index 00000000000..8f35854c008
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that fallback code-sequence is chosen over
> > +   recursively generated code-sequence merged with zip1.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   fmov    s0, w0
> > +**   ins     v0\.h\[1\], w1
> > +**   ins     v0\.h\[2\], w2
> > +**   ins     v0\.h\[3\], w3
> > +**   ins     v0\.h\[4\], w4
> > +**   ins     v0\.h\[5\], w5
> > +**   ins     v0\.h\[6\], w6
> > +**   ins     v0\.h\[7\], w7
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > new file mode 100644
> > index 00000000000..172d56ffdf1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that we recursively generate code for even and odd halves
> > +   instead of fallback code. This is so despite the longer code-gen
> > +   because it has fewer dependencies and thus has lesser cost.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   sxth    w1, w1
> > +**   fmov    d0, x0
> > +**   fmov    d1, x1
> > +**   ins     v[0-9]+\.h\[1\], w2
> > +**   ins     v[0-9]+\.h\[1\], w3
> > +**   ins     v[0-9]+\.h\[2\], w4
> > +**   ins     v[0-9]+\.h\[2\], w5
> > +**   ins     v[0-9]+\.h\[3\], w6
> > +**   ins     v[0-9]+\.h\[3\], w7
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > new file mode 100644
> > index 00000000000..15b889d4097
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > @@ -0,0 +1,7 @@
> > +#include <arm_neon.h>
> > +
> > +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> > +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> > +{
> > +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> > +}
  
Richard Sandiford March 10, 2023, 6:08 p.m. UTC | #18
Sorry for the slow reply.

Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> Unfortunately it regresses code-gen for the following case:
>
> svint32_t f(int32x4_t x)
> {
>   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> }
>
> -O2 code-gen with trunk:
> f:
>         dup     z0.q, z0.q[0]
>         ret
>
> -O2 code-gen with patch:
> f:
>         dup     s1, v0.s[1]
>         mov    v2.8b, v0.8b
>         ins     v1.s[1], v0.s[3]
>         ins     v2.s[1], v0.s[2]
>         zip1    v0.4s, v2.4s, v1.4s
>         dup     z0.q, z0.q[0]
>         ret
>
> IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> to initialize the "base 128-bit vector" and then use dupq to replicate it.
>
> Without patch, aarch64_expand_vector_init generates fallback code, and then
> combine optimizes a sequence of vec_merge/vec_select pairs into an assignment:
>
> (insn 7 3 8 2 (set (reg:SI 99)
>         (vec_select:SI (reg/v:V4SI 97 [ x ])
>             (parallel [
>                     (const_int 1 [0x1])
>                 ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
>      (nil))
>
> (insn 13 9 15 2 (set (reg:V4SI 102)
>         (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
>             (reg/v:V4SI 97 [ x ])
>             (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si}
>      (expr_list:REG_DEAD (reg:SI 99)
>         (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
>             (nil))))
>
> into:
> Trying 7 -> 13:
>     7: r99:SI=vec_select(r97:V4SI,parallel)
>    13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
>       REG_DEAD r99:SI
>       REG_DEAD r97:V4SI
> Successfully matched this instruction:
> (set (reg:V4SI 102)
>     (reg/v:V4SI 97 [ x ]))
>
> which eventually results into:
> (note 2 25 3 2 NOTE_INSN_DELETED)
> (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> (note 7 3 8 2 NOTE_INSN_DELETED)
> (note 8 7 9 2 NOTE_INSN_DELETED)
> (note 9 8 13 2 NOTE_INSN_DELETED)
> (note 13 9 15 2 NOTE_INSN_DELETED)
> (note 15 13 17 2 NOTE_INSN_DELETED)
> (note 17 15 18 2 NOTE_INSN_DELETED)
> (note 18 17 22 2 NOTE_INSN_DELETED)
> (insn 22 18 23 2 (parallel [
>             (set (reg/i:VNx4SI 32 v0)
>                 (vec_duplicate:VNx4SI (reg:V4SI 108)))
>             (clobber (scratch:VNx16BI))
>         ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
>      (expr_list:REG_DEAD (reg:V4SI 108)
>         (nil)))
> (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
>      (nil))
>
> I was wondering if we should add the above special case, of assigning
> target = vec in aarch64_expand_vector_init, if initializer is {
> vec[0], vec[1], ... } ?

I'm not sure it will be easy to detect that.  Won't the inputs to
aarch64_expand_vector_init just be plain registers?  It's not a
good idea in general to search for definitions of registers
during expansion.

It would be nice to fix this by lowering svdupq into:

(a) a constructor for a 128-bit vector
(b) a duplication of the 128-bit vector to fill an SVE vector

But I'm not sure what the best way of doing (b) would be.
In RTL we can use vec_duplicate, but I don't think gimple
has an equivalent construct.  Maybe Richi has some ideas.

We're planning to implement the ACLE's Neon-SVE bridge:
https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
and so we'll need (b) to implement the svdup_neonq functions.
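(For example, with the bridge the testcase above would just be written as:

  svint32_t f (int32x4_t x) { return svdup_neonq_s32 (x); }

-- going by the function names in that document -- and should ideally
compile to the single "dup z0.q, z0.q[0]".)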

Thanks,
Richard
  
Richard Biener March 13, 2023, 7:33 a.m. UTC | #19
On Fri, 10 Mar 2023, Richard Sandiford wrote:

> Sorry for the slow reply.
> 
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > Unfortunately it regresses code-gen for the following case:
> >
> > svint32_t f(int32x4_t x)
> > {
> >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > }
> >
> > -O2 code-gen with trunk:
> > f:
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > -O2 code-gen with patch:
> > f:
> >         dup     s1, v0.s[1]
> >         mov    v2.8b, v0.8b
> >         ins     v1.s[1], v0.s[3]
> >         ins     v2.s[1], v0.s[2]
> >         zip1    v0.4s, v2.4s, v1.4s
> >         dup     z0.q, z0.q[0]
> >         ret
> >
> > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> > to initialize the "base 128-bit vector" and then use dupq to replicate it.
> >
> > Without patch, aarch64_expand_vector_init generates fallback code, and then
> > combine optimizes a sequence of vec_merge/vec_select pairs into an assignment:
> >
> > (insn 7 3 8 2 (set (reg:SI 99)
> >         (vec_select:SI (reg/v:V4SI 97 [ x ])
> >             (parallel [
> >                     (const_int 1 [0x1])
> >                 ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
> >      (nil))
> >
> > (insn 13 9 15 2 (set (reg:V4SI 102)
> >         (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
> >             (reg/v:V4SI 97 [ x ])
> >             (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si}
> >      (expr_list:REG_DEAD (reg:SI 99)
> >         (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
> >             (nil))))
> >
> > into:
> > Trying 7 -> 13:
> >     7: r99:SI=vec_select(r97:V4SI,parallel)
> >    13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
> >       REG_DEAD r99:SI
> >       REG_DEAD r97:V4SI
> > Successfully matched this instruction:
> > (set (reg:V4SI 102)
> >     (reg/v:V4SI 97 [ x ]))
> >
> > which eventually results into:
> > (note 2 25 3 2 NOTE_INSN_DELETED)
> > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> > (note 7 3 8 2 NOTE_INSN_DELETED)
> > (note 8 7 9 2 NOTE_INSN_DELETED)
> > (note 9 8 13 2 NOTE_INSN_DELETED)
> > (note 13 9 15 2 NOTE_INSN_DELETED)
> > (note 15 13 17 2 NOTE_INSN_DELETED)
> > (note 17 15 18 2 NOTE_INSN_DELETED)
> > (note 18 17 22 2 NOTE_INSN_DELETED)
> > (insn 22 18 23 2 (parallel [
> >             (set (reg/i:VNx4SI 32 v0)
> >                 (vec_duplicate:VNx4SI (reg:V4SI 108)))
> >             (clobber (scratch:VNx16BI))
> >         ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
> >      (expr_list:REG_DEAD (reg:V4SI 108)
> >         (nil)))
> > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
> >      (nil))
> >
> > I was wondering if we should add the above special case, of assigning
> > target = vec in aarch64_expand_vector_init, if initializer is {
> > vec[0], vec[1], ... } ?
> 
> I'm not sure it will be easy to detect that.  Won't the inputs to
> aarch64_expand_vector_init just be plain registers?  It's not a
> good idea in general to search for definitions of registers
> during expansion.
> 
> It would be nice to fix this by lowering svdupq into:
> 
> (a) a constructor for a 128-bit vector
> (b) a duplication of the 128-bit vector to fill an SVE vector
> 
> But I'm not sure what the best way of doing (b) would be.
> In RTL we can use vec_duplicate, but I don't think gimple
> has an equivalent construct.  Maybe Richi has some ideas.

On GIMPLE it would be

 _1 = { a, ... }; // (a)
 _2 = { _1, ... }; // (b)

but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
possible?  But at least a CTOR of vectors is what we use to
concat vectors.

With the recent relaxing of VEC_PERM inputs it's also possible to
express (b) with a VEC_PERM:

 _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>

but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).

Richard.

> We're planning to implement the ACLE's Neon-SVE bridge:
> https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
> and so we'll need (b) to implement the svdup_neonq functions.
> 
> Thanks,
> Richard
>
  
Prathamesh Kulkarni April 3, 2023, 4:33 p.m. UTC | #20
On Mon, 13 Mar 2023 at 13:03, Richard Biener <rguenther@suse.de> wrote:
>
> On Fri, 10 Mar 2023, Richard Sandiford wrote:
>
> > Sorry for the slow reply.
> >
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > > Unfortunately it regresses code-gen for the following case:
> > >
> > > svint32_t f(int32x4_t x)
> > > {
> > >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > > }
> > >
> > > -O2 code-gen with trunk:
> > > f:
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > -O2 code-gen with patch:
> > > f:
> > >         dup     s1, v0.s[1]
> > >         mov    v2.8b, v0.8b
> > >         ins     v1.s[1], v0.s[3]
> > >         ins     v2.s[1], v0.s[2]
> > >         zip1    v0.4s, v2.4s, v1.4s
> > >         dup     z0.q, z0.q[0]
> > >         ret
> > >
> > > IIUC, svdupq_impl::expand uses aarch64_expand_vector_init
> > > to initialize the "base 128-bit vector" and then use dupq to replicate it.
> > >
> > > Without patch, aarch64_expand_vector_init generates fallback code, and then
> > > combine optimizes a sequence of vec_merge/vec_select pairs into an assignment:
> > >
> > > (insn 7 3 8 2 (set (reg:SI 99)
> > >         (vec_select:SI (reg/v:V4SI 97 [ x ])
> > >             (parallel [
> > >                     (const_int 1 [0x1])
> > >                 ]))) "bar.c":6:10 2592 {aarch64_get_lanev4si}
> > >      (nil))
> > >
> > > (insn 13 9 15 2 (set (reg:V4SI 102)
> > >         (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 99))
> > >             (reg/v:V4SI 97 [ x ])
> > >             (const_int 2 [0x2]))) "bar.c":6:10 1794 {aarch64_simd_vec_setv4si}
> > >      (expr_list:REG_DEAD (reg:SI 99)
> > >         (expr_list:REG_DEAD (reg/v:V4SI 97 [ x ])
> > >             (nil))))
> > >
> > > into:
> > > Trying 7 -> 13:
> > >     7: r99:SI=vec_select(r97:V4SI,parallel)
> > >    13: r102:V4SI=vec_merge(vec_duplicate(r99:SI),r97:V4SI,0x2)
> > >       REG_DEAD r99:SI
> > >       REG_DEAD r97:V4SI
> > > Successfully matched this instruction:
> > > (set (reg:V4SI 102)
> > >     (reg/v:V4SI 97 [ x ]))
> > >
> > > which eventually results into:
> > > (note 2 25 3 2 NOTE_INSN_DELETED)
> > > (note 3 2 7 2 NOTE_INSN_FUNCTION_BEG)
> > > (note 7 3 8 2 NOTE_INSN_DELETED)
> > > (note 8 7 9 2 NOTE_INSN_DELETED)
> > > (note 9 8 13 2 NOTE_INSN_DELETED)
> > > (note 13 9 15 2 NOTE_INSN_DELETED)
> > > (note 15 13 17 2 NOTE_INSN_DELETED)
> > > (note 17 15 18 2 NOTE_INSN_DELETED)
> > > (note 18 17 22 2 NOTE_INSN_DELETED)
> > > (insn 22 18 23 2 (parallel [
> > >             (set (reg/i:VNx4SI 32 v0)
> > >                 (vec_duplicate:VNx4SI (reg:V4SI 108)))
> > >             (clobber (scratch:VNx16BI))
> > >         ]) "bar.c":7:1 5202 {aarch64_vec_duplicate_vqvnx4si_le}
> > >      (expr_list:REG_DEAD (reg:V4SI 108)
> > >         (nil)))
> > > (insn 23 22 0 2 (use (reg/i:VNx4SI 32 v0)) "bar.c":7:1 -1
> > >      (nil))
> > >
> > > I was wondering if we should add the above special case, of assigning
> > > target = vec in aarch64_expand_vector_init, if initializer is {
> > > vec[0], vec[1], ... } ?
> >
> > I'm not sure it will be easy to detect that.  Won't the inputs to
> > aarch64_expand_vector_init just be plain registers?  It's not a
> > good idea in general to search for definitions of registers
> > during expansion.
> >
> > It would be nice to fix this by lowering svdupq into:
> >
> > (a) a constructor for a 128-bit vector
> > (b) a duplication of the 128-bit vector to fill an SVE vector
> >
> > But I'm not sure what the best way of doing (b) would be.
> > In RTL we can use vec_duplicate, but I don't think gimple
> > has an equivalent construct.  Maybe Richi has some ideas.
>
> On GIMPLE it would be
>
>  _1 = { a, ... }; // (a)
>  _2 = { _1, ... }; // (b)
>
> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
> possible?  But at least a CTOR of vectors is what we use to
> concat vectors.
>
> With the recent relaxing of VEC_PERM inputs it's also possible to
> express (b) with a VEC_PERM:
>
>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
>
> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
>
Hi,
Thanks for the suggestions, and sorry in turn for the late response.
The attached patch tries to fix the issue by explicitly constructing a CTOR
from svdupq's arguments and then using VEC_PERM_EXPR with a VL mask whose
encoded elements are {0, 1, ..., nargs-1}, with npatterns == nargs and
nelts_per_pattern == 1, to replicate the base vector.
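
A minimal sketch of how that mask is built (this reuses the same
vec_perm_builder interface as the attached patch; lhs_type and nargs here
stand for the SVE result type and the number of svdupq arguments, as in
the patch):

  poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
  /* nargs patterns, each of length 1: encodes 0, 1, ..., nargs-1 and
     repeats that series to fill the variable-length mask.  */
  vec_perm_builder sel (lhs_len, nargs, 1);
  for (unsigned i = 0; i < nargs; i++)
    sel.quick_push (i);
  vec_perm_indices indices (sel, 1, nargs);
  tree mask_type = build_vector_type (ssizetype, lhs_len);
  tree mask = vec_perm_indices_to_tree (mask_type, indices);

With nargs == 4 this expands to { 0, 1, 2, 3, 0, 1, 2, 3, ... } at any
vector length.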

So for example, for the above case,
svint32_t f_32(int32x4_t x)
{
  return svdupq_s32 (x[0], x[1], x[2], x[3]);
}

forwprop1 lowers it to:
  svint32_t _6;
  vector(4) int _8;
 <bb 2> :
  _1 = BIT_FIELD_REF <x_5(D), 32, 0>;
  _2 = BIT_FIELD_REF <x_5(D), 32, 32>;
  _3 = BIT_FIELD_REF <x_5(D), 32, 64>;
  _4 = BIT_FIELD_REF <x_5(D), 32, 96>;
  _8 = {_1, _2, _3, _4};
  _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
  return _6;

which is then eventually optimized to:
  svint32_t _6;
  <bb 2> [local count: 1073741824]:
  _6 = VEC_PERM_EXPR <x_5(D), x_5(D), { 0, 1, 2, 3, ... }>;
  return _6;

code-gen:
f_32:
        dup     z0.q, z0.q[0]
        ret

Does it look OK ?

Thanks,
Prathamesh
> Richard.
>
> > We're planning to implement the ACLE's Neon-SVE bridge:
> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
> > and so we'll need (b) to implement the svdup_neonq functions.
> >
> > Thanks,
> > Richard
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
> HRB 36809 (AG Nuernberg)
[SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant.

gcc/ChangeLog:
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svdupq_impl::fold_nonconst_dupq): New method.
	(svdupq_impl::fold): Call fold_nonconst_dupq.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index cd9cace3c9b..3de79060619 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -817,6 +817,62 @@ public:
 
 class svdupq_impl : public quiet<function_base>
 {
+private:
+  gimple *
+  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
+  {
+    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
+       tmp = {arg0, arg1, ..., arg<N-1>}
+       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
+
+    /* TODO: Revisit to handle factor by padding zeros.  */
+    if (factor > 1)
+      return NULL;
+
+    if (BYTES_BIG_ENDIAN)
+      return NULL;
+
+    tree lhs = gimple_call_lhs (f.call);
+    if (TREE_CODE (lhs) != SSA_NAME)
+      return NULL;
+
+    tree lhs_type = TREE_TYPE (lhs);
+    tree elt_type = TREE_TYPE (lhs_type);
+    scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
+    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
+
+    unsigned nargs = gimple_call_num_args (f.call);
+    vec<constructor_elt, va_gc> *v;
+    vec_alloc (v, nargs);
+    for (unsigned i = 0; i < nargs; i++)
+      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
+    tree vec = build_constructor (vq_type, v);
+
+    tree access_type
+      = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
+    tree tmp = make_ssa_name_fn (cfun, access_type, 0);
+    gimple *g = gimple_build_assign (tmp, vec);
+
+    gimple_seq stmts = NULL;
+    gimple_seq_add_stmt_without_update (&stmts, g);
+
+    int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
+    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+    vec_perm_builder sel (lhs_len, source_nelts, 1);
+    for (int i = 0; i < source_nelts; i++)
+      sel.quick_push (i);
+
+    vec_perm_indices indices (sel, 1, source_nelts);
+    tree mask_type = build_vector_type (ssizetype, lhs_len);
+    tree mask = vec_perm_indices_to_tree (mask_type, indices);
+
+    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
+    gimple_seq_add_stmt_without_update (&stmts, g2);
+    gsi_replace_with_seq (f.gsi, stmts, false);
+    return g2;
+  }
+
 public:
   gimple *
   fold (gimple_folder &f) const override
@@ -832,7 +888,7 @@ public:
       {
 	tree elt = gimple_call_arg (f.call, i);
 	if (!CONSTANT_CLASS_P (elt))
-	  return NULL;
+	  return fold_nonconst_dupq (f, factor);
 	builder.quick_push (elt);
 	for (unsigned int j = 1; j < factor; ++j)
 	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
new file mode 100644
index 00000000000..f19f8deb1e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include <arm_sve.h>
+#include <arm_neon.h>
+
+svint8_t f_s8(int8x16_t x)
+{
+  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+		    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
+}
+
+svint16_t f_s16(int16x8_t x)
+{
+  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
+}
+
+svint32_t f_s32(int32x4_t x)
+{
+  return svdupq_s32 (x[0], x[1], x[2], x[3]);
+}
+
+svint64_t f_s64(int64x2_t x)
+{
+  return svdupq_s64 (x[0], x[1]);
+}
+
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
  
Richard Sandiford April 4, 2023, 6:05 p.m. UTC | #21
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Mon, 13 Mar 2023 at 13:03, Richard Biener <rguenther@suse.de> wrote:
>> On GIMPLE it would be
>>
>>  _1 = { a, ... }; // (a)
>>  _2 = { _1, ... }; // (b)
>>
>> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
>> possible?  But at least a CTOR of vectors is what we use to
>> concat vectors.
>>
>> With the recent relaxing of VEC_PERM inputs it's also possible to
>> express (b) with a VEC_PERM:
>>
>>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
>>
>> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
>> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
>>
> Hi,
> Thanks for the suggestions and sorry for late response in turn.
> The attached patch tries to fix the issue by explicitly constructing a CTOR
> from svdupq's arguments and then using VEC_PERM_EXPR with VL mask
> having encoded elements {0, 1, ... nargs-1},
> npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector.
>
> So for example, for the above case,
> svint32_t f_32(int32x4_t x)
> {
>   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> }
>
> forwprop1 lowers it to:
>   svint32_t _6;
>   vector(4) int _8;
>  <bb 2> :
>   _1 = BIT_FIELD_REF <x_5(D), 32, 0>;
>   _2 = BIT_FIELD_REF <x_5(D), 32, 32>;
>   _3 = BIT_FIELD_REF <x_5(D), 32, 64>;
>   _4 = BIT_FIELD_REF <x_5(D), 32, 96>;
>   _8 = {_1, _2, _3, _4};
>   _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
>   return _6;
>
> which is then eventually optimized to:
>   svint32_t _6;
>   <bb 2> [local count: 1073741824]:
>   _6 = VEC_PERM_EXPR <x_5(D), x_5(D), { 0, 1, 2, 3, ... }>;
>   return _6;
>
> code-gen:
> f_32:
>         dup     z0.q, z0.q[0]
>         ret

Nice!

> Does it look OK ?
>
> Thanks,
> Prathamesh
>> Richard.
>>
>> > We're planning to implement the ACLE's Neon-SVE bridge:
>> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
>> > and so we'll need (b) to implement the svdup_neonq functions.
>> >
>> > Thanks,
>> > Richard
>> >
>>
>> --
>> Richard Biener <rguenther@suse.de>
>> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
>> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
>> HRB 36809 (AG Nuernberg)
>
> [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
>
> gcc/ChangeLog:
> 	* config/aarch64/aarch64-sve-builtins-base.cc
> 	(svdupq_impl::fold_nonconst_dupq): New method.
> 	(svdupq_impl::fold): Call fold_nonconst_dupq.
>
> gcc/testsuite/ChangeLog:
> 	* gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index cd9cace3c9b..3de79060619 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -817,6 +817,62 @@ public:
>  
>  class svdupq_impl : public quiet<function_base>
>  {
> +private:
> +  gimple *
> +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> +  {
> +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> +       tmp = {arg0, arg1, ..., arg<N-1>}
> +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> +
> +    /* TODO: Revisit to handle factor by padding zeros.  */
> +    if (factor > 1)
> +      return NULL;

Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
factor != 1?  Do we generate good code for b8, where factor should be 1?

> +
> +    if (BYTES_BIG_ENDIAN)
> +      return NULL;
> +
> +    tree lhs = gimple_call_lhs (f.call);
> +    if (TREE_CODE (lhs) != SSA_NAME)
> +      return NULL;

Why is this check needed?

> +    tree lhs_type = TREE_TYPE (lhs);
> +    tree elt_type = TREE_TYPE (lhs_type);
> +    scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));

Aren't we already dealing with a scalar type here?  I'd have expected
SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).

> +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> +
> +    unsigned nargs = gimple_call_num_args (f.call);
> +    vec<constructor_elt, va_gc> *v;
> +    vec_alloc (v, nargs);
> +    for (unsigned i = 0; i < nargs; i++)
> +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> +    tree vec = build_constructor (vq_type, v);
> +
> +    tree access_type
> +      = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));

Nit: seems to fit on one line.  But do we need this?  We're not accessing
memory, so I'd have expected vq_type to be OK as-is.

> +    tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> +    gimple *g = gimple_build_assign (tmp, vec);
> +
> +    gimple_seq stmts = NULL;
> +    gimple_seq_add_stmt_without_update (&stmts, g);
> +
> +    int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();

Looks like we should be able to use nargs instead of source_nelts.

Thanks,
Richard

> +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +    vec_perm_builder sel (lhs_len, source_nelts, 1);
> +    for (int i = 0; i < source_nelts; i++)
> +      sel.quick_push (i);
> +
> +    vec_perm_indices indices (sel, 1, source_nelts);
> +    tree mask_type = build_vector_type (ssizetype, lhs_len);
> +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +
> +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> +    gimple_seq_add_stmt_without_update (&stmts, g2);
> +    gsi_replace_with_seq (f.gsi, stmts, false);
> +    return g2;
> +  }
> +
>  public:
>    gimple *
>    fold (gimple_folder &f) const override
> @@ -832,7 +888,7 @@ public:
>        {
>  	tree elt = gimple_call_arg (f.call, i);
>  	if (!CONSTANT_CLASS_P (elt))
> -	  return NULL;
> +	  return fold_nonconst_dupq (f, factor);
>  	builder.quick_push (elt);
>  	for (unsigned int j = 1; j < factor; ++j)
>  	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> new file mode 100644
> index 00000000000..f19f8deb1e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +
> +#include <arm_sve.h>
> +#include <arm_neon.h>
> +
> +svint8_t f_s8(int8x16_t x)
> +{
> +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> +		    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> +}
> +
> +svint16_t f_s16(int16x8_t x)
> +{
> +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
> +}
> +
> +svint32_t f_s32(int32x4_t x)
> +{
> +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
> +}
> +
> +svint64_t f_s64(int64x2_t x)
> +{
> +  return svdupq_s64 (x[0], x[1]);
> +}
> +
> +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
  
Prathamesh Kulkarni April 6, 2023, 10:26 a.m. UTC | #22
On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Mon, 13 Mar 2023 at 13:03, Richard Biener <rguenther@suse.de> wrote:
> >> On GIMPLE it would be
> >>
> >>  _1 = { a, ... }; // (a)
> >>  _2 = { _1, ... }; // (b)
> >>
> >> but I'm not sure if (b), a VL CTOR of fixed len(?) sub-vectors is
> >> possible?  But at least a CTOR of vectors is what we use to
> >> concat vectors.
> >>
> >> With the recent relaxing of VEC_PERM inputs it's also possible to
> >> express (b) with a VEC_PERM:
> >>
> >>  _2 = VEC_PERM <_1, _1, { 0, 1, 2, 3, 0, 1, 2, 3, ... }>
> >>
> >> but again I'm not sure if that repeating 0, 1, 2, 3 is expressible
> >> for VL vectors (maybe we'd allow "wrapping" here, I'm not sure).
> >>
> > Hi,
> > Thanks for the suggestions and sorry for late response in turn.
> > The attached patch tries to fix the issue by explicitly constructing a CTOR
> > from svdupq's arguments and then using VEC_PERM_EXPR with VL mask
> > having encoded elements {0, 1, ... nargs-1},
> > npatterns == nargs, and nelts_per_pattern == 1, to replicate the base vector.
> >
> > So for example, for the above case,
> > svint32_t f_32(int32x4_t x)
> > {
> >   return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > }
> >
> > forwprop1 lowers it to:
> >   svint32_t _6;
> >   vector(4) int _8;
> >  <bb 2> :
> >   _1 = BIT_FIELD_REF <x_5(D), 32, 0>;
> >   _2 = BIT_FIELD_REF <x_5(D), 32, 32>;
> >   _3 = BIT_FIELD_REF <x_5(D), 32, 64>;
> >   _4 = BIT_FIELD_REF <x_5(D), 32, 96>;
> >   _8 = {_1, _2, _3, _4};
> >   _6 = VEC_PERM_EXPR <_8, _8, { 0, 1, 2, 3, ... }>;
> >   return _6;
> >
> > which is then eventually optimized to:
> >   svint32_t _6;
> >   <bb 2> [local count: 1073741824]:
> >   _6 = VEC_PERM_EXPR <x_5(D), x_5(D), { 0, 1, 2, 3, ... }>;
> >   return _6;
> >
> > code-gen:
> > f_32:
> >         dup     z0.q, z0.q[0]
> >         ret
>
> Nice!
>
> > Does it look OK ?
> >
> > Thanks,
> > Prathamesh
> >> Richard.
> >>
> >> > We're planning to implement the ACLE's Neon-SVE bridge:
> >> > https://github.com/ARM-software/acle/blob/main/main/acle.md#neon-sve-bridge
> >> > and so we'll need (b) to implement the svdup_neonq functions.
> >> >
> >> > Thanks,
> >> > Richard
> >> >
> >>
> >> --
> >> Richard Biener <rguenther@suse.de>
> >> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
> >> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
> >> HRB 36809 (AG Nuernberg)
> >
> > [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
> >
> > gcc/ChangeLog:
> >       * config/aarch64/aarch64-sve-builtins-base.cc
> >       (svdupq_impl::fold_nonconst_dupq): New method.
> >       (svdupq_impl::fold): Call fold_nonconst_dupq.
> >
> > gcc/testsuite/ChangeLog:
> >       * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index cd9cace3c9b..3de79060619 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -817,6 +817,62 @@ public:
> >
> >  class svdupq_impl : public quiet<function_base>
> >  {
> > +private:
> > +  gimple *
> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> > +  {
> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> > +       tmp = {arg0, arg1, ..., arg<N-1>}
> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> > +
> > +    /* TODO: Revisit to handle factor by padding zeros.  */
> > +    if (factor > 1)
> > +      return NULL;
>
> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
> factor != 1?  Do we generate good code for b8, where factor should be 1?
Hi,
It generates the following code for svdup_n_b8:
https://pastebin.com/ypYt590c

I suppose lowering to ctor+vec_perm_expr is not really useful
for this case because it won't simplify the ctor, unlike the above case of
svdupq_s32 (x[0], x[1], x[2], x[3]).
However, I wonder if it's still a good idea to lower svdupq for predicates,
so that svdupq (and other intrinsics) is represented using GIMPLE constructs
as far as possible?  In the attached patch, it simply punts if the type
suffix is b, and doesn't try to fold the call.
>
> > +
> > +    if (BYTES_BIG_ENDIAN)
> > +      return NULL;
> > +
> > +    tree lhs = gimple_call_lhs (f.call);
> > +    if (TREE_CODE (lhs) != SSA_NAME)
> > +      return NULL;
>
> Why is this check needed?
This was a left-over from something else I was doing wrongly. Sorry I
forgot to remove it.
>
> > +    tree lhs_type = TREE_TYPE (lhs);
> > +    tree elt_type = TREE_TYPE (lhs_type);
> > +    scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
>
> Aren't we already dealing with a scalar type here?  I'd have expected
> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
Ugh, sorry, I had most of the code copied over from svld1rq_impl for
building VEC_PERM_EXPR with VLA mask and adjusted it,
but overlooked this :/
>
> > +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> > +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> > +
> > +    unsigned nargs = gimple_call_num_args (f.call);
> > +    vec<constructor_elt, va_gc> *v;
> > +    vec_alloc (v, nargs);
> > +    for (unsigned i = 0; i < nargs; i++)
> > +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> > +    tree vec = build_constructor (vq_type, v);
> > +
> > +    tree access_type
> > +      = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
>
> Nit: seems to fit on one line.  But do we need this?  We're not accessing
> memory, so I'd have expected vq_type to be OK as-is.
>
> > +    tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> > +    gimple *g = gimple_build_assign (tmp, vec);
> > +
> > +    gimple_seq stmts = NULL;
> > +    gimple_seq_add_stmt_without_update (&stmts, g);
> > +
> > +    int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
>
> Looks like we should be able to use nargs instead of source_nelts.
Does the attached patch look OK ?

Thanks,
Prathamesh
>

> Thanks,
> Richard
>
> > +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > +    vec_perm_builder sel (lhs_len, source_nelts, 1);
> > +    for (int i = 0; i < source_nelts; i++)
> > +      sel.quick_push (i);
> > +
> > +    vec_perm_indices indices (sel, 1, source_nelts);
> > +    tree mask_type = build_vector_type (ssizetype, lhs_len);
> > +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
> > +
> > +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> > +    gimple_seq_add_stmt_without_update (&stmts, g2);
> > +    gsi_replace_with_seq (f.gsi, stmts, false);
> > +    return g2;
> > +  }
> > +
> >  public:
> >    gimple *
> >    fold (gimple_folder &f) const override
> > @@ -832,7 +888,7 @@ public:
> >        {
> >       tree elt = gimple_call_arg (f.call, i);
> >       if (!CONSTANT_CLASS_P (elt))
> > -       return NULL;
> > +       return fold_nonconst_dupq (f, factor);
> >       builder.quick_push (elt);
> >       for (unsigned int j = 1; j < factor; ++j)
> >         builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> > new file mode 100644
> > index 00000000000..f19f8deb1e5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include <arm_sve.h>
> > +#include <arm_neon.h>
> > +
> > +svint8_t f_s8(int8x16_t x)
> > +{
> > +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> > +                 x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> > +}
> > +
> > +svint16_t f_s16(int16x8_t x)
> > +{
> > +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
> > +}
> > +
> > +svint32_t f_s32(int32x4_t x)
> > +{
> > +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > +}
> > +
> > +svint64_t f_s64(int64x2_t x)
> > +{
> > +  return svdupq_s64 (x[0], x[1]);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
> > +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
[SVE] Fold svdupq to VEC_PERM_EXPR if elements are not constant.

gcc/ChangeLog:
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svdupq_impl::fold_nonconst_dupq): New method.
	(svdupq_impl::fold): Call fold_nonconst_dupq.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index cd9cace3c9b..1732bf8be61 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -817,6 +817,52 @@ public:
 
 class svdupq_impl : public quiet<function_base>
 {
+private:
+  gimple *
+  fold_nonconst_dupq (gimple_folder &f) const
+  {
+    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
+       tmp = {arg0, arg1, ..., arg<N-1>}
+       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
+
+    if (f.type_suffix (0).bool_p
+	|| BYTES_BIG_ENDIAN)
+      return NULL;
+
+    tree lhs = gimple_call_lhs (f.call);
+    tree lhs_type = TREE_TYPE (lhs);
+    tree elt_type = TREE_TYPE (lhs_type);
+    scalar_mode elt_mode = SCALAR_TYPE_MODE (elt_type); 
+    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
+
+    unsigned nargs = gimple_call_num_args (f.call);
+    vec<constructor_elt, va_gc> *v;
+    vec_alloc (v, nargs);
+    for (unsigned i = 0; i < nargs; i++)
+      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
+    tree vec = build_constructor (vq_type, v);
+    tree tmp = make_ssa_name_fn (cfun, vq_type, 0);
+    gimple *g = gimple_build_assign (tmp, vec);
+
+    gimple_seq stmts = NULL;
+    gimple_seq_add_stmt_without_update (&stmts, g);
+
+    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
+    vec_perm_builder sel (lhs_len, nargs, 1);
+    for (unsigned i = 0; i < nargs; i++)
+      sel.quick_push (i);
+
+    vec_perm_indices indices (sel, 1, nargs);
+    tree mask_type = build_vector_type (ssizetype, lhs_len);
+    tree mask = vec_perm_indices_to_tree (mask_type, indices);
+
+    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
+    gimple_seq_add_stmt_without_update (&stmts, g2);
+    gsi_replace_with_seq (f.gsi, stmts, false);
+    return g2;
+  }
+
 public:
   gimple *
   fold (gimple_folder &f) const override
@@ -832,7 +878,7 @@ public:
       {
 	tree elt = gimple_call_arg (f.call, i);
 	if (!CONSTANT_CLASS_P (elt))
-	  return NULL;
+	  return fold_nonconst_dupq (f);
 	builder.quick_push (elt);
 	for (unsigned int j = 1; j < factor; ++j)
 	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
new file mode 100644
index 00000000000..f19f8deb1e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include <arm_sve.h>
+#include <arm_neon.h>
+
+svint8_t f_s8(int8x16_t x)
+{
+  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
+		    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
+}
+
+svint16_t f_s16(int16x8_t x)
+{
+  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
+}
+
+svint32_t f_s32(int32x4_t x)
+{
+  return svdupq_s32 (x[0], x[1], x[2], x[3]);
+}
+
+svint64_t f_s64(int64x2_t x)
+{
+  return svdupq_s64 (x[0], x[1]);
+}
+
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
+
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
  
Richard Sandiford April 6, 2023, 10:34 a.m. UTC | #23
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index cd9cace3c9b..3de79060619 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -817,6 +817,62 @@ public:
>> >
>> >  class svdupq_impl : public quiet<function_base>
>> >  {
>> > +private:
>> > +  gimple *
>> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
>> > +  {
>> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
>> > +       tmp = {arg0, arg1, ..., arg<N-1>}
>> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
>> > +
>> > +    /* TODO: Revisit to handle factor by padding zeros.  */
>> > +    if (factor > 1)
>> > +      return NULL;
>>
>> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
>> factor != 1?  Do we generate good code for b8, where factor should be 1?
> Hi,
> It generates the following code for svdup_n_b8:
> https://pastebin.com/ypYt590c

Hmm, yeah, not pretty :-)  But it's not pretty without it either.

> I suppose lowering to ctor+vec_perm_expr is not really useful
> for this case because it won't simplify ctor, unlike the above case of
> svdupq_s32 (x[0], x[1], x[2], x[3]);
> However I wonder if it's still a good idea to lower svdupq for predicates, for
> representing svdupq (or other intrinsics) using GIMPLE constructs as
> far as possible ?

It's possible, but I think we'd need an example in which it's a clear
benefit.

> In the attached patch, it simply punts if the type
> suffix is b,
> and doesn't try to fold the call.

Yeah, think that's best for now.

>> > +
>> > +    if (BYTES_BIG_ENDIAN)
>> > +      return NULL;
>> > +
>> > +    tree lhs = gimple_call_lhs (f.call);
>> > +    if (TREE_CODE (lhs) != SSA_NAME)
>> > +      return NULL;
>>
>> Why is this check needed?
> This was a left-over from something else I was doing wrongly. Sorry I
> forgot to remove it.
>>
>> > +    tree lhs_type = TREE_TYPE (lhs);
>> > +    tree elt_type = TREE_TYPE (lhs_type);
>> > +    scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
>>
>> Aren't we already dealing with a scalar type here?  I'd have expected
>> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
> Ugh, sorry, I had most of the code copied over from svld1rq_impl for
> building VEC_PERM_EXPR with VLA mask and adjusted it,
> but overlooked this :/
>>
>> > +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
>> > +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
>> > +
>> > +    unsigned nargs = gimple_call_num_args (f.call);
>> > +    vec<constructor_elt, va_gc> *v;
>> > +    vec_alloc (v, nargs);
>> > +    for (unsigned i = 0; i < nargs; i++)
>> > +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
>> > +    tree vec = build_constructor (vq_type, v);
>> > +
>> > +    tree access_type
>> > +      = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
>>
>> Nit: seems to fit on one line.  But do we need this?  We're not accessing
>> memory, so I'd have expected vq_type to be OK as-is.
>>
>> > +    tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> > +    gimple *g = gimple_build_assign (tmp, vec);
>> > +
>> > +    gimple_seq stmts = NULL;
>> > +    gimple_seq_add_stmt_without_update (&stmts, g);
>> > +
>> > +    int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
>>
>> Looks like we should be able to use nargs instead of source_nelts.
> Does the attached patch look OK ?
>
> Thanks,
> Prathamesh
>>
>
>> Thanks,
>> Richard
>>
>> > +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
>> > +    vec_perm_builder sel (lhs_len, source_nelts, 1);
>> > +    for (int i = 0; i < source_nelts; i++)
>> > +      sel.quick_push (i);
>> > +
>> > +    vec_perm_indices indices (sel, 1, source_nelts);
>> > +    tree mask_type = build_vector_type (ssizetype, lhs_len);
>> > +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
>> > +
>> > +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
>> > +    gimple_seq_add_stmt_without_update (&stmts, g2);
>> > +    gsi_replace_with_seq (f.gsi, stmts, false);
>> > +    return g2;
>> > +  }
>> > +
>> >  public:
>> >    gimple *
>> >    fold (gimple_folder &f) const override
>> > @@ -832,7 +888,7 @@ public:
>> >        {
>> >       tree elt = gimple_call_arg (f.call, i);
>> >       if (!CONSTANT_CLASS_P (elt))
>> > -       return NULL;
>> > +       return fold_nonconst_dupq (f, factor);
>> >       builder.quick_push (elt);
>> >       for (unsigned int j = 1; j < factor; ++j)
>> >         builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
>> > new file mode 100644
>> > index 00000000000..f19f8deb1e5
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
>> > @@ -0,0 +1,31 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
>> > +
>> > +#include <arm_sve.h>
>> > +#include <arm_neon.h>
>> > +
>> > +svint8_t f_s8(int8x16_t x)
>> > +{
>> > +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
>> > +                 x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
>> > +}
>> > +
>> > +svint16_t f_s16(int16x8_t x)
>> > +{
>> > +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
>> > +}
>> > +
>> > +svint32_t f_s32(int32x4_t x)
>> > +{
>> > +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
>> > +}
>> > +
>> > +svint64_t f_s64(int64x2_t x)
>> > +{
>> > +  return svdupq_s64 (x[0], x[1]);
>> > +}
>> > +
>> > +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
>> > +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
>> > +
>> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
>
> [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
>
> gcc/ChangeLog:
> 	* config/aarch64/aarch64-sve-builtins-base.cc
> 	(svdupq_impl::fold_nonconst_dupq): New method.
> 	(svdupq_impl::fold): Call fold_nonconst_dupq.
>
> gcc/testsuite/ChangeLog:
> 	* gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.

OK for GCC 14, thanks.

Richard

> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index cd9cace3c9b..1732bf8be61 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -817,6 +817,52 @@ public:
>  
>  class svdupq_impl : public quiet<function_base>
>  {
> +private:
> +  gimple *
> +  fold_nonconst_dupq (gimple_folder &f) const
> +  {
> +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> +       tmp = {arg0, arg1, ..., arg<N-1>}
> +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> +
> +    if (f.type_suffix (0).bool_p
> +	|| BYTES_BIG_ENDIAN)
> +      return NULL;
> +
> +    tree lhs = gimple_call_lhs (f.call);
> +    tree lhs_type = TREE_TYPE (lhs);
> +    tree elt_type = TREE_TYPE (lhs_type);
> +    scalar_mode elt_mode = SCALAR_TYPE_MODE (elt_type); 
> +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> +
> +    unsigned nargs = gimple_call_num_args (f.call);
> +    vec<constructor_elt, va_gc> *v;
> +    vec_alloc (v, nargs);
> +    for (unsigned i = 0; i < nargs; i++)
> +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> +    tree vec = build_constructor (vq_type, v);
> +    tree tmp = make_ssa_name_fn (cfun, vq_type, 0);
> +    gimple *g = gimple_build_assign (tmp, vec);
> +
> +    gimple_seq stmts = NULL;
> +    gimple_seq_add_stmt_without_update (&stmts, g);
> +
> +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +    vec_perm_builder sel (lhs_len, nargs, 1);
> +    for (unsigned i = 0; i < nargs; i++)
> +      sel.quick_push (i);
> +
> +    vec_perm_indices indices (sel, 1, nargs);
> +    tree mask_type = build_vector_type (ssizetype, lhs_len);
> +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +
> +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> +    gimple_seq_add_stmt_without_update (&stmts, g2);
> +    gsi_replace_with_seq (f.gsi, stmts, false);
> +    return g2;
> +  }
> +
>  public:
>    gimple *
>    fold (gimple_folder &f) const override
> @@ -832,7 +878,7 @@ public:
>        {
>  	tree elt = gimple_call_arg (f.call, i);
>  	if (!CONSTANT_CLASS_P (elt))
> -	  return NULL;
> +	  return fold_nonconst_dupq (f);
>  	builder.quick_push (elt);
>  	for (unsigned int j = 1; j < factor; ++j)
>  	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> new file mode 100644
> index 00000000000..f19f8deb1e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +
> +#include <arm_sve.h>
> +#include <arm_neon.h>
> +
> +svint8_t f_s8(int8x16_t x)
> +{
> +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> +		    x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> +}
> +
> +svint16_t f_s16(int16x8_t x)
> +{
> +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
> +}
> +
> +svint32_t f_s32(int32x4_t x)
> +{
> +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
> +}
> +
> +svint64_t f_s64(int64x2_t x)
> +{
> +  return svdupq_s64 (x[0], x[1]);
> +}
> +
> +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
> +
> +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
  
Prathamesh Kulkarni April 6, 2023, 11:21 a.m. UTC | #24
On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > index cd9cace3c9b..3de79060619 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > @@ -817,6 +817,62 @@ public:
> >> >
> >> >  class svdupq_impl : public quiet<function_base>
> >> >  {
> >> > +private:
> >> > +  gimple *
> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> >> > +  {
> >> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> >> > +       tmp = {arg0, arg1, ..., arg<N-1>}
> >> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> >> > +
> >> > +    /* TODO: Revisit to handle factor by padding zeros.  */
> >> > +    if (factor > 1)
> >> > +      return NULL;
> >>
> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
> > Hi,
> > It generates the following code for svdup_n_b8:
> > https://pastebin.com/ypYt590c
>
> Hmm, yeah, not pretty :-)  But it's not pretty without either.
>
> > I suppose lowering to ctor+vec_perm_expr is not really useful
> > for this case because it won't simplify ctor, unlike the above case of
> > svdupq_s32 (x[0], x[1], x[2], x[3]);
> > However I wonder if it's still a good idea to lower svdupq for predicates, for
> > representing svdupq (or other intrinsics) using GIMPLE constructs as
> > far as possible ?
>
> It's possible, but I think we'd need an example in which its a clear
> benefit.
Sorry, I posted the wrong test case above.
For the following test:
svbool_t f(uint8x16_t x)
{
  return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
                      x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
}

Code-gen:
https://pastebin.com/maexgeJn

I suppose it's equivalent to the following?

svbool_t f2(uint8x16_t x)
{
  svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2], (bool) x[3],
                               (bool) x[4], (bool) x[5], (bool) x[6], (bool) x[7],
                               (bool) x[8], (bool) x[9], (bool) x[10], (bool) x[11],
                               (bool) x[12], (bool) x[13], (bool) x[14], (bool) x[15]);
  return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
}

which generates:
f2:
.LFB3901:
        .cfi_startproc
        movi    v1.16b, 0x1
        ptrue   p0.b, all
        cmeq    v0.16b, v0.16b, #0
        bic     v0.16b, v1.16b, v0.16b
        dup     z0.q, z0.q[0]
        cmpne   p0.b, p0/z, z0.b, #0
        ret

Thanks,
Prathamesh
>
> > In the attached patch, it simply punts if the type
> > suffix is b,
> > and doesn't try to fold the call.
>
> Yeah, think that's best for now.
>
> >> > +
> >> > +    if (BYTES_BIG_ENDIAN)
> >> > +      return NULL;
> >> > +
> >> > +    tree lhs = gimple_call_lhs (f.call);
> >> > +    if (TREE_CODE (lhs) != SSA_NAME)
> >> > +      return NULL;
> >>
> >> Why is this check needed?
> > This was a left-over from something else I was doing wrongly. Sorry I
> > forgot to remove it.
> >>
> >> > +    tree lhs_type = TREE_TYPE (lhs);
> >> > +    tree elt_type = TREE_TYPE (lhs_type);
> >> > +    scalar_mode elt_mode = GET_MODE_INNER (TYPE_MODE (elt_type));
> >>
> >> Aren't we already dealing with a scalar type here?  I'd have expected
> >> SCALAR_TYPE_MODE rather than GET_MODE_INNER (TYPE_MODE ...).
> > Ugh, sorry, I had most of the code copied over from svld1rq_impl for
> > building VEC_PERM_EXPR with VLA mask and adjusted it,
> > but overlooked this :/
> >>
> >> > +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> >> > +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> >> > +
> >> > +    unsigned nargs = gimple_call_num_args (f.call);
> >> > +    vec<constructor_elt, va_gc> *v;
> >> > +    vec_alloc (v, nargs);
> >> > +    for (unsigned i = 0; i < nargs; i++)
> >> > +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> >> > +    tree vec = build_constructor (vq_type, v);
> >> > +
> >> > +    tree access_type
> >> > +      = build_aligned_type (vq_type, TYPE_ALIGN (elt_type));
> >>
> >> Nit: seems to fit on one line.  But do we need this?  We're not accessing
> >> memory, so I'd have expected vq_type to be OK as-is.
> >>
> >> > +    tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> > +    gimple *g = gimple_build_assign (tmp, vec);
> >> > +
> >> > +    gimple_seq stmts = NULL;
> >> > +    gimple_seq_add_stmt_without_update (&stmts, g);
> >> > +
> >> > +    int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> >>
> >> Looks like we should be able to use nargs instead of source_nelts.
> > Does the attached patch look OK ?
> >
> > Thanks,
> > Prathamesh
> >>
> >
> >> Thanks,
> >> Richard
> >>
> >> > +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> >> > +    vec_perm_builder sel (lhs_len, source_nelts, 1);
> >> > +    for (int i = 0; i < source_nelts; i++)
> >> > +      sel.quick_push (i);
> >> > +
> >> > +    vec_perm_indices indices (sel, 1, source_nelts);
> >> > +    tree mask_type = build_vector_type (ssizetype, lhs_len);
> >> > +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
> >> > +
> >> > +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> >> > +    gimple_seq_add_stmt_without_update (&stmts, g2);
> >> > +    gsi_replace_with_seq (f.gsi, stmts, false);
> >> > +    return g2;
> >> > +  }
> >> > +
> >> >  public:
> >> >    gimple *
> >> >    fold (gimple_folder &f) const override
> >> > @@ -832,7 +888,7 @@ public:
> >> >        {
> >> >       tree elt = gimple_call_arg (f.call, i);
> >> >       if (!CONSTANT_CLASS_P (elt))
> >> > -       return NULL;
> >> > +       return fold_nonconst_dupq (f, factor);
> >> >       builder.quick_push (elt);
> >> >       for (unsigned int j = 1; j < factor; ++j)
> >> >         builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> >> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> >> > new file mode 100644
> >> > index 00000000000..f19f8deb1e5
> >> > --- /dev/null
> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> >> > @@ -0,0 +1,31 @@
> >> > +/* { dg-do compile } */
> >> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> >> > +
> >> > +#include <arm_sve.h>
> >> > +#include <arm_neon.h>
> >> > +
> >> > +svint8_t f_s8(int8x16_t x)
> >> > +{
> >> > +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> >> > +                 x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> >> > +}
> >> > +
> >> > +svint16_t f_s16(int16x8_t x)
> >> > +{
> >> > +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
> >> > +}
> >> > +
> >> > +svint32_t f_s32(int32x4_t x)
> >> > +{
> >> > +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
> >> > +}
> >> > +
> >> > +svint64_t f_s64(int64x2_t x)
> >> > +{
> >> > +  return svdupq_s64 (x[0], x[1]);
> >> > +}
> >> > +
> >> > +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
> >> > +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
> >> > +
> >> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
> >
> > [SVE] Fold svld1rq to VEC_PERM_EXPR if elements are not constant.
> >
> > gcc/ChangeLog:
> >       * config/aarch64/aarch64-sve-builtins-base.cc
> >       (svdupq_impl::fold_nonconst_dupq): New method.
> >       (svdupq_impl::fold): Call fold_nonconst_dupq.
> >
> > gcc/testsuite/ChangeLog:
> >       * gcc.target/aarch64/sve/acle/general/dupq_11.c: New test.
>
> OK for GCC 14, thanks.
>
> Richard
>
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index cd9cace3c9b..1732bf8be61 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -817,6 +817,52 @@ public:
> >
> >  class svdupq_impl : public quiet<function_base>
> >  {
> > +private:
> > +  gimple *
> > +  fold_nonconst_dupq (gimple_folder &f) const
> > +  {
> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> > +       tmp = {arg0, arg1, ..., arg<N-1>}
> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> > +
> > +    if (f.type_suffix (0).bool_p
> > +     || BYTES_BIG_ENDIAN)
> > +      return NULL;
> > +
> > +    tree lhs = gimple_call_lhs (f.call);
> > +    tree lhs_type = TREE_TYPE (lhs);
> > +    tree elt_type = TREE_TYPE (lhs_type);
> > +    scalar_mode elt_mode = SCALAR_TYPE_MODE (elt_type);
> > +    machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
> > +    tree vq_type = build_vector_type_for_mode (elt_type, vq_mode);
> > +
> > +    unsigned nargs = gimple_call_num_args (f.call);
> > +    vec<constructor_elt, va_gc> *v;
> > +    vec_alloc (v, nargs);
> > +    for (unsigned i = 0; i < nargs; i++)
> > +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, gimple_call_arg (f.call, i));
> > +    tree vec = build_constructor (vq_type, v);
> > +    tree tmp = make_ssa_name_fn (cfun, vq_type, 0);
> > +    gimple *g = gimple_build_assign (tmp, vec);
> > +
> > +    gimple_seq stmts = NULL;
> > +    gimple_seq_add_stmt_without_update (&stmts, g);
> > +
> > +    poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > +    vec_perm_builder sel (lhs_len, nargs, 1);
> > +    for (unsigned i = 0; i < nargs; i++)
> > +      sel.quick_push (i);
> > +
> > +    vec_perm_indices indices (sel, 1, nargs);
> > +    tree mask_type = build_vector_type (ssizetype, lhs_len);
> > +    tree mask = vec_perm_indices_to_tree (mask_type, indices);
> > +
> > +    gimple *g2 = gimple_build_assign (lhs, VEC_PERM_EXPR, tmp, tmp, mask);
> > +    gimple_seq_add_stmt_without_update (&stmts, g2);
> > +    gsi_replace_with_seq (f.gsi, stmts, false);
> > +    return g2;
> > +  }
> > +
> >  public:
> >    gimple *
> >    fold (gimple_folder &f) const override
> > @@ -832,7 +878,7 @@ public:
> >        {
> >       tree elt = gimple_call_arg (f.call, i);
> >       if (!CONSTANT_CLASS_P (elt))
> > -       return NULL;
> > +       return fold_nonconst_dupq (f);
> >       builder.quick_push (elt);
> >       for (unsigned int j = 1; j < factor; ++j)
> >         builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> > new file mode 100644
> > index 00000000000..f19f8deb1e5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_11.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include <arm_sve.h>
> > +#include <arm_neon.h>
> > +
> > +svint8_t f_s8(int8x16_t x)
> > +{
> > +  return svdupq_s8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> > +                 x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]);
> > +}
> > +
> > +svint16_t f_s16(int16x8_t x)
> > +{
> > +  return svdupq_s16 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]);
> > +}
> > +
> > +svint32_t f_s32(int32x4_t x)
> > +{
> > +  return svdupq_s32 (x[0], x[1], x[2], x[3]);
> > +}
> > +
> > +svint64_t f_s64(int64x2_t x)
> > +{
> > +  return svdupq_s64 (x[0], x[1]);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "VEC_PERM_EXPR" "optimized" } } */
> > +/* { dg-final { scan-tree-dump-not "svdupq" "optimized" } } */
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} 4 } } */
  
Richard Sandiford April 12, 2023, 8:59 a.m. UTC | #25
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
>> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > index cd9cace3c9b..3de79060619 100644
>> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > @@ -817,6 +817,62 @@ public:
>> >> >
>> >> >  class svdupq_impl : public quiet<function_base>
>> >> >  {
>> >> > +private:
>> >> > +  gimple *
>> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
>> >> > +  {
>> >> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
>> >> > +       tmp = {arg0, arg1, ..., arg<N-1>}
>> >> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
>> >> > +
>> >> > +    /* TODO: Revisit to handle factor by padding zeros.  */
>> >> > +    if (factor > 1)
>> >> > +      return NULL;
>> >>
>> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
>> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
>> > Hi,
>> > It generates the following code for svdup_n_b8:
>> > https://pastebin.com/ypYt590c
>>
>> Hmm, yeah, not pretty :-)  But it's not pretty without either.
>>
>> > I suppose lowering to ctor+vec_perm_expr is not really useful
>> > for this case because it won't simplify ctor, unlike the above case of
>> > svdupq_s32 (x[0], x[1], x[2], x[3]);
>> > However I wonder if it's still a good idea to lower svdupq for predicates, for
>> > representing svdupq (or other intrinsics) using GIMPLE constructs as
>> > far as possible ?
>>
>> It's possible, but I think we'd need an example in which its a clear
>> benefit.
> Sorry I posted for wrong test case above.
> For the following test:
> svbool_t f(uint8x16_t x)
> {
>   return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
>                                     x[8], x[9], x[10], x[11], x[12],
> x[13], x[14], x[15]);
> }
>
> Code-gen:
> https://pastebin.com/maexgeJn
>
> I suppose it's equivalent to following ?
>
> svbool_t f2(uint8x16_t x)
> {
>   svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2],
> (bool) x[3],
>                                (bool) x[4], (bool) x[5], (bool) x[6],
> (bool) x[7],
>                                (bool) x[8], (bool) x[9], (bool) x[10],
> (bool) x[11],
>                                (bool) x[12], (bool) x[13], (bool)
> x[14], (bool) x[15]);
>   return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
> }

Yeah, this is essentially the transformation that the svdupq rtl
expander uses.  It would probably be a good idea to do that in
gimple too.

Thanks,
Richard

>
> which generates:
> f2:
> .LFB3901:
>         .cfi_startproc
>         movi    v1.16b, 0x1
>         ptrue   p0.b, all
>         cmeq    v0.16b, v0.16b, #0
>         bic     v0.16b, v1.16b, v0.16b
>         dup     z0.q, z0.q[0]
>         cmpne   p0.b, p0/z, z0.b, #0
>         ret
>
> Thanks,
> Prathamesh
  
Prathamesh Kulkarni April 21, 2023, 7:27 a.m. UTC | #26
On Wed, 12 Apr 2023 at 14:29, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > On Thu, 6 Apr 2023 at 16:05, Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> >> > On Tue, 4 Apr 2023 at 23:35, Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > index cd9cace3c9b..3de79060619 100644
> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > @@ -817,6 +817,62 @@ public:
> >> >> >
> >> >> >  class svdupq_impl : public quiet<function_base>
> >> >> >  {
> >> >> > +private:
> >> >> > +  gimple *
> >> >> > +  fold_nonconst_dupq (gimple_folder &f, unsigned factor) const
> >> >> > +  {
> >> >> > +    /* Lower lhs = svdupq (arg0, arg1, ..., argN} into:
> >> >> > +       tmp = {arg0, arg1, ..., arg<N-1>}
> >> >> > +       lhs = VEC_PERM_EXPR (tmp, tmp, {0, 1, 2, N-1, ...})  */
> >> >> > +
> >> >> > +    /* TODO: Revisit to handle factor by padding zeros.  */
> >> >> > +    if (factor > 1)
> >> >> > +      return NULL;
> >> >>
> >> >> Isn't the key thing here predicate vs. vector rather than factor == 1 vs.
> >> >> factor != 1?  Do we generate good code for b8, where factor should be 1?
> >> > Hi,
> >> > It generates the following code for svdup_n_b8:
> >> > https://pastebin.com/ypYt590c
> >>
> >> Hmm, yeah, not pretty :-)  But it's not pretty without either.
> >>
> >> > I suppose lowering to ctor+vec_perm_expr is not really useful
> >> > for this case because it won't simplify ctor, unlike the above case of
> >> > svdupq_s32 (x[0], x[1], x[2], x[3]);
> >> > However I wonder if it's still a good idea to lower svdupq for predicates, for
> >> > representing svdupq (or other intrinsics) using GIMPLE constructs as
> >> > far as possible ?
> >>
> >> It's possible, but I think we'd need an example in which its a clear
> >> benefit.
> > Sorry I posted for wrong test case above.
> > For the following test:
> > svbool_t f(uint8x16_t x)
> > {
> >   return svdupq_n_b8 (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7],
> >                                     x[8], x[9], x[10], x[11], x[12],
> > x[13], x[14], x[15]);
> > }
> >
> > Code-gen:
> > https://pastebin.com/maexgeJn
> >
> > I suppose it's equivalent to following ?
> >
> > svbool_t f2(uint8x16_t x)
> > {
> >   svuint8_t tmp = svdupq_n_u8 ((bool) x[0], (bool) x[1], (bool) x[2],
> > (bool) x[3],
> >                                (bool) x[4], (bool) x[5], (bool) x[6],
> > (bool) x[7],
> >                                (bool) x[8], (bool) x[9], (bool) x[10],
> > (bool) x[11],
> >                                (bool) x[12], (bool) x[13], (bool)
> > x[14], (bool) x[15]);
> >   return svcmpne_n_u8 (svptrue_b8 (), tmp, 0);
> > }
>
> Yeah, this is essentially the transformation that the svdupq rtl
> expander uses.  It would probably be a good idea to do that in
> gimple too.
Hi,
I tested the interleave+zip1 patch for vector init and it segfaulted
during bootstrap while trying to build
libgfortran/generated/matmul_i2.c.
Rebuilding with --enable-checking=rtl showed an out-of-bounds access in
aarch64_unzip_vector_init in the following hunk:

+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+                                 : XVECEXP (vals, 0, 2 * i + 1);

which is incorrect since it allocates only n/2 elements but iterates over and stores up to n.
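Roughly, the fix is just to bound the loop by the n/2 allocated slots:

+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n/2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);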
The attached patch fixes the issue along these lines; it passed bootstrap,
but resulted in the following fallout during the testsuite run:

1] sve/acle/general/dupq_[1-4].c tests fail.
For the following test:
int32x4_t f(int32_t x)
{
  return (int32x4_t) { x, 1, 2, 3 };
}

Code-gen without patch:
f:
        adrp    x1, .LC0
        ldr     q0, [x1, #:lo12:.LC0]
        ins     v0.s[0], w0
        ret

Code-gen with patch:
f:
        movi    v0.2s, 0x2
        adrp    x1, .LC0
        ldr     d1, [x1, #:lo12:.LC0]
        ins     v0.s[0], w0
        zip1    v0.4s, v0.4s, v1.4s
        ret

This shows fallback_seq_cost = 20 and seq_total_cost = 16,
where seq_total_cost is the cost of the interleave+zip1 sequence
and fallback_seq_cost is the cost of the fallback sequence.
Although the interleave+zip1 sequence comes out cheaper, I am not sure
whether it is actually better in this case ?

2] sve/acle/general/dupq_[5-6].c tests fail:
int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
  return (int32x4_t) { x0, x1, x2, x3 };
}

code-gen without patch:
f:
        fmov    s0, w0
        ins     v0.s[1], w1
        ins     v0.s[2], w2
        ins     v0.s[3], w3
        ret

code-gen with patch:
f:
        fmov    s0, w0
        fmov    s1, w1
        ins     v0.s[1], w2
        ins     v1.s[1], w3
        zip1    v0.4s, v0.4s, v1.4s
        ret

It shows fallback_seq_cost = 28, seq_total_cost = 16

3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
Test case:
void cons2_8_float(float *x, float val0, float val1)
{
#pragma GCC unroll(8)
  for (int i = 0; i < 8 * 2; i += 2) {
    x[i + 0] = val0;
    x[i + 1] = val1;
  }
}

which is lowered to:
void cons2_8_float (float * x, float val0, float val1)
{
  vector(4) float _86;

  <bb 2> [local count: 119292720]:
  _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
  MEM <vector(4) float> [(float *)x_10(D)] = _86;
  MEM <vector(4) float> [(float *)x_10(D) + 16B] = _86;
  MEM <vector(4) float> [(float *)x_10(D) + 32B] = _86;
  MEM <vector(4) float> [(float *)x_10(D) + 48B] = _86;
  return;
}

code-gen without patch:
cons2_8_float:
        dup     v0.4s, v0.s[0]
        ins     v0.s[1], v1.s[0]
        ins     v0.s[3], v1.s[0]
        stp     q0, q0, [x0]
        stp     q0, q0, [x0, 32]
        ret

code-gen with patch:
cons2_8_float:
        dup     v1.2s, v1.s[0]
        dup     v0.2s, v0.s[0]
        zip1    v0.4s, v0.4s, v1.4s
        stp     q0, q0, [x0]
        stp     q0, q0, [x0, 32]
        ret

It shows fallback_seq_cost = 28, seq_total_cost = 16

I think the test fails because it doesn't match:
**      dup     v([0-9]+)\.4s, .*

Would it be OK to amend the test, assuming the code-gen with the patch is better ?

4] aarch64/pr109072_1.c s32x4_3 test fails:
For the following test:
int32x4_t s32x4_3 (int32_t x, int32_t y)
{
  int32_t arr[] = { x, y, y, y };
  return vld1q_s32 (arr);
}

code-gen without patch:
s32x4_3:
        dup     v0.4s, w1
        ins     v0.s[0], w0
        ret

code-gen with patch:
s32x4_3:
        fmov    s1, w1
        fmov    s0, w0
        ins     v0.s[1], v1.s[0]
        dup     v1.2s, v1.s[0]
        zip1    v0.4s, v0.4s, v1.4s
        ret

It shows fallback_seq_cost = 20, seq_total_cost = 16
I am not sure why the interleave+zip1 cost comes out lower than the
fallback sequence cost for this case.
I assume that the fallback sequence is better here ?

PS: The patch for folding svdupq to ctor+vec_perm_expr passes
bootstrap+test without any issues.

Thanks,
Prathamesh

>
> Thanks,
> Richard
>
> >
> > which generates:
> > f2:
> > .LFB3901:
> >         .cfi_startproc
> >         movi    v1.16b, 0x1
> >         ptrue   p0.b, all
> >         cmeq    v0.16b, v0.16b, #0
> >         bic     v0.16b, v1.16b, v0.16b
> >         dup     z0.q, z0.q[0]
> >         cmpne   p0.b, p0/z, z0.b, #0
> >         ret
> >
> > Thanks,
> > Prathamesh
  
Richard Sandiford April 21, 2023, 9:17 a.m. UTC | #27
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> Hi,
> I tested the interleave+zip1 for vector init patch and it segfaulted
> during bootstrap while trying to build
> libgfortran/generated/matmul_i2.c.
> Rebuilding with --enable-checking=rtl showed out of bounds access in
> aarch64_unzip_vector_init in following hunk:
>
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n; i++)
> +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +                                 : XVECEXP (vals, 0, 2 * i + 1);
>
> which is incorrect since it allocates n/2 but iterates and stores upto n.
> The attached patch fixes the issue, which passed bootstrap, however
> resulted in following fallout during testsuite run:
>
> 1] sve/acle/general/dupq_[1-4].c tests fail.
> For the following test:
> int32x4_t f(int32_t x)
> {
>   return (int32x4_t) { x, 1, 2, 3 };
> }
>
> Code-gen without patch:
> f:
>         adrp    x1, .LC0
>         ldr     q0, [x1, #:lo12:.LC0]
>         ins     v0.s[0], w0
>         ret
>
> Code-gen with patch:
> f:
>         movi    v0.2s, 0x2
>         adrp    x1, .LC0
>         ldr     d1, [x1, #:lo12:.LC0]
>         ins     v0.s[0], w0
>         zip1    v0.4s, v0.4s, v1.4s
>         ret
>
> It shows, fallback_seq_cost = 20, seq_total_cost = 16
> where seq_total_cost determines the cost for interleave+zip1 sequence
> and fallback_seq_cost is the cost for fallback sequence.
> Altho it shows lesser cost, I am not sure if the interleave+zip1
> sequence is better in this case ?

Debugging the patch, it looks like this is because the fallback sequence
contains a redundant pseudo-to-pseudo move, which is costed as 1
instruction (4 units).  The RTL equivalent of the:

     movi    v0.2s, 0x2
     ins     v0.s[0], w0

has a similar redundant move, but the cost of that move is subsumed by
the cost of the other arm (the load from LC0), which is costed as 3
instructions (12 units).  So we have 12 + 4 for the parallel version
(correct) but 12 + 4 + 4 for the serial version (one instruction too
many).

The reason we have redundant moves is that the expansion code uses
copy_to_mode_reg to force a value into a register.  This creates a
new pseudo even if the original value was already a register.
Using force_reg removes the moves and makes the test pass.

So I think the first step is to use force_reg instead of
copy_to_mode_reg in aarch64_simd_dup_constant and
aarch64_expand_vector_init (as a preparatory patch).
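That is, something like the following in both functions (just a sketch;
the exact variable names depend on the surrounding code):

-  x = copy_to_mode_reg (inner_mode, x);  /* always creates a new pseudo */
+  x = force_reg (inner_mode, x);         /* reuses x if it is already a REG */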

> 2] sve/acle/general/dupq_[5-6].c tests fail:
> int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> {
>   return (int32x4_t) { x0, x1, x2, x3 };
> }
>
> code-gen without patch:
> f:
>         fmov    s0, w0
>         ins     v0.s[1], w1
>         ins     v0.s[2], w2
>         ins     v0.s[3], w3
>         ret
>
> code-gen with patch:
> f:
>         fmov    s0, w0
>         fmov    s1, w1
>         ins     v0.s[1], w2
>         ins     v1.s[1], w3
>         zip1    v0.4s, v0.4s, v1.4s
>         ret
>
> It shows fallback_seq_cost = 28, seq_total_cost = 16

The zip version still wins after the fix above, but by a lesser amount.
It seems like a borderline case.

>
> 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> Test case:
> void cons2_8_float(float *x, float val0, float val1)
> {
> #pragma GCC unroll(8)
>   for (int i = 0; i < 8 * 2; i += 2) {
>     x[i + 0] = val0;
>     x[i + 1] = val1;
>   }
> }
>
> which is lowered to:
> void cons2_8_float (float * x, float val0, float val1)
> {
>   vector(4) float _86;
>
>   <bb 2> [local count: 119292720]:
>   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
>   MEM <vector(4) float> [(float *)x_10(D)] = _86;
>   MEM <vector(4) float> [(float *)x_10(D) + 16B] = _86;
>   MEM <vector(4) float> [(float *)x_10(D) + 32B] = _86;
>   MEM <vector(4) float> [(float *)x_10(D) + 48B] = _86;
>   return;
> }
>
> code-gen without patch:
> cons2_8_float:
>         dup     v0.4s, v0.s[0]
>         ins     v0.s[1], v1.s[0]
>         ins     v0.s[3], v1.s[0]
>         stp     q0, q0, [x0]
>         stp     q0, q0, [x0, 32]
>         ret
>
> code-gen with patch:
> cons2_8_float:
>         dup     v1.2s, v1.s[0]
>         dup     v0.2s, v0.s[0]
>         zip1    v0.4s, v0.4s, v1.4s
>         stp     q0, q0, [x0]
>         stp     q0, q0, [x0, 32]
>         ret
>
> It shows fallback_seq_cost = 28, seq_total_cost = 16
>
> I think the test fails because it doesn't match:
> **      dup     v([0-9]+)\.4s, .*
>
> Shall it be OK to amend the test assuming code-gen with patch is better ?

Yeah, the new code seems like an improvement.

> 4] aarch64/pr109072_1.c s32x4_3 test fails:
> For the following test:
> int32x4_t s32x4_3 (int32_t x, int32_t y)
> {
>   int32_t arr[] = { x, y, y, y };
>   return vld1q_s32 (arr);
> }
>
> code-gen without patch:
> s32x4_3:
>         dup     v0.4s, w1
>         ins     v0.s[0], w0
>         ret
>
> code-gen with patch:
> s32x4_3:
>         fmov    s1, w1
>         fmov    s0, w0
>         ins     v0.s[1], v1.s[0]
>         dup     v1.2s, v1.s[0]
>         zip1    v0.4s, v0.4s, v1.4s
>         ret
>
> It shows fallback_seq_cost = 20, seq_total_cost = 16
> I am not sure how interleave+zip1 cost is lesser than fallback seq
> cost for this case.
> I assume that the fallback sequence is better here ?

The fix for 1] works for this case too.

Thanks,
Richard
  
Prathamesh Kulkarni April 21, 2023, 3:15 p.m. UTC | #28
On Fri, 21 Apr 2023 at 14:47, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > Hi,
> > I tested the interleave+zip1 for vector init patch and it segfaulted
> > during bootstrap while trying to build
> > libgfortran/generated/matmul_i2.c.
> > Rebuilding with --enable-checking=rtl showed out of bounds access in
> > aarch64_unzip_vector_init in following hunk:
> >
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n; i++)
> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > +                                 : XVECEXP (vals, 0, 2 * i + 1);
> >
> > which is incorrect since it allocates n/2 but iterates and stores upto n.
> > The attached patch fixes the issue, which passed bootstrap, however
> > resulted in following fallout during testsuite run:
> >
> > 1] sve/acle/general/dupq_[1-4].c tests fail.
> > For the following test:
> > int32x4_t f(int32_t x)
> > {
> >   return (int32x4_t) { x, 1, 2, 3 };
> > }
> >
> > Code-gen without patch:
> > f:
> >         adrp    x1, .LC0
> >         ldr     q0, [x1, #:lo12:.LC0]
> >         ins     v0.s[0], w0
> >         ret
> >
> > Code-gen with patch:
> > f:
> >         movi    v0.2s, 0x2
> >         adrp    x1, .LC0
> >         ldr     d1, [x1, #:lo12:.LC0]
> >         ins     v0.s[0], w0
> >         zip1    v0.4s, v0.4s, v1.4s
> >         ret
> >
> > It shows, fallback_seq_cost = 20, seq_total_cost = 16
> > where seq_total_cost determines the cost for interleave+zip1 sequence
> > and fallback_seq_cost is the cost for fallback sequence.
> > Altho it shows lesser cost, I am not sure if the interleave+zip1
> > sequence is better in this case ?
>
> Debugging the patch, it looks like this is because the fallback sequence
> contains a redundant pseudo-to-pseudo move, which is costed as 1
> instruction (4 units).  The RTL equivalent of the:
>
>      movi    v0.2s, 0x2
>      ins     v0.s[0], w0
>
> has a similar redundant move, but the cost of that move is subsumed by
> the cost of the other arm (the load from LC0), which is costed as 3
> instructions (12 units).  So we have 12 + 4 for the parallel version
> (correct) but 12 + 4 + 4 for the serial version (one instruction too
> many).
>
> The reason we have redundant moves is that the expansion code uses
> copy_to_mode_reg to force a value into a register.  This creates a
> new pseudo even if the original value was already a register.
> Using force_reg removes the moves and makes the test pass.
>
> So I think the first step is to use force_reg instead of
> copy_to_mode_reg in aarch64_simd_dup_constant and
> aarch64_expand_vector_init (as a preparatory patch).
Thanks for the clarification!
>
> > 2] sve/acle/general/dupq_[5-6].c tests fail:
> > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> > {
> >   return (int32x4_t) { x0, x1, x2, x3 };
> > }
> >
> > code-gen without patch:
> > f:
> >         fmov    s0, w0
> >         ins     v0.s[1], w1
> >         ins     v0.s[2], w2
> >         ins     v0.s[3], w3
> >         ret
> >
> > code-gen with patch:
> > f:
> >         fmov    s0, w0
> >         fmov    s1, w1
> >         ins     v0.s[1], w2
> >         ins     v1.s[1], w3
> >         zip1    v0.4s, v0.4s, v1.4s
> >         ret
> >
> > It shows fallback_seq_cost = 28, seq_total_cost = 16
>
> The zip verson still wins after the fix above, but by a lesser amount.
> It seems like a borderline case.
>
> >
> > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> > Test case:
> > void cons2_8_float(float *x, float val0, float val1)
> > {
> > #pragma GCC unroll(8)
> >   for (int i = 0; i < 8 * 2; i += 2) {
> >     x[i + 0] = val0;
> >     x[i + 1] = val1;
> >   }
> > }
> >
> > which is lowered to:
> > void cons2_8_float (float * x, float val0, float val1)
> > {
> >   vector(4) float _86;
> >
> >   <bb 2> [local count: 119292720]:
> >   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
> >   MEM <vector(4) float> [(float *)x_10(D)] = _86;
> >   MEM <vector(4) float> [(float *)x_10(D) + 16B] = _86;
> >   MEM <vector(4) float> [(float *)x_10(D) + 32B] = _86;
> >   MEM <vector(4) float> [(float *)x_10(D) + 48B] = _86;
> >   return;
> > }
> >
> > code-gen without patch:
> > cons2_8_float:
> >         dup     v0.4s, v0.s[0]
> >         ins     v0.s[1], v1.s[0]
> >         ins     v0.s[3], v1.s[0]
> >         stp     q0, q0, [x0]
> >         stp     q0, q0, [x0, 32]
> >         ret
> >
> > code-gen with patch:
> > cons2_8_float:
> >         dup     v1.2s, v1.s[0]
> >         dup     v0.2s, v0.s[0]
> >         zip1    v0.4s, v0.4s, v1.4s
> >         stp     q0, q0, [x0]
> >         stp     q0, q0, [x0, 32]
> >         ret
> >
> > It shows fallback_seq_cost = 28, seq_total_cost = 16
> >
> > I think the test fails because it doesn't match:
> > **      dup     v([0-9]+)\.4s, .*
> >
> > Shall it be OK to amend the test assuming code-gen with patch is better ?
>
> Yeah, the new code seems like an improvement.
>
> > 4] aarch64/pr109072_1.c s32x4_3 test fails:
> > For the following test:
> > int32x4_t s32x4_3 (int32_t x, int32_t y)
> > {
> >   int32_t arr[] = { x, y, y, y };
> >   return vld1q_s32 (arr);
> > }
> >
> > code-gen without patch:
> > s32x4_3:
> >         dup     v0.4s, w1
> >         ins     v0.s[0], w0
> >         ret
> >
> > code-gen with patch:
> > s32x4_3:
> >         fmov    s1, w1
> >         fmov    s0, w0
> >         ins     v0.s[1], v1.s[0]
> >         dup     v1.2s, v1.s[0]
> >         zip1    v0.4s, v0.4s, v1.4s
> >         ret
> >
> > It shows fallback_seq_cost = 20, seq_total_cost = 16
> > I am not sure how interleave+zip1 cost is lesser than fallback seq
> > cost for this case.
> > I assume that the fallback sequence is better here ?
>
> The fix for 1] works for this case too.
Indeed, I verified that using force_reg fixes the issues.
I will send a follow-up patch after the preparatory patch that uses force_reg.

Thanks,
Prathamesh
>
> Thanks,
> Richard
  
Prathamesh Kulkarni April 23, 2023, 1:53 a.m. UTC | #29
On Fri, 21 Apr 2023 at 20:45, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Fri, 21 Apr 2023 at 14:47, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > > Hi,
> > > I tested the interleave+zip1 for vector init patch and it segfaulted
> > > during bootstrap while trying to build
> > > libgfortran/generated/matmul_i2.c.
> > > Rebuilding with --enable-checking=rtl showed out of bounds access in
> > > aarch64_unzip_vector_init in following hunk:
> > >
> > > +  rtvec vec = rtvec_alloc (n / 2);
> > > +  for (int i = 0; i < n; i++)
> > > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > > +                                 : XVECEXP (vals, 0, 2 * i + 1);
> > >
> > > which is incorrect since it allocates n/2 but iterates and stores upto n.
> > > The attached patch fixes the issue, which passed bootstrap, however
> > > resulted in following fallout during testsuite run:
> > >
> > > 1] sve/acle/general/dupq_[1-4].c tests fail.
> > > For the following test:
> > > int32x4_t f(int32_t x)
> > > {
> > >   return (int32x4_t) { x, 1, 2, 3 };
> > > }
> > >
> > > Code-gen without patch:
> > > f:
> > >         adrp    x1, .LC0
> > >         ldr     q0, [x1, #:lo12:.LC0]
> > >         ins     v0.s[0], w0
> > >         ret
> > >
> > > Code-gen with patch:
> > > f:
> > >         movi    v0.2s, 0x2
> > >         adrp    x1, .LC0
> > >         ldr     d1, [x1, #:lo12:.LC0]
> > >         ins     v0.s[0], w0
> > >         zip1    v0.4s, v0.4s, v1.4s
> > >         ret
> > >
> > > It shows, fallback_seq_cost = 20, seq_total_cost = 16
> > > where seq_total_cost determines the cost for interleave+zip1 sequence
> > > and fallback_seq_cost is the cost for fallback sequence.
> > > Altho it shows lesser cost, I am not sure if the interleave+zip1
> > > sequence is better in this case ?
> >
> > Debugging the patch, it looks like this is because the fallback sequence
> > contains a redundant pseudo-to-pseudo move, which is costed as 1
> > instruction (4 units).  The RTL equivalent of the:
> >
> >      movi    v0.2s, 0x2
> >      ins     v0.s[0], w0
> >
> > has a similar redundant move, but the cost of that move is subsumed by
> > the cost of the other arm (the load from LC0), which is costed as 3
> > instructions (12 units).  So we have 12 + 4 for the parallel version
> > (correct) but 12 + 4 + 4 for the serial version (one instruction too
> > many).
> >
> > The reason we have redundant moves is that the expansion code uses
> > copy_to_mode_reg to force a value into a register.  This creates a
> > new pseudo even if the original value was already a register.
> > Using force_reg removes the moves and makes the test pass.
> >
> > So I think the first step is to use force_reg instead of
> > copy_to_mode_reg in aarch64_simd_dup_constant and
> > aarch64_expand_vector_init (as a preparatory patch).
> Thanks for the clarification!
> >
> > > 2] sve/acle/general/dupq_[5-6].c tests fail:
> > > int32x4_t f(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
> > > {
> > >   return (int32x4_t) { x0, x1, x2, x3 };
> > > }
> > >
> > > code-gen without patch:
> > > f:
> > >         fmov    s0, w0
> > >         ins     v0.s[1], w1
> > >         ins     v0.s[2], w2
> > >         ins     v0.s[3], w3
> > >         ret
> > >
> > > code-gen with patch:
> > > f:
> > >         fmov    s0, w0
> > >         fmov    s1, w1
> > >         ins     v0.s[1], w2
> > >         ins     v1.s[1], w3
> > >         zip1    v0.4s, v0.4s, v1.4s
> > >         ret
> > >
> > > It shows fallback_seq_cost = 28, seq_total_cost = 16
> >
> > The zip verson still wins after the fix above, but by a lesser amount.
> > It seems like a borderline case.
> >
> > >
> > > 3] aarch64/ldp_stp_16.c's cons2_8_float test fails.
> > > Test case:
> > > void cons2_8_float(float *x, float val0, float val1)
> > > {
> > > #pragma GCC unroll(8)
> > >   for (int i = 0; i < 8 * 2; i += 2) {
> > >     x[i + 0] = val0;
> > >     x[i + 1] = val1;
> > >   }
> > > }
> > >
> > > which is lowered to:
> > > void cons2_8_float (float * x, float val0, float val1)
> > > {
> > >   vector(4) float _86;
> > >
> > >   <bb 2> [local count: 119292720]:
> > >   _86 = {val0_11(D), val1_13(D), val0_11(D), val1_13(D)};
> > >   MEM <vector(4) float> [(float *)x_10(D)] = _86;
> > >   MEM <vector(4) float> [(float *)x_10(D) + 16B] = _86;
> > >   MEM <vector(4) float> [(float *)x_10(D) + 32B] = _86;
> > >   MEM <vector(4) float> [(float *)x_10(D) + 48B] = _86;
> > >   return;
> > > }
> > >
> > > code-gen without patch:
> > > cons2_8_float:
> > >         dup     v0.4s, v0.s[0]
> > >         ins     v0.s[1], v1.s[0]
> > >         ins     v0.s[3], v1.s[0]
> > >         stp     q0, q0, [x0]
> > >         stp     q0, q0, [x0, 32]
> > >         ret
> > >
> > > code-gen with patch:
> > > cons2_8_float:
> > >         dup     v1.2s, v1.s[0]
> > >         dup     v0.2s, v0.s[0]
> > >         zip1    v0.4s, v0.4s, v1.4s
> > >         stp     q0, q0, [x0]
> > >         stp     q0, q0, [x0, 32]
> > >         ret
> > >
> > > It shows fallback_seq_cost = 28, seq_total_cost = 16
> > >
> > > I think the test fails because it doesn't match:
> > > **      dup     v([0-9]+)\.4s, .*
> > >
> > > Shall it be OK to amend the test assuming code-gen with patch is better ?
> >
> > Yeah, the new code seems like an improvement.
> >
> > > 4] aarch64/pr109072_1.c s32x4_3 test fails:
> > > For the following test:
> > > int32x4_t s32x4_3 (int32_t x, int32_t y)
> > > {
> > >   int32_t arr[] = { x, y, y, y };
> > >   return vld1q_s32 (arr);
> > > }
> > >
> > > code-gen without patch:
> > > s32x4_3:
> > >         dup     v0.4s, w1
> > >         ins     v0.s[0], w0
> > >         ret
> > >
> > > code-gen with patch:
> > > s32x4_3:
> > >         fmov    s1, w1
> > >         fmov    s0, w0
> > >         ins     v0.s[1], v1.s[0]
> > >         dup     v1.2s, v1.s[0]
> > >         zip1    v0.4s, v0.4s, v1.4s
> > >         ret
> > >
> > > It shows fallback_seq_cost = 20, seq_total_cost = 16
> > > I am not sure how interleave+zip1 cost is lesser than fallback seq
> > > cost for this case.
> > > I assume that the fallback sequence is better here ?
> >
> > The fix for 1] works for this case too.
> Indeed, I verified using force_reg fixes the issues.
> I will send a follow up patch after the preparatory patch using force_reg.
The attached patch adjusts ldp_stp_16.c and dupq_[5-6].c to scan for
the new code-gen.
It passes bootstrap with no regressions reported in the testsuite results.
For the adjusted tests, it reports "old tests that pass have
disappeared", which I assume is OK
since they now scan for the new code-gen ?
Does the patch look OK to commit ?

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Richard
[aarch64] Recursively initialize even and odd sub-parts and merge with zip1.

gcc/ChangeLog:
	* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Rename
	aarch64_expand_vector_init to this, and remove 	interleaving case.
	Recursively call aarch64_expand_vector_init_fallback, instead of
	aarch64_expand_vector_init.
	(aarch64_unzip_vector_init): New function.
	(aarch64_expand_vector_init): Likewise.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
	code-gen.
	* gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
	* gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
	this.
	* gcc.target/aarch64/vec-init-19.c: New test.
	* gcc.target/aarch64/vec-init-20.c: Likewise.
	* gcc.target/aarch64/vec-init-21.c: Likewise.
	* gcc.target/aarch64/vec-init-22-size.c: Likewise.
	* gcc.target/aarch64/vec-init-22-speed.c: Likewise.
	* gcc.target/aarch64/vec-init-22.h: New header.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index d7e895f8d34..416e062829c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
-/* Expand a vector initialisation sequence, such that TARGET is
-   initialised to contain VALS.  */
+/* A subroutine of aarch64_expand_vector_init, with the same interface.
+   The caller has already tried a divide-and-conquer approach, so do
+   not consider that case here.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
-
-  if (n_elts >= 8)
-    {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
-
-      if (i == n_elts)
-	{
-	  machine_mode mode = GET_MODE (target);
-	  rtx dest[2];
-
-	  for (int i = 0; i < 2; i++)
-	    {
-	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
-	      dest[i] = force_reg (mode, x);
-	    }
-
-	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
-	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
-	  return;
-	}
-    }
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22257,6 +22226,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+/* Return even or odd half of VALS depending on EVEN_P.  */
+
+static rtx
+aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
+				   GET_MODE_BITSIZE (mode).to_constant () / 2);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n/2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialized to contain VALS.  */
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  /* Try decomposing the initializer into even and odd halves and
+     then ZIP them together.  Use the resulting sequence if it is
+     strictly cheaper than loading VALS directly.
+
+     Prefer the fallback sequence in the event of a tie, since it
+     will tend to use fewer registers.  */
+
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 4
+      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx halves[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      rtx new_vals
+	= aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, halves[0], halves[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+  end_sequence ();
+
+  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index 8ab117c4dcd..30c86018773 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -96,10 +96,10 @@ CONS2_FN (4, float);
 
 /*
 ** cons2_8_float:
-**	dup	v([0-9]+)\.4s, .*
+**	dup	v([0-9]+)\.2s, v1.s\[0\]
 **	...
-**	stp	q\1, q\1, \[x0\]
-**	stp	q\1, q\1, \[x0, #?32\]
+**	stp	q0, q0, \[x0\]
+**	stp	q0, q0, \[x0, #?32\]
 **	ret
 */
 CONS2_FN (8, float);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
index 53426c9af5a..c7d6f3ff390 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
@@ -11,7 +11,7 @@ dupq (int x1, int x2, int x3, int x4)
 
 /* { dg-final { scan-assembler-not {\tldr\t} } } */
 /* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]\.4s\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
index dfce5e7a12a..4745a3815b0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
@@ -12,7 +12,7 @@ dupq (int x1, int x2, int x3, int x4)
 
 /* { dg-final { scan-assembler-not {\tldr\t} } } */
 /* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]\.4s\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
similarity index 82%
rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
index ee775048589..e812d3946de 100644
--- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -7,8 +7,8 @@
 /*
 ** foo:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	dup	v[0-9]+\.8h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	dup	v[0-9]+\.4h, w[0-9]+
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
@@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
 /*
 ** foo2:
 **	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	movi	v[0-9]+\.8h, 0x1
+**	dup	v[0-9]+\.4h, w[0-9]+
+**	movi	v[0-9]+\.4h, 0x1
 **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
 **	...
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..e28fdcda29d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..9366ca349b6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	dup	v[0-9]+\.8b, w[0-9]+
+**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..e16459486d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** f_s8:
+**	...
+**	adrp	x[0-9]+, \.LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
+**	ins	v0\.b\[0\], w0
+**	ins	v0\.b\[1\], w1
+**	...
+**	ret
+*/
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
new file mode 100644
index 00000000000..8f35854c008
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	fmov	s0, w0
+**	ins	v0\.h\[1\], w1
+**	ins	v0\.h\[2\], w2
+**	ins	v0\.h\[3\], w3
+**	ins	v0\.h\[4\], w4
+**	ins	v0\.h\[5\], w5
+**	ins	v0\.h\[6\], w6
+**	ins	v0\.h\[7\], w7
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
new file mode 100644
index 00000000000..172d56ffdf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/* Verify that we recursively generate code for even and odd halves
+   instead of fallback code. This is so despite the longer code-gen
+   because it has fewer dependencies and thus has lesser cost.  */
+
+/*
+** f_s16:
+**	...
+**	sxth	w0, w0
+**	sxth	w1, w1
+**	fmov	d0, x0
+**	fmov	d1, x1
+**	ins	v[0-9]+\.h\[1\], w2
+**	ins	v[0-9]+\.h\[1\], w3
+**	ins	v[0-9]+\.h\[2\], w4
+**	ins	v[0-9]+\.h\[2\], w5
+**	ins	v[0-9]+\.h\[3\], w6
+**	ins	v[0-9]+\.h\[3\], w7
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+#include "vec-init-22.h"
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
new file mode 100644
index 00000000000..15b889d4097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
@@ -0,0 +1,7 @@
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford April 24, 2023, 9:29 a.m. UTC | #30
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> [aarch64] Recursively intialize even and odd sub-parts and merge with zip1.
>
> gcc/ChangeLog:
> 	* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Rename
> 	aarch64_expand_vector_init to this, and remove 	interleaving case.
> 	Recursively call aarch64_expand_vector_init_fallback, instead of
> 	aarch64_expand_vector_init.
> 	(aarch64_unzip_vector_init): New function.
> 	(aarch64_expand_vector_init): Likewise.
>
> gcc/testsuite/ChangeLog:
> 	* gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
> 	code-gen.
> 	* gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
> 	* gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
> 	* gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
> 	this.
> 	* gcc.target/aarch64/vec-init-19.c: New test.
> 	* gcc.target/aarch64/vec-init-20.c: Likewise.
> 	* gcc.target/aarch64/vec-init-21.c: Likewise.
> 	* gcc.target/aarch64/vec-init-22-size.c: Likewise.
> 	* gcc.target/aarch64/vec-init-22-speed.c: Likewise.
> 	* gcc.target/aarch64/vec-init-22.h: New header.
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d7e895f8d34..416e062829c 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
>      return NULL_RTX;
>  }
>  
> -/* Expand a vector initialisation sequence, such that TARGET is
> -   initialised to contain VALS.  */
> +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> +   The caller has already tried a divide-and-conquer approach, so do
> +   not consider that case here.  */
>  
>  void
> -aarch64_expand_vector_init (rtx target, rtx vals)
> +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
>  {
>    machine_mode mode = GET_MODE (target);
>    scalar_mode inner_mode = GET_MODE_INNER (mode);
> @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>        return;
>      }
>  
> -  /* Check for interleaving case.
> -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> -     Generate following code:
> -     dup v0.h, x
> -     dup v1.h, y
> -     zip1 v0.h, v0.h, v1.h
> -     for "large enough" initializer.  */
> -
> -  if (n_elts >= 8)
> -    {
> -      int i;
> -      for (i = 2; i < n_elts; i++)
> -	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> -	  break;
> -
> -      if (i == n_elts)
> -	{
> -	  machine_mode mode = GET_MODE (target);
> -	  rtx dest[2];
> -
> -	  for (int i = 0; i < 2; i++)
> -	    {
> -	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> -	      dest[i] = force_reg (mode, x);
> -	    }
> -
> -	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
> -	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> -	  return;
> -	}
> -    }
> -
>    enum insn_code icode = optab_handler (vec_set_optab, mode);
>    gcc_assert (icode != CODE_FOR_nothing);
>  
> @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>  	    }
>  	  XVECEXP (copy, 0, i) = subst;
>  	}
> -      aarch64_expand_vector_init (target, copy);
> +      aarch64_expand_vector_init_fallback (target, copy);
>      }
>  
>    /* Insert the variable lanes directly.  */
> @@ -22257,6 +22226,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
>      }
>  }
>  
> +/* Return even or odd half of VALS depending on EVEN_P.  */
> +
> +static rtx
> +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> +{
> +  int n = XVECLEN (vals, 0);
> +  machine_mode new_mode
> +    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
> +				   GET_MODE_BITSIZE (mode).to_constant () / 2);
> +  rtvec vec = rtvec_alloc (n / 2);
> +  for (int i = 0; i < n/2; i++)

Formatting nit: n / 2

> +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> +				  : XVECEXP (vals, 0, 2 * i + 1);
> +  return gen_rtx_PARALLEL (new_mode, vec);
> +}
> +
> +/* Expand a vector initialisation sequence, such that TARGET is

initialization

> +   initialized to contain VALS.  */
> +
> +void
> +aarch64_expand_vector_init (rtx target, rtx vals)
> +{
> +  /* Try decomposing the initializer into even and odd halves and
> +     then ZIP them together.  Use the resulting sequence if it is
> +     strictly cheaper than loading VALS directly.
> +
> +     Prefer the fallback sequence in the event of a tie, since it
> +     will tend to use fewer registers.  */
> +
> +  machine_mode mode = GET_MODE (target);
> +  int n_elts = XVECLEN (vals, 0);
> +
> +  if (n_elts < 4
> +      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> +    {
> +      aarch64_expand_vector_init_fallback (target, vals);
> +      return;
> +    }
> +
> +  start_sequence ();
> +  rtx halves[2];
> +  unsigned costs[2];
> +  for (int i = 0; i < 2; i++)
> +    {
> +      start_sequence ();
> +      rtx new_vals
> +	= aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);

Just i == 0 would be enough.  Also, this fits on one line.

> +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> +      aarch64_expand_vector_init (tmp_reg, new_vals);
> +      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
> +      rtx_insn *rec_seq = get_insns ();
> +      end_sequence ();
> +      costs[i] = seq_cost (rec_seq, !optimize_size);
> +      emit_insn (rec_seq);
> +    }
> +
> +  rtvec v = gen_rtvec (2, halves[0], halves[1]);
> +  rtx_insn *zip1_insn
> +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> +  unsigned seq_total_cost
> +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
> +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> +
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  start_sequence ();
> +  aarch64_expand_vector_init_fallback (target, vals);
> +  rtx_insn *fallback_seq = get_insns ();
> +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> +  end_sequence ();
> +
> +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> +}
> +
>  /* Emit RTL corresponding to:
>     insr TARGET, ELEM.  */
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> index 8ab117c4dcd..30c86018773 100644
> --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> @@ -96,10 +96,10 @@ CONS2_FN (4, float);
>  
>  /*
>  ** cons2_8_float:
> -**	dup	v([0-9]+)\.4s, .*
> +**	dup	v([0-9]+)\.2s, v1.s\[0\]
>  **	...
> -**	stp	q\1, q\1, \[x0\]
> -**	stp	q\1, q\1, \[x0, #?32\]
> +**	stp	q0, q0, \[x0\]
> +**	stp	q0, q0, \[x0, #?32\]

Leaving the capture in the first line while hard-coding q0 at the end
doesn't look right.  The original was written that way because nothing
guarantees a particular register allocation.

I think this now needs to match more of the sequence.
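E.g. something like the following (illustrative only; the exact pattern
would need checking against the code that is actually generated):

**	dup	v([0-9]+)\.2s, v[0-9]+\.s\[0\]
**	dup	v([0-9]+)\.2s, v[0-9]+\.s\[0\]
**	zip1	v([0-9]+)\.4s, v[0-9]+\.4s, v[0-9]+\.4s
**	stp	q\3, q\3, \[x0\]
**	stp	q\3, q\3, \[x0, #?32\]
**	ret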

> diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> similarity index 82%
> rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> index ee775048589..e812d3946de 100644
> --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -7,8 +7,8 @@
>  /*
>  ** foo:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	dup	v[0-9]+\.8h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	dup	v[0-9]+\.4h, w[0-9]+
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
>  /*
>  ** foo2:
>  **	...
> -**	dup	v[0-9]+\.8h, w[0-9]+
> -**	movi	v[0-9]+\.8h, 0x1
> +**	dup	v[0-9]+\.4h, w[0-9]+
> +**	movi	v[0-9]+\.4h, 0x1
>  **	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
>  **	...
>  **	ret
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> new file mode 100644
> index 00000000000..e28fdcda29d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b

This kind of match is dangerous for a test that enables scheduling,
since the zip sequences start with two independent sequences that
build 64-bit vectors.

Since the lines of the match don't build on each other (e.g. they
don't use captures to ensure that the zip operands are in the right
order), I think it'd be better to use scan-assemblers instead.

There's then no need to match the adrp or the exact addressing
mode of the ldr.  Just {ldr\td[0-9]+, } would be enough.
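For instance (a sketch, not the final directives):

/* { dg-final { scan-assembler {\tdup\tv[0-9]+\.8b, w[0-9]+} } } */
/* { dg-final { scan-assembler {\tldr\td[0-9]+, } } } */
/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} } } */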

Same comments for the other tests.

Please also check that the new tests pass on big-endian targets.

Thanks,
Richard

> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x)
> +{
> +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> +                       x, 5, x, 6, x, 7, x, 8 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> new file mode 100644
> index 00000000000..9366ca349b6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	dup	v[0-9]+\.8b, w[0-9]+
> +**	ldr	d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	zip1	v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> +                       4, y, 5, y, 6, y, 7, y };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> new file mode 100644
> index 00000000000..e16459486d7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** f_s8:
> +**	...
> +**	adrp	x[0-9]+, \.LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> +**	ins	v0\.b\[0\], w0
> +**	ins	v0\.b\[1\], w1
> +**	...
> +**	ret
> +*/
> +
> +int8x16_t f_s8(int8_t x, int8_t y)
> +{
> +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> +                       7, 8, 9, 10, 11, 12, 13, 14 };
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> new file mode 100644
> index 00000000000..8f35854c008
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Os" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that fallback code-sequence is chosen over
> +   recursively generated code-sequence merged with zip1.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	fmov	s0, w0
> +**	ins	v0\.h\[1\], w1
> +**	ins	v0\.h\[2\], w2
> +**	ins	v0\.h\[3\], w3
> +**	ins	v0\.h\[4\], w4
> +**	ins	v0\.h\[5\], w5
> +**	ins	v0\.h\[6\], w6
> +**	ins	v0\.h\[7\], w7
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> new file mode 100644
> index 00000000000..172d56ffdf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/* Verify that we recursively generate code for even and odd halves
> +   instead of fallback code. This is so despite the longer code-gen
> +   because it has fewer dependencies and thus has lesser cost.  */
> +
> +/*
> +** f_s16:
> +**	...
> +**	sxth	w0, w0
> +**	sxth	w1, w1
> +**	fmov	d0, x0
> +**	fmov	d1, x1
> +**	ins	v[0-9]+\.h\[1\], w2
> +**	ins	v[0-9]+\.h\[1\], w3
> +**	ins	v[0-9]+\.h\[2\], w4
> +**	ins	v[0-9]+\.h\[2\], w5
> +**	ins	v[0-9]+\.h\[3\], w6
> +**	ins	v[0-9]+\.h\[3\], w7
> +**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> +**	...
> +**	ret
> +*/
> +
> +#include "vec-init-22.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> new file mode 100644
> index 00000000000..15b889d4097
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> @@ -0,0 +1,7 @@
> +#include <arm_neon.h>
> +
> +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> +{
> +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> +}
  
Prathamesh Kulkarni May 4, 2023, 11:47 a.m. UTC | #31
On Mon, 24 Apr 2023 at 15:00, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > [aarch64] Recursively intialize even and odd sub-parts and merge with zip1.
> >
> > gcc/ChangeLog:
> >       * config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Rename
> >       aarch64_expand_vector_init to this, and remove  interleaving case.
> >       Recursively call aarch64_expand_vector_init_fallback, instead of
> >       aarch64_expand_vector_init.
> >       (aarch64_unzip_vector_init): New function.
> >       (aarch64_expand_vector_init): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >       * gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
> >       code-gen.
> >       * gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
> >       * gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
> >       * gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
> >       this.
> >       * gcc.target/aarch64/vec-init-19.c: New test.
> >       * gcc.target/aarch64/vec-init-20.c: Likewise.
> >       * gcc.target/aarch64/vec-init-21.c: Likewise.
> >       * gcc.target/aarch64/vec-init-22-size.c: Likewise.
> >       * gcc.target/aarch64/vec-init-22-speed.c: Likewise.
> >       * gcc.target/aarch64/vec-init-22.h: New header.
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index d7e895f8d34..416e062829c 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22026,11 +22026,12 @@ aarch64_simd_make_constant (rtx vals)
> >      return NULL_RTX;
> >  }
> >
> > -/* Expand a vector initialisation sequence, such that TARGET is
> > -   initialised to contain VALS.  */
> > +/* A subroutine of aarch64_expand_vector_init, with the same interface.
> > +   The caller has already tried a divide-and-conquer approach, so do
> > +   not consider that case here.  */
> >
> >  void
> > -aarch64_expand_vector_init (rtx target, rtx vals)
> > +aarch64_expand_vector_init_fallback (rtx target, rtx vals)
> >  {
> >    machine_mode mode = GET_MODE (target);
> >    scalar_mode inner_mode = GET_MODE_INNER (mode);
> > @@ -22090,38 +22091,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >        return;
> >      }
> >
> > -  /* Check for interleaving case.
> > -     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
> > -     Generate following code:
> > -     dup v0.h, x
> > -     dup v1.h, y
> > -     zip1 v0.h, v0.h, v1.h
> > -     for "large enough" initializer.  */
> > -
> > -  if (n_elts >= 8)
> > -    {
> > -      int i;
> > -      for (i = 2; i < n_elts; i++)
> > -     if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
> > -       break;
> > -
> > -      if (i == n_elts)
> > -     {
> > -       machine_mode mode = GET_MODE (target);
> > -       rtx dest[2];
> > -
> > -       for (int i = 0; i < 2; i++)
> > -         {
> > -           rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
> > -           dest[i] = force_reg (mode, x);
> > -         }
> > -
> > -       rtvec v = gen_rtvec (2, dest[0], dest[1]);
> > -       emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > -       return;
> > -     }
> > -    }
> > -
> >    enum insn_code icode = optab_handler (vec_set_optab, mode);
> >    gcc_assert (icode != CODE_FOR_nothing);
> >
> > @@ -22243,7 +22212,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >           }
> >         XVECEXP (copy, 0, i) = subst;
> >       }
> > -      aarch64_expand_vector_init (target, copy);
> > +      aarch64_expand_vector_init_fallback (target, copy);
> >      }
> >
> >    /* Insert the variable lanes directly.  */
> > @@ -22257,6 +22226,81 @@ aarch64_expand_vector_init (rtx target, rtx vals)
> >      }
> >  }
> >
> > +/* Return even or odd half of VALS depending on EVEN_P.  */
> > +
> > +static rtx
> > +aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
> > +{
> > +  int n = XVECLEN (vals, 0);
> > +  machine_mode new_mode
> > +    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
> > +                                GET_MODE_BITSIZE (mode).to_constant () / 2);
> > +  rtvec vec = rtvec_alloc (n / 2);
> > +  for (int i = 0; i < n/2; i++)
>
> Formatting nit: n / 2
>
> > +    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
> > +                               : XVECEXP (vals, 0, 2 * i + 1);
> > +  return gen_rtx_PARALLEL (new_mode, vec);
> > +}
> > +
> > +/* Expand a vector initialisation sequence, such that TARGET is
>
> initialization
>
> > +   initialized to contain VALS.  */
> > +
> > +void
> > +aarch64_expand_vector_init (rtx target, rtx vals)
> > +{
> > +  /* Try decomposing the initializer into even and odd halves and
> > +     then ZIP them together.  Use the resulting sequence if it is
> > +     strictly cheaper than loading VALS directly.
> > +
> > +     Prefer the fallback sequence in the event of a tie, since it
> > +     will tend to use fewer registers.  */
> > +
> > +  machine_mode mode = GET_MODE (target);
> > +  int n_elts = XVECLEN (vals, 0);
> > +
> > +  if (n_elts < 4
> > +      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
> > +    {
> > +      aarch64_expand_vector_init_fallback (target, vals);
> > +      return;
> > +    }
> > +
> > +  start_sequence ();
> > +  rtx halves[2];
> > +  unsigned costs[2];
> > +  for (int i = 0; i < 2; i++)
> > +    {
> > +      start_sequence ();
> > +      rtx new_vals
> > +     = aarch64_unzip_vector_init (mode, vals, (i % 2) == 0);
>
> Just i == 0 wouold be enough.  Also, this fits on one line.
>
> > +      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
> > +      aarch64_expand_vector_init (tmp_reg, new_vals);
> > +      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
> > +      rtx_insn *rec_seq = get_insns ();
> > +      end_sequence ();
> > +      costs[i] = seq_cost (rec_seq, !optimize_size);
> > +      emit_insn (rec_seq);
> > +    }
> > +
> > +  rtvec v = gen_rtvec (2, halves[0], halves[1]);
> > +  rtx_insn *zip1_insn
> > +    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
> > +  unsigned seq_total_cost
> > +    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
> > +  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
> > +
> > +  rtx_insn *seq = get_insns ();
> > +  end_sequence ();
> > +
> > +  start_sequence ();
> > +  aarch64_expand_vector_init_fallback (target, vals);
> > +  rtx_insn *fallback_seq = get_insns ();
> > +  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
> > +  end_sequence ();
> > +
> > +  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
> > +}
> > +
> >  /* Emit RTL corresponding to:
> >     insr TARGET, ELEM.  */
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> > index 8ab117c4dcd..30c86018773 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
> > @@ -96,10 +96,10 @@ CONS2_FN (4, float);
> >
> >  /*
> >  ** cons2_8_float:
> > -**   dup     v([0-9]+)\.4s, .*
> > +**   dup     v([0-9]+)\.2s, v1.s\[0\]
> >  **   ...
> > -**   stp     q\1, q\1, \[x0\]
> > -**   stp     q\1, q\1, \[x0, #?32\]
> > +**   stp     q0, q0, \[x0\]
> > +**   stp     q0, q0, \[x0, #?32\]
>
> Leaving the capture in the first line while hard-coding q0 at the end
> doesn't look right.  The original was written that way because nothing
> guarantees a particular register allocation.
>
> I think this now needs to match more of the sequence.
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > similarity index 82%
> > rename from gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > rename to gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > index ee775048589..e812d3946de 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -7,8 +7,8 @@
> >  /*
> >  ** foo:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > @@ -23,8 +23,8 @@ int16x8_t foo(int16_t x, int y)
> >  /*
> >  ** foo2:
> >  **   ...
> > -**   dup     v[0-9]+\.8h, w[0-9]+
> > -**   movi    v[0-9]+\.8h, 0x1
> > +**   dup     v[0-9]+\.4h, w[0-9]+
> > +**   movi    v[0-9]+\.4h, 0x1
> >  **   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> >  **   ...
> >  **   ret
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > new file mode 100644
> > index 00000000000..e28fdcda29d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
>
> This kind of match is dangerous for a test that enables scheduling,
> since the zip sequences start with two independent sequences that
> build 64-bit vectors.
>
> Since the lines of the match don't build on each other (e.g. they
> don't use captures to ensure that the zip operands are in the right
> order), I think it'd be better to use scan-assemblers instead.
>
> There's then no need to match the adrp. or the exact addressing
> mode of the ldr.  Just {ldr\td[0-9]+, } would be enough.
>
> Same comments for the other tests.
>
> Please also check that the new tests pass on big-endian targets.
Hi Richard,
Thanks for the suggestions; I have tried to address them in the attached patch.
I verified that the new tests pass on aarch64_be-linux-gnu, and the patch is
currently under bootstrap+test on aarch64-linux-gnu.
OK to commit if it passes?

Thanks,
Prathamesh
>
> Thanks,
> Richard
>
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x)
> > +{
> > +  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
> > +                       x, 5, x, 6, x, 7, x, 8 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > new file mode 100644
> > index 00000000000..9366ca349b6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   dup     v[0-9]+\.8b, w[0-9]+
> > +**   ldr     d[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   zip1    v[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
> > +                       4, y, 5, y, 6, y, 7, y };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > new file mode 100644
> > index 00000000000..e16459486d7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** f_s8:
> > +**   ...
> > +**   adrp    x[0-9]+, \.LC[0-9]+
> > +**   ldr     q[0-9]+, \[x[0-9]+, #:lo12:\.LC[0-9]+\]
> > +**   ins     v0\.b\[0\], w0
> > +**   ins     v0\.b\[1\], w1
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +int8x16_t f_s8(int8_t x, int8_t y)
> > +{
> > +  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
> > +                       7, 8, 9, 10, 11, 12, 13, 14 };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > new file mode 100644
> > index 00000000000..8f35854c008
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-Os" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that fallback code-sequence is chosen over
> > +   recursively generated code-sequence merged with zip1.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   fmov    s0, w0
> > +**   ins     v0\.h\[1\], w1
> > +**   ins     v0\.h\[2\], w2
> > +**   ins     v0\.h\[3\], w3
> > +**   ins     v0\.h\[4\], w4
> > +**   ins     v0\.h\[5\], w5
> > +**   ins     v0\.h\[6\], w6
> > +**   ins     v0\.h\[7\], w7
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > new file mode 100644
> > index 00000000000..172d56ffdf1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +/* Verify that we recursively generate code for even and odd halves
> > +   instead of fallback code. This is so despite the longer code-gen
> > +   because it has fewer dependencies and thus has lesser cost.  */
> > +
> > +/*
> > +** f_s16:
> > +**   ...
> > +**   sxth    w0, w0
> > +**   sxth    w1, w1
> > +**   fmov    d0, x0
> > +**   fmov    d1, x1
> > +**   ins     v[0-9]+\.h\[1\], w2
> > +**   ins     v[0-9]+\.h\[1\], w3
> > +**   ins     v[0-9]+\.h\[2\], w4
> > +**   ins     v[0-9]+\.h\[2\], w5
> > +**   ins     v[0-9]+\.h\[3\], w6
> > +**   ins     v[0-9]+\.h\[3\], w7
> > +**   zip1    v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
> > +**   ...
> > +**   ret
> > +*/
> > +
> > +#include "vec-init-22.h"
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > new file mode 100644
> > index 00000000000..15b889d4097
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
> > @@ -0,0 +1,7 @@
> > +#include <arm_neon.h>
> > +
> > +int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
> > +                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
> > +{
> > +  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
> > +}
[aarch64] Recursively initialize even and odd sub-parts and merge with zip1.

gcc/ChangeLog:
	* config/aarch64/aarch64.cc (aarch64_expand_vector_init_fallback): Rename
	aarch64_expand_vector_init to this, and remove the interleaving case.
	Recursively call aarch64_expand_vector_init_fallback, instead of
	aarch64_expand_vector_init.
	(aarch64_unzip_vector_init): New function.
	(aarch64_expand_vector_init): Likewise.

gcc/testsuite/ChangeLog:
	* gcc.target/aarch64/ldp_stp_16.c (cons2_8_float): Adjust for new
	code-gen.
	* gcc.target/aarch64/sve/acle/general/dupq_5.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_6.c: Likewise.
	* gcc.target/aarch64/vec-init-18.c: Rename interleave-init-1.c to
	this.
	* gcc.target/aarch64/vec-init-19.c: New test.
	* gcc.target/aarch64/vec-init-20.c: Likewise.
	* gcc.target/aarch64/vec-init-21.c: Likewise.
	* gcc.target/aarch64/vec-init-22-size.c: Likewise.
	* gcc.target/aarch64/vec-init-22-speed.c: Likewise.
	* gcc.target/aarch64/vec-init-22.h: New header.
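
For reference, a rough sketch of the code shape the recursive even/odd split
aims for on a mixed variable/constant initializer such as the one in
vec-init-19.c below; the register numbers and constant-pool label are
illustrative only, which is why the new tests match individual instructions
with scan-assembler patterns rather than full function bodies:

	dup	v0.8b, w0			// even half { x, x, ..., x }
	adrp	x1, .LC0
	ldr	d1, [x1, #:lo12:.LC0]		// odd half { 1, 2, ..., 8 }
	zip1	v0.16b, v0.16b, v1.16b		// interleave the two halves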

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2b0de7ca038..48ece0ad328 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22060,11 +22060,12 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
-/* Expand a vector initialisation sequence, such that TARGET is
-   initialised to contain VALS.  */
+/* A subroutine of aarch64_expand_vector_init, with the same interface.
+   The caller has already tried a divide-and-conquer approach, so do
+   not consider that case here.  */
 
 void
-aarch64_expand_vector_init (rtx target, rtx vals)
+aarch64_expand_vector_init_fallback (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   scalar_mode inner_mode = GET_MODE_INNER (mode);
@@ -22124,38 +22125,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
-  /* Check for interleaving case.
-     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
-     Generate following code:
-     dup v0.h, x
-     dup v1.h, y
-     zip1 v0.h, v0.h, v1.h
-     for "large enough" initializer.  */
-
-  if (n_elts >= 8)
-    {
-      int i;
-      for (i = 2; i < n_elts; i++)
-	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
-	  break;
-
-      if (i == n_elts)
-	{
-	  machine_mode mode = GET_MODE (target);
-	  rtx dest[2];
-
-	  for (int i = 0; i < 2; i++)
-	    {
-	      rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
-	      dest[i] = force_reg (mode, x);
-	    }
-
-	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
-	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
-	  return;
-	}
-    }
-
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22277,7 +22246,7 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	    }
 	  XVECEXP (copy, 0, i) = subst;
 	}
-      aarch64_expand_vector_init (target, copy);
+      aarch64_expand_vector_init_fallback (target, copy);
     }
 
   /* Insert the variable lanes directly.  */
@@ -22291,6 +22260,80 @@ aarch64_expand_vector_init (rtx target, rtx vals)
     }
 }
 
+/* Return even or odd half of VALS depending on EVEN_P.  */
+
+static rtx
+aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
+{
+  int n = XVECLEN (vals, 0);
+  machine_mode new_mode
+    = aarch64_simd_container_mode (GET_MODE_INNER (mode),
+				   GET_MODE_BITSIZE (mode).to_constant () / 2);
+  rtvec vec = rtvec_alloc (n / 2);
+  for (int i = 0; i < n / 2; i++)
+    RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
+				  : XVECEXP (vals, 0, 2 * i + 1);
+  return gen_rtx_PARALLEL (new_mode, vec);
+}
+
+/* Expand a vector initialization sequence, such that TARGET is
+   initialized to contain VALS.  */
+
+void
+aarch64_expand_vector_init (rtx target, rtx vals)
+{
+  /* Try decomposing the initializer into even and odd halves and
+     then ZIP them together.  Use the resulting sequence if it is
+     strictly cheaper than loading VALS directly.
+
+     Prefer the fallback sequence in the event of a tie, since it
+     will tend to use fewer registers.  */
+
+  machine_mode mode = GET_MODE (target);
+  int n_elts = XVECLEN (vals, 0);
+
+  if (n_elts < 4
+      || maybe_ne (GET_MODE_BITSIZE (mode), 128))
+    {
+      aarch64_expand_vector_init_fallback (target, vals);
+      return;
+    }
+
+  start_sequence ();
+  rtx halves[2];
+  unsigned costs[2];
+  for (int i = 0; i < 2; i++)
+    {
+      start_sequence ();
+      rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
+      rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
+      aarch64_expand_vector_init (tmp_reg, new_vals);
+      halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
+      rtx_insn *rec_seq = get_insns ();
+      end_sequence ();
+      costs[i] = seq_cost (rec_seq, !optimize_size);
+      emit_insn (rec_seq);
+    }
+
+  rtvec v = gen_rtvec (2, halves[0], halves[1]);
+  rtx_insn *zip1_insn
+    = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+  unsigned seq_total_cost
+    = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
+  seq_total_cost += insn_cost (zip1_insn, !optimize_size);
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  aarch64_expand_vector_init_fallback (target, vals);
+  rtx_insn *fallback_seq = get_insns ();
+  unsigned fallback_seq_cost = seq_cost (fallback_seq, !optimize_size);
+  end_sequence ();
+
+  emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
+}
+
 /* Emit RTL corresponding to:
    insr TARGET, ELEM.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
deleted file mode 100644
index ee775048589..00000000000
--- a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-O3" } */
-/* { dg-final { check-function-bodies "**" "" "" } } */
-
-#include <arm_neon.h>
-
-/*
-** foo:
-**	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
-**	...
-**	ret
-*/
-
-int16x8_t foo(int16_t x, int y)
-{
-  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
-  return v;
-}
-
-/*
-** foo2:
-**	...
-**	dup	v[0-9]+\.8h, w[0-9]+
-**	movi	v[0-9]+\.8h, 0x1
-**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
-**	...
-**	ret
-*/
-
-int16x8_t foo2(int16_t x) 
-{
-  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
-  return v;
-}
diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
index 8ab117c4dcd..ba14194d0a4 100644
--- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
+++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c
@@ -96,8 +96,9 @@ CONS2_FN (4, float);
 
 /*
 ** cons2_8_float:
-**	dup	v([0-9]+)\.4s, .*
-**	...
+**	dup	v[0-9]+\.2s, v[0-9]+\.s\[0\]
+**	dup	v[0-9]+\.2s, v[0-9]+\.s\[0\]
+**	zip1	v([0-9]+)\.4s, v[0-9]+\.4s, v[0-9]+\.4s
 **	stp	q\1, q\1, \[x0\]
 **	stp	q\1, q\1, \[x0, #?32\]
 **	ret
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
index 53426c9af5a..c7d6f3ff390 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_5.c
@@ -11,7 +11,7 @@ dupq (int x1, int x2, int x3, int x4)
 
 /* { dg-final { scan-assembler-not {\tldr\t} } } */
 /* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]\.4s\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
index dfce5e7a12a..4745a3815b0 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_6.c
@@ -12,7 +12,7 @@ dupq (int x1, int x2, int x3, int x4)
 
 /* { dg-final { scan-assembler-not {\tldr\t} } } */
 /* { dg-final { scan-assembler {, [wx]0\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w1\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w2\n} } } */
-/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[3\], w3\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w2\n} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[1\], w3\n} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]\.4s\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
new file mode 100644
index 00000000000..598a51f17c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int16x8_t foo(int16_t x, int y)
+{
+  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
+  return v;
+}
+
+int16x8_t foo2(int16_t x) 
+{
+  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
+  return v;
+}
+
+/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
+/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-19.c b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
new file mode 100644
index 00000000000..46e9dbf51a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-19.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x)
+{
+  return (int8x16_t) { x, 1, x, 2, x, 3, x, 4,
+                       x, 5, x, 6, x, 7, x, 8 };
+}
+
+/* { dg-final { scan-assembler {\tdup\tv[0-9]+\.8b, w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\td[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-20.c b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
new file mode 100644
index 00000000000..4494121cb2d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-20.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, y, 2, y, 3, y,
+                       4, y, 5, y, 6, y, 7, y };
+}
+
+/* { dg-final { scan-assembler {\tdup\tv[0-9]+\.8b, w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tldr\td[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[0|7\], w[0-9]+} } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-21.c b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
new file mode 100644
index 00000000000..f53e0ed08d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-21.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+int8x16_t f_s8(int8_t x, int8_t y)
+{
+  return (int8x16_t) { x, y, 1, 2, 3, 4, 5, 6,
+                       7, 8, 9, 10, 11, 12, 13, 14 };
+}
+
+/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[0|15\], w0} } } */
+/* { dg-final { scan-assembler {\tins\tv[0-9]+\.b\[1|14\], w1} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
new file mode 100644
index 00000000000..4333ff50205
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-size.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-Os" } */
+
+/* Verify that fallback code-sequence is chosen over
+   recursively generated code-sequence merged with zip1.  */
+
+#include "vec-init-22.h"
+
+/* { dg-final { scan-assembler {\tfmov\ts[0-9]+, w0|w7} } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[[1-7]\], w[0-9]+} 7 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
new file mode 100644
index 00000000000..993ef8c4161
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+/* Verify that we recursively generate code for even and odd halves
+   instead of fallback code. This is so despite the longer code-gen
+   because it has fewer dependencies and thus has lesser cost.  */
+
+#include "vec-init-22.h"
+
+/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[[1-3]\], w[0-9]+} 6 } } */
+/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22.h b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
new file mode 100644
index 00000000000..15b889d4097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22.h
@@ -0,0 +1,7 @@
+#include <arm_neon.h>
+
+int16x8_t f_s16 (int16_t x0, int16_t x1, int16_t x2, int16_t x3,
+                 int16_t x4, int16_t x5, int16_t x6, int16_t x7)
+{
+  return (int16x8_t) { x0, x1, x2, x3, x4, x5, x6, x7 };
+}
  
Richard Sandiford May 11, 2023, 7:07 p.m. UTC | #32
Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> new file mode 100644
> index 00000000000..598a51f17c6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include <arm_neon.h>
> +
> +int16x8_t foo(int16_t x, int y)
> +{
> +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
> +  return v;
> +}
> +
> +int16x8_t foo2(int16_t x) 
> +{
> +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
> +  return v;
> +}
> +
> +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */

Would be good to make this a scan-assembler-times ... 2.

OK with that change.  Thanks for doing this.

Richard
  
Prathamesh Kulkarni May 13, 2023, 9:10 a.m. UTC | #33
On Fri, 12 May 2023 at 00:37, Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-18.c b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > new file mode 100644
> > index 00000000000..598a51f17c6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-18.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +int16x8_t foo(int16_t x, int y)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y};
> > +  return v;
> > +}
> > +
> > +int16x8_t foo2(int16_t x)
> > +{
> > +  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1};
> > +  return v;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tdup\tv[0-9]+\.4h, w[0-9]+} 3 } } */
> > +/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4h, 0x1} } } */
> > +/* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} } } */
>
> Would be good to make this a scan-assembler-times ... 2.
>
> OK with that change.  Thanks for doing this.
Thanks, committed the patch in:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8b18714fbb1ca9812b33b3de75fe6ba4a57d4946
after bootstrap+test on aarch64-linux-gnu, and after verifying that bootstrap
also passes with --enable-checking=all.

Thanks,
Prathamesh
>
> Richard
  

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c91df6f5006..e5dea70e363 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22028,6 +22028,39 @@  aarch64_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  /* Check for interleaving case.
+     For eg if initializer is (int16x8_t) {x, y, x, y, x, y, x, y}.
+     Generate following code:
+     dup v0.h, x
+     dup v1.h, y
+     zip1 v0.h, v0.h, v1.h
+     for "large enough" initializer.  */
+
+  if (n_elts >= 8)
+    {
+      int i;
+      for (i = 2; i < n_elts; i++)
+	if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
+	  break;
+
+      if (i == n_elts)
+	{
+	  machine_mode mode = GET_MODE (target);
+	  rtx dest[2];
+
+	  for (int i = 0; i < 2; i++)
+	    {
+	      rtx x = copy_to_mode_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+	      dest[i] = gen_reg_rtx (mode);
+	      aarch64_emit_move (dest[i], gen_vec_duplicate (mode, x));
+	    }
+
+	  rtvec v = gen_rtvec (2, dest[0], dest[1]);
+	  emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
+	  return;
+	}
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
new file mode 100644
index 00000000000..ee775048589
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/interleave-init-1.c
@@ -0,0 +1,37 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+int16x8_t foo(int16_t x, int y)
+{
+  int16x8_t v = (int16x8_t) {x, y, x, y, x, y, x, y}; 
+  return v;
+}
+
+/*
+** foo2:
+**	...
+**	dup	v[0-9]+\.8h, w[0-9]+
+**	movi	v[0-9]+\.8h, 0x1
+**	zip1	v[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h
+**	...
+**	ret
+*/
+
+int16x8_t foo2(int16_t x) 
+{
+  int16x8_t v = (int16x8_t) {x, 1, x, 1, x, 1, x, 1}; 
+  return v;
+}