rs6000: Load high and low part of 64bit constant independently

Message ID 20220915083052.74903-1-guojiufu@linux.ibm.com
State Accepted, archived
Headers
Series rs6000: Load high and low part of 64bit constant independently |

Checks

Context Check Description
snail/gcc-patches-check success Github commit url

Commit Message

Jiufu Guo Sept. 15, 2022, 8:30 a.m. UTC
  Hi,

For a complicate 64bit constant, blow is one instruction-sequence to
build:
	lis 9,0x800a
	ori 9,9,0xabcd
	sldi 9,9,32
	oris 9,9,0xc167
	ori 9,9,0xfa16

while we can also use below sequence to build:
	lis 9,0xc167
	lis 10,0x800a
	ori 9,9,0xfa16
	ori 10,10,0xabcd
	rldimi 9,10,32,0
This sequence is using 2 registers to build high and low part firstly,
and then merge them.
In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
register with potential register pressure).

Bootstrap and regtest pass on ppc64le.
Is this ok for trunk?


BR,
Jeff(Jiufu)


gcc/ChangeLog:

	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
	constant build.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/parall_5insn_const.c: New test.

---
 gcc/config/rs6000/rs6000.cc                   | 45 +++++++++++--------
 .../gcc.target/powerpc/parall_5insn_const.c   | 27 +++++++++++
 2 files changed, 53 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
  

Comments

Kewen.Lin Nov. 25, 2022, 9:15 a.m. UTC | #1
Hi Jeff,

Sorry for the late review.

on 2022/9/15 16:30, Jiufu Guo wrote:
> Hi,
> 
> For a complicate 64bit constant, blow is one instruction-sequence to
> build:
> 	lis 9,0x800a
> 	ori 9,9,0xabcd
> 	sldi 9,9,32
> 	oris 9,9,0xc167
> 	ori 9,9,0xfa16
> 
> while we can also use below sequence to build:
> 	lis 9,0xc167
> 	lis 10,0x800a
> 	ori 9,9,0xfa16
> 	ori 10,10,0xabcd
> 	rldimi 9,10,32,0
> This sequence is using 2 registers to build high and low part firstly,
> and then merge them.
> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
> register with potential register pressure).
> 
> Bootstrap and regtest pass on ppc64le.
> Is this ok for trunk?
> 
> 
> BR,
> Jeff(Jiufu)
> 
> 
> gcc/ChangeLog:
> 
> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
> 	constant build.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/powerpc/parall_5insn_const.c: New test.
> 
> ---
>  gcc/config/rs6000/rs6000.cc                   | 45 +++++++++++--------
>  .../gcc.target/powerpc/parall_5insn_const.c   | 27 +++++++++++
>  2 files changed, 53 insertions(+), 19 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
> 
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index a656cb32a47..759c6309677 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -10180,26 +10180,33 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
>      }
>    else
>      {
> -      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> -
> -      emit_move_insn (copy_rtx (temp),
> -		      GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
> -      if (ud3 != 0)
> -	emit_move_insn (copy_rtx (temp),
> -			gen_rtx_IOR (DImode, copy_rtx (temp),
> -				     GEN_INT (ud3)));
> +      if (can_create_pseudo_p ())
> +	{
> +	  /* lis A,U4; ori A,U3; lis B,U2; ori B,U1; rldimi A,B,32,0.  */

Nit: A, B are supposed to be H, L?

> +	  rtx H = gen_reg_rtx (DImode);
> +	  rtx L = gen_reg_rtx (DImode);
> +	  HOST_WIDE_INT num = (ud2 << 16) | ud1;
> +	  rs6000_emit_set_long_const (L, (num ^ 0x80000000) - 0x80000000);
> +	  num = (ud4 << 16) | ud3;
> +	  rs6000_emit_set_long_const (H, (num ^ 0x80000000) - 0x80000000);
> +	  emit_insn (gen_rotldi3_insert_3 (dest, H, GEN_INT (32), L,
> +					   GEN_INT (0xffffffff)));
> +	}
> +      else
> +	{
> +	  /* lis A, U4; ori A,U3; rotl A,32; oris A,U2; ori A,U1.  */
                   ~~~ unexpected space?

> +	  emit_move_insn (dest,
> +			  GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
> +	  if (ud3 != 0)
> +	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud3)));
> 
> -      emit_move_insn (ud2 != 0 || ud1 != 0 ? copy_rtx (temp) : dest,
> -		      gen_rtx_ASHIFT (DImode, copy_rtx (temp),
> -				      GEN_INT (32)));
> -      if (ud2 != 0)
> -	emit_move_insn (ud1 != 0 ? copy_rtx (temp) : dest,
> -			gen_rtx_IOR (DImode, copy_rtx (temp),
> -				     GEN_INT (ud2 << 16)));
> -      if (ud1 != 0)
> -	emit_move_insn (dest,
> -			gen_rtx_IOR (DImode, copy_rtx (temp),
> -				     GEN_INT (ud1)));
> +	  emit_move_insn (dest, gen_rtx_ASHIFT (DImode, dest, GEN_INT (32)));
> +	  if (ud2 != 0)
> +	    emit_move_insn (dest,
> +			    gen_rtx_IOR (DImode, dest, GEN_INT (ud2 << 16)));
> +	  if (ud1 != 0)
> +	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud1)));
> +	}
>      }
>  }
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
> new file mode 100644
> index 00000000000..ed8ccc73378
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
> @@ -0,0 +1,27 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */

Why do we need power7 here?

> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
> +/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
> +/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
> +
> +void __attribute__ ((noinline)) foo (unsigned long long *a)
> +{
> +  /* 2lis+2ori+1rldimi for each constant.  */

Nit: seems better to read with "/* 2 lis + 2 ori + 1 rldimi for ..."

BR,
Kewen
  
Jiufu Guo Nov. 25, 2022, 1:21 p.m. UTC | #2
Hi Kewen,

Thanks for your review on this patch!

"Kewen.Lin" <linkw@linux.ibm.com> writes:

> Hi Jeff,
>
> Sorry for the late review.
>
> on 2022/9/15 16:30, Jiufu Guo wrote:
>> Hi,
>> 
>> For a complicate 64bit constant, blow is one instruction-sequence to
>> build:
>> 	lis 9,0x800a
>> 	ori 9,9,0xabcd
>> 	sldi 9,9,32
>> 	oris 9,9,0xc167
>> 	ori 9,9,0xfa16
>> 
>> while we can also use below sequence to build:
>> 	lis 9,0xc167
>> 	lis 10,0x800a
>> 	ori 9,9,0xfa16
>> 	ori 10,10,0xabcd
>> 	rldimi 9,10,32,0
>> This sequence is using 2 registers to build high and low part firstly,
>> and then merge them.
>> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
>> register with potential register pressure).
>> 
>> Bootstrap and regtest pass on ppc64le.
>> Is this ok for trunk?
>> 
>> 
>> BR,
>> Jeff(Jiufu)
>> 
>> 
>> gcc/ChangeLog:
>> 
>> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
>> 	constant build.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> 	* gcc.target/powerpc/parall_5insn_const.c: New test.
>> 
>> ---
>>  gcc/config/rs6000/rs6000.cc                   | 45 +++++++++++--------
>>  .../gcc.target/powerpc/parall_5insn_const.c   | 27 +++++++++++
>>  2 files changed, 53 insertions(+), 19 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> 
>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>> index a656cb32a47..759c6309677 100644
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -10180,26 +10180,33 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
>>      }
>>    else
>>      {
>> -      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
>> -
>> -      emit_move_insn (copy_rtx (temp),
>> -		      GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
>> -      if (ud3 != 0)
>> -	emit_move_insn (copy_rtx (temp),
>> -			gen_rtx_IOR (DImode, copy_rtx (temp),
>> -				     GEN_INT (ud3)));
>> +      if (can_create_pseudo_p ())
>> +	{
>> +	  /* lis A,U4; ori A,U3; lis B,U2; ori B,U1; rldimi A,B,32,0.  */
>
> Nit: A, B are supposed to be H, L?
Yes, thanks for this catch! It should be
/* lis H,U4; ori H,U3; lis L,U2; ori L,U1; rldimi L,H,32,0.  */
>
>> +	  rtx H = gen_reg_rtx (DImode);
>> +	  rtx L = gen_reg_rtx (DImode);
>> +	  HOST_WIDE_INT num = (ud2 << 16) | ud1;
>> +	  rs6000_emit_set_long_const (L, (num ^ 0x80000000) - 0x80000000);
>> +	  num = (ud4 << 16) | ud3;
>> +	  rs6000_emit_set_long_const (H, (num ^ 0x80000000) - 0x80000000);
>> +	  emit_insn (gen_rotldi3_insert_3 (dest, H, GEN_INT (32), L,
>> +					   GEN_INT (0xffffffff)));
>> +	}
>> +      else
>> +	{
>> +	  /* lis A, U4; ori A,U3; rotl A,32; oris A,U2; ori A,U1.  */
>                    ~~~ unexpected space?
Thanks for the catch!
>
>> +	  emit_move_insn (dest,
>> +			  GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
>> +	  if (ud3 != 0)
>> +	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud3)));
>> 
>> -      emit_move_insn (ud2 != 0 || ud1 != 0 ? copy_rtx (temp) : dest,
>> -		      gen_rtx_ASHIFT (DImode, copy_rtx (temp),
>> -				      GEN_INT (32)));
>> -      if (ud2 != 0)
>> -	emit_move_insn (ud1 != 0 ? copy_rtx (temp) : dest,
>> -			gen_rtx_IOR (DImode, copy_rtx (temp),
>> -				     GEN_INT (ud2 << 16)));
>> -      if (ud1 != 0)
>> -	emit_move_insn (dest,
>> -			gen_rtx_IOR (DImode, copy_rtx (temp),
>> -				     GEN_INT (ud1)));
>> +	  emit_move_insn (dest, gen_rtx_ASHIFT (DImode, dest, GEN_INT (32)));
>> +	  if (ud2 != 0)
>> +	    emit_move_insn (dest,
>> +			    gen_rtx_IOR (DImode, dest, GEN_INT (ud2 << 16)));
>> +	  if (ud1 != 0)
>> +	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud1)));
>> +	}
>>      }
>>  }
>> 
>> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> new file mode 100644
>> index 00000000000..ed8ccc73378
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> @@ -0,0 +1,27 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */
>
> Why do we need power7 here?
power8/9 are also ok for this case.  Actually, O just want to
avoid to use new p10 instruction, like "pli", and then selected
an old arch option.

>
>> +/* { dg-require-effective-target has_arch_ppc64 } */
>> +
>> +/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
>> +/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
>> +
>> +void __attribute__ ((noinline)) foo (unsigned long long *a)
>> +{
>> +  /* 2lis+2ori+1rldimi for each constant.  */
>
> Nit: seems better to read with "/* 2 lis + 2 ori + 1 rldimi for ..."
Thanks for your sugguestion!

BR,
Jeff (Jiufu)

I updated it as below:

gcc/ChangeLog:

	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
	constant build.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/parall_5insn_const.c: New test.

---
 gcc/config/rs6000/rs6000.cc                   | 45 +++++++++++--------
 .../gcc.target/powerpc/parall_5insn_const.c   | 27 +++++++++++
 2 files changed, 53 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index a656cb32a47..759c6309677 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -10180,26 +10180,33 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
     }
   else
     {
-      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
-
-      emit_move_insn (copy_rtx (temp),
-		      GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
-      if (ud3 != 0)
-	emit_move_insn (copy_rtx (temp),
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud3)));
+      if (can_create_pseudo_p ())
+	{
+	  /* lis H,U4; ori H,U3; lis L,U2; ori L,U1; rldimi L,H,32,0.  */
+	  rtx H = gen_reg_rtx (DImode);
+	  rtx L = gen_reg_rtx (DImode);
+	  HOST_WIDE_INT num = (ud2 << 16) | ud1;
+	  rs6000_emit_set_long_const (L, (num ^ 0x80000000) - 0x80000000);
+	  num = (ud4 << 16) | ud3;
+	  rs6000_emit_set_long_const (H, (num ^ 0x80000000) - 0x80000000);
+	  emit_insn (gen_rotldi3_insert_3 (dest, H, GEN_INT (32), L,
+					   GEN_INT (0xffffffff)));
+	}
+      else
+	{
+	  /* lis A,U4; ori A,U3; rotl A,32; oris A,U2; ori A,U1.  */
+	  emit_move_insn (dest,
+			  GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
+	  if (ud3 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud3)));
 
-      emit_move_insn (ud2 != 0 || ud1 != 0 ? copy_rtx (temp) : dest,
-		      gen_rtx_ASHIFT (DImode, copy_rtx (temp),
-				      GEN_INT (32)));
-      if (ud2 != 0)
-	emit_move_insn (ud1 != 0 ? copy_rtx (temp) : dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud2 << 16)));
-      if (ud1 != 0)
-	emit_move_insn (dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud1)));
+	  emit_move_insn (dest, gen_rtx_ASHIFT (DImode, dest, GEN_INT (32)));
+	  if (ud2 != 0)
+	    emit_move_insn (dest,
+			    gen_rtx_IOR (DImode, dest, GEN_INT (ud2 << 16)));
+	  if (ud1 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud1)));
+	}
     }
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
new file mode 100644
index 00000000000..ed8ccc73378
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
@@ -0,1 +1,27 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8  -save-temps" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
+
+void __attribute__ ((noinline)) foo (unsigned long long *a)
+{
+  /* 2 lis + 2 ori + 1 rldimi for each constant.  */
+  *a++ = 0x800aabcdc167fa16ULL;
+  *a++ = 0x7543a876867f616ULL;
+}
+
+long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL};
+int
+main ()
+{
+  long long res[2];
+
+  foo (res);
+  if (__builtin_memcmp (res, A, sizeof (res)) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
  
Jiufu Guo Nov. 25, 2022, 1:31 p.m. UTC | #3
Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> Hi Kewen,
>
> Thanks for your review on this patch!
>
> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>
>> Hi Jeff,
>>
>> Sorry for the late review.
>>
>> on 2022/9/15 16:30, Jiufu Guo wrote:
>>> Hi,
>>> 
>>> For a complicate 64bit constant, blow is one instruction-sequence to
>>> build:
>>> 	lis 9,0x800a
>>> 	ori 9,9,0xabcd
>>> 	sldi 9,9,32
>>> 	oris 9,9,0xc167
>>> 	ori 9,9,0xfa16
>>> 
>>> while we can also use below sequence to build:
>>> 	lis 9,0xc167
>>> 	lis 10,0x800a
>>> 	ori 9,9,0xfa16
>>> 	ori 10,10,0xabcd
>>> 	rldimi 9,10,32,0
>>> This sequence is using 2 registers to build high and low part firstly,
>>> and then merge them.
>>> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
>>> register with potential register pressure).
>>> 
>>> Bootstrap and regtest pass on ppc64le.
>>> Is this ok for trunk?
>>> 
>>> 
>>> BR,
>>> Jeff(Jiufu)
>>> 
>>> 
>>> gcc/ChangeLog:
>>> 
>>> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
>>> 	constant build.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>> 	* gcc.target/powerpc/parall_5insn_const.c: New test.
>>> 
>>> ---
cut...
> @@ -0,1 +1,27 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power8  -save-temps" } */
maybe, I could use power7.  Any comments?
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
> +/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
> +/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
> +
> +void __attribute__ ((noinline)) foo (unsigned long long *a)
> +{
> +  /* 2 lis + 2 ori + 1 rldimi for each constant.  */
> +  *a++ = 0x800aabcdc167fa16ULL;
> +  *a++ = 0x7543a876867f616ULL;
> +}
> +
> +long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL};
> +int
> +main ()
> +{
> +  long long res[2];
> +
> +  foo (res);
> +  if (__builtin_memcmp (res, A, sizeof (res)) != 0)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
  
Segher Boessenkool Nov. 25, 2022, 3:46 p.m. UTC | #4
Hi!

On Fri, Nov 25, 2022 at 09:21:21PM +0800, Jiufu Guo wrote:
> "Kewen.Lin" <linkw@linux.ibm.com> writes:
> > on 2022/9/15 16:30, Jiufu Guo wrote:
> >> For a complicate 64bit constant, blow is one instruction-sequence to
> >> build:
> >> 	lis 9,0x800a
> >> 	ori 9,9,0xabcd
> >> 	sldi 9,9,32
> >> 	oris 9,9,0xc167
> >> 	ori 9,9,0xfa16
> >> 
> >> while we can also use below sequence to build:
> >> 	lis 9,0xc167
> >> 	lis 10,0x800a
> >> 	ori 9,9,0xfa16
> >> 	ori 10,10,0xabcd
> >> 	rldimi 9,10,32,0
> >> This sequence is using 2 registers to build high and low part firstly,
> >> and then merge them.
> >> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
> >> register with potential register pressure).

And crucially this patch only uses two registers if can_create_pseudo_p.
Please mention that.

> >> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
> >> 	constant build.

If you don't give details of what this does, just say "Update." please.
But update to what?

"Generate more parallel code if can_create_pseudo_p." maybe?

> >> +	  rtx H = gen_reg_rtx (DImode);
> >> +	  rtx L = gen_reg_rtx (DImode);

Please don't use all-uppercase variable names, those are for macros.  In
fact, don't use uppercase in variable (and function etc.) names at all,
unless there is a really good reason to.

Just call it "high" and "low", or "hi" and "lo", or something?

> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
> >> @@ -0,0 +1,27 @@
> >> +/* { dg-do run } */
> >> +/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */
> >
> > Why do we need power7 here?
> power8/9 are also ok for this case.  Actually, O just want to
> avoid to use new p10 instruction, like "pli", and then selected
> an old arch option.

Why does it need _at least_ p7, is the question (as I understand it).

To prohibit pli etc. you can do -mno-prefixed (which works on all older
CPUs just as well), or skip the test if prefixed insns are enabled, or
scan for the then generated code as well.  The first option is by far
the simplest.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
> @@ -0,1 +1,27 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power8  -save-temps" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */

p8 here makes no sense, we also want good and correct code generated for
older CPUs, so we should not preevent testing on those.  For that reason
you shouldn't use -mcpu= when not needed.  Like here :-)
  
Jiufu Guo Nov. 27, 2022, 1:09 p.m. UTC | #5
Hi Segher!

Thanks for your helpful comments!

Segher Boessenkool <segher@kernel.crashing.org> writes:

> Hi!
>
> On Fri, Nov 25, 2022 at 09:21:21PM +0800, Jiufu Guo wrote:
>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>> > on 2022/9/15 16:30, Jiufu Guo wrote:
>> >> For a complicate 64bit constant, blow is one instruction-sequence to
>> >> build:
>> >> 	lis 9,0x800a
>> >> 	ori 9,9,0xabcd
>> >> 	sldi 9,9,32
>> >> 	oris 9,9,0xc167
>> >> 	ori 9,9,0xfa16
>> >> 
>> >> while we can also use below sequence to build:
>> >> 	lis 9,0xc167
>> >> 	lis 10,0x800a
>> >> 	ori 9,9,0xfa16
>> >> 	ori 10,10,0xabcd
>> >> 	rldimi 9,10,32,0
>> >> This sequence is using 2 registers to build high and low part firstly,
>> >> and then merge them.
>> >> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
>> >> register with potential register pressure).
>
> And crucially this patch only uses two registers if can_create_pseudo_p.
> Please mention that.
OK.
>
>> >> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
>> >> 	constant build.
>
> If you don't give details of what this does, just say "Update." please.
> But update to what?
>
> "Generate more parallel code if can_create_pseudo_p." maybe?
Thanks.
>
>> >> +	  rtx H = gen_reg_rtx (DImode);
>> >> +	  rtx L = gen_reg_rtx (DImode);
>
> Please don't use all-uppercase variable names, those are for macros.  In
> fact, don't use uppercase in variable (and function etc.) names at all,
> unless there is a really good reason to.
>
> Just call it "high" and "low", or "hi" and "lo", or something?
Thanks.
>
>> >> --- /dev/null
>> >> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> >> @@ -0,0 +1,27 @@
>> >> +/* { dg-do run } */
>> >> +/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */
>> >
>> > Why do we need power7 here?
>> power8/9 are also ok for this case.  Actually, O just want to
>> avoid to use new p10 instruction, like "pli", and then selected
>> an old arch option.
>
> Why does it need _at least_ p7, is the question (as I understand it).
>
> To prohibit pli etc. you can do -mno-prefixed (which works on all older
> CPUs just as well), or skip the test if prefixed insns are enabled, or
> scan for the then generated code as well.  The first option is by far
> the simplest.
Thanks for your suggestion! -mno-prefixed is great, it meets the
intention here.

>
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>> @@ -0,1 +1,27 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 -mdejagnu-cpu=power8  -save-temps" } */
>> +/* { dg-require-effective-target has_arch_ppc64 } */
>
> p8 here makes no sense, we also want good and correct code generated for
> older CPUs, so we should not preevent testing on those.  For that reason
> you shouldn't use -mcpu= when not needed.  Like here :-)
Yeap, thanks!

BR,
Jeff (Jiufu)
  
Kewen.Lin Nov. 28, 2022, 1:50 a.m. UTC | #6
Hi Segher,

on 2022/11/25 23:46, Segher Boessenkool wrote:
> Hi!
> 
> On Fri, Nov 25, 2022 at 09:21:21PM +0800, Jiufu Guo wrote:
>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>> on 2022/9/15 16:30, Jiufu Guo wrote:
>>>> For a complicate 64bit constant, blow is one instruction-sequence to
>>>> build:
>>>> 	lis 9,0x800a
>>>> 	ori 9,9,0xabcd
>>>> 	sldi 9,9,32
>>>> 	oris 9,9,0xc167
>>>> 	ori 9,9,0xfa16
>>>>
>>>> while we can also use below sequence to build:
>>>> 	lis 9,0xc167
>>>> 	lis 10,0x800a
>>>> 	ori 9,9,0xfa16
>>>> 	ori 10,10,0xabcd
>>>> 	rldimi 9,10,32,0
>>>> This sequence is using 2 registers to build high and low part firstly,
>>>> and then merge them.
>>>> In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
>>>> register with potential register pressure).
> 
> And crucially this patch only uses two registers if can_create_pseudo_p.
> Please mention that.
> 
>>>> 	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Update 64bit
>>>> 	constant build.
> 
> If you don't give details of what this does, just say "Update." please.
> But update to what?
> 
> "Generate more parallel code if can_create_pseudo_p." maybe?
> 
>>>> +	  rtx H = gen_reg_rtx (DImode);
>>>> +	  rtx L = gen_reg_rtx (DImode);
> 
> Please don't use all-uppercase variable names, those are for macros.  In
> fact, don't use uppercase in variable (and function etc.) names at all,
> unless there is a really good reason to.
> 
> Just call it "high" and "low", or "hi" and "lo", or something?
> 
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
>>>> @@ -0,0 +1,27 @@
>>>> +/* { dg-do run } */
>>>> +/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */
>>>
>>> Why do we need power7 here?
>> power8/9 are also ok for this case.  Actually, O just want to
>> avoid to use new p10 instruction, like "pli", and then selected
>> an old arch option.
> 
> Why does it need _at least_ p7, is the question (as I understand it).
> 

Yeah, that's what I was intended to ask, since those insns to be scanned
don't actually require Power7 or later.

> To prohibit pli etc. you can do -mno-prefixed (which works on all older
> CPUs just as well), or skip the test if prefixed insns are enabled, or
> scan for the then generated code as well.  The first option is by far
> the simplest.

Yeah, using -mno-prefixed is perfect here, nice!

BR,
Kewen
  
Jiufu Guo Nov. 28, 2022, 2:35 a.m. UTC | #7
Hi Kewen/Segher,

Thanks a lot for your review!

I updated the patch accordingly as below for message/code/testcase:

For a complicate 64bit constant, blow is one instruction-sequence to
build:
	lis 9,0x800a
	ori 9,9,0xabcd
	sldi 9,9,32
	oris 9,9,0xc167
	ori 9,9,0xfa16

while we can also use below sequence to build:
	lis 9,0xc167
	lis 10,0x800a
	ori 9,9,0xfa16
	ori 10,10,0xabcd
	rldimi 9,10,32,0
This sequence is using 2 registers to build high and low part firstly,
and then merge them.

In parallel aspect, this sequence would be faster. (Ofcause, using 1 more
register with potential register pressure).

The instruction sequence with two registers for parallel version can be
generated only if can_create_pseudo_p.  Otherwise, the one register
sequence is generated.

Bootstrap and regtest pass on ppc64le.
Is this ok for trunk?

gcc/ChangeLog:

	* config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Generate
	more parallel code if can_create_pseudo_p.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/parall_5insn_const.c: New test.

---
 gcc/config/rs6000/rs6000.cc                   | 45 +++++++++++--------
 .../gcc.target/powerpc/parall_5insn_const.c   | 27 +++++++++++
 2 files changed, 53 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index eb7ad5e954f..877b314a57a 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -10337,26 +10337,33 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
     }
   else
     {
-      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
-
-      emit_move_insn (copy_rtx (temp),
-		      GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
-      if (ud3 != 0)
-	emit_move_insn (copy_rtx (temp),
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud3)));
+      if (can_create_pseudo_p ())
+	{
+	  /* lis H,U4; ori H,U3; lis L,U2; ori L,U1; rldimi L,H,32,0.  */
+	  rtx high = gen_reg_rtx (DImode);
+	  rtx low = gen_reg_rtx (DImode);
+	  HOST_WIDE_INT num = (ud2 << 16) | ud1;
+	  rs6000_emit_set_long_const (low, (num ^ 0x80000000) - 0x80000000);
+	  num = (ud4 << 16) | ud3;
+	  rs6000_emit_set_long_const (high, (num ^ 0x80000000) - 0x80000000);
+	  emit_insn (gen_rotldi3_insert_3 (dest, high, GEN_INT (32), low,
+					   GEN_INT (0xffffffff)));
+	}
+      else
+	{
+	  /* lis A,U4; ori A,U3; rotl A,32; oris A,U2; ori A,U1.  */
+	  emit_move_insn (dest,
+			  GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
+	  if (ud3 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud3)));
 
-      emit_move_insn (ud2 != 0 || ud1 != 0 ? copy_rtx (temp) : dest,
-		      gen_rtx_ASHIFT (DImode, copy_rtx (temp),
-				      GEN_INT (32)));
-      if (ud2 != 0)
-	emit_move_insn (ud1 != 0 ? copy_rtx (temp) : dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud2 << 16)));
-      if (ud1 != 0)
-	emit_move_insn (dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud1)));
+	  emit_move_insn (dest, gen_rtx_ASHIFT (DImode, dest, GEN_INT (32)));
+	  if (ud2 != 0)
+	    emit_move_insn (dest,
+			    gen_rtx_IOR (DImode, dest, GEN_INT (ud2 << 16)));
+	  if (ud1 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud1)));
+	}
     }
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
new file mode 100644
index 00000000000..e3a9a7264cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
@@ -0,0 +1,27 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mno-prefixed -save-temps" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
+
+void __attribute__ ((noinline)) foo (unsigned long long *a)
+{
+  /* 2 lis + 2 ori + 1 rldimi for each constant.  */
+  *a++ = 0x800aabcdc167fa16ULL;
+  *a++ = 0x7543a876867f616ULL;
+}
+
+long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL};
+int
+main ()
+{
+  long long res[2];
+
+  foo (res);
+  if (__builtin_memcmp (res, A, sizeof (res)) != 0)
+    __builtin_abort ();
+
+  return 0;
+}
  

Patch

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index a656cb32a47..759c6309677 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -10180,26 +10180,33 @@  rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
     }
   else
     {
-      temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
-
-      emit_move_insn (copy_rtx (temp),
-		      GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
-      if (ud3 != 0)
-	emit_move_insn (copy_rtx (temp),
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud3)));
+      if (can_create_pseudo_p ())
+	{
+	  /* lis A,U4; ori A,U3; lis B,U2; ori B,U1; rldimi A,B,32,0.  */
+	  rtx H = gen_reg_rtx (DImode);
+	  rtx L = gen_reg_rtx (DImode);
+	  HOST_WIDE_INT num = (ud2 << 16) | ud1;
+	  rs6000_emit_set_long_const (L, (num ^ 0x80000000) - 0x80000000);
+	  num = (ud4 << 16) | ud3;
+	  rs6000_emit_set_long_const (H, (num ^ 0x80000000) - 0x80000000);
+	  emit_insn (gen_rotldi3_insert_3 (dest, H, GEN_INT (32), L,
+					   GEN_INT (0xffffffff)));
+	}
+      else
+	{
+	  /* lis A, U4; ori A,U3; rotl A,32; oris A,U2; ori A,U1.  */
+	  emit_move_insn (dest,
+			  GEN_INT (((ud4 << 16) ^ 0x80000000) - 0x80000000));
+	  if (ud3 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud3)));
 
-      emit_move_insn (ud2 != 0 || ud1 != 0 ? copy_rtx (temp) : dest,
-		      gen_rtx_ASHIFT (DImode, copy_rtx (temp),
-				      GEN_INT (32)));
-      if (ud2 != 0)
-	emit_move_insn (ud1 != 0 ? copy_rtx (temp) : dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud2 << 16)));
-      if (ud1 != 0)
-	emit_move_insn (dest,
-			gen_rtx_IOR (DImode, copy_rtx (temp),
-				     GEN_INT (ud1)));
+	  emit_move_insn (dest, gen_rtx_ASHIFT (DImode, dest, GEN_INT (32)));
+	  if (ud2 != 0)
+	    emit_move_insn (dest,
+			    gen_rtx_IOR (DImode, dest, GEN_INT (ud2 << 16)));
+	  if (ud1 != 0)
+	    emit_move_insn (dest, gen_rtx_IOR (DImode, dest, GEN_INT (ud1)));
+	}
     }
 }
 
diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
new file mode 100644
index 00000000000..ed8ccc73378
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c
@@ -0,0 +1,27 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7  -save-temps" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+/* { dg-final { scan-assembler-times {\mlis\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mori\M} 4 } } */
+/* { dg-final { scan-assembler-times {\mrldimi\M} 2 } } */
+
+void __attribute__ ((noinline)) foo (unsigned long long *a)
+{
+  /* 2lis+2ori+1rldimi for each constant.  */
+  *a++ = 0x800aabcdc167fa16ULL;
+  *a++ = 0x7543a876867f616ULL;
+}
+
+long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL};
+int
+main ()
+{
+  long long res[2];
+
+  foo (res);
+  if (__builtin_memcmp (res, A, sizeof (res)) != 0)
+    __builtin_abort ();
+
+  return 0;
+}