RISC-V: optimize stack manipulation in save-restore

Message ID 20221130083717.14438-1-gaofei@eswincomputing.com
State Accepted
Headers
Series RISC-V: optimize stack manipulation in save-restore |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Fei Gao Nov. 30, 2022, 8:37 a.m. UTC
  The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled,
and also a much clear logic for save-restore stack manipulation.

before patch:
	bar:
		call	t0,__riscv_save_4
		addi	sp,sp,-64
		...
		li	t0,-12288
		addi	t0,t0,-1968 # optimized out after patch
		add	sp,sp,t0 # prologue
		...
		li	t0,12288 # epilogue
		addi	t0,t0,2000 # optimized out after patch
		add	sp,sp,t0
		...
		addi	sp,sp,32
		tail	__riscv_restore_4

after patch:
	bar:
		call	t0,__riscv_save_4
		addi	sp,sp,-2032
		...
		li	t0,-12288
		add	sp,sp,t0 # prologue
		...
		li	t0,12288 # epilogue
		add	sp,sp,t0
		...
		addi	sp,sp,2032
		tail	__riscv_restore_4

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_first_stack_step): add a new function parameter remaining_size.
        (riscv_compute_frame_info): adapt new riscv_first_stack_step interface.
        (riscv_expand_prologue): consider save-restore in stack allocation.
        (riscv_expand_epilogue): consider save-restore in stack deallocation.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/stack_save_restore.c: New test.
---
 gcc/config/riscv/riscv.cc                     | 58 ++++++++++---------
 .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++
 2 files changed, 70 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c
  

Comments

Palmer Dabbelt Nov. 30, 2022, 10:50 p.m. UTC | #1
On Wed, 30 Nov 2022 00:37:17 PST (-0800), gaofei@eswincomputing.com wrote:
> The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
> This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled,
> and also a much clear logic for save-restore stack manipulation.
>
> before patch:
> 	bar:
> 		call	t0,__riscv_save_4
> 		addi	sp,sp,-64
> 		...
> 		li	t0,-12288
> 		addi	t0,t0,-1968 # optimized out after patch
> 		add	sp,sp,t0 # prologue
> 		...
> 		li	t0,12288 # epilogue
> 		addi	t0,t0,2000 # optimized out after patch
> 		add	sp,sp,t0
> 		...
> 		addi	sp,sp,32
> 		tail	__riscv_restore_4
>
> after patch:
> 	bar:
> 		call	t0,__riscv_save_4
> 		addi	sp,sp,-2032
> 		...
> 		li	t0,-12288
> 		add	sp,sp,t0 # prologue
> 		...
> 		li	t0,12288 # epilogue
> 		add	sp,sp,t0
> 		...
> 		addi	sp,sp,2032
> 		tail	__riscv_restore_4
>
> gcc/ChangeLog:
>
>         * config/riscv/riscv.cc (riscv_first_stack_step): add a new function parameter remaining_size.
>         (riscv_compute_frame_info): adapt new riscv_first_stack_step interface.
>         (riscv_expand_prologue): consider save-restore in stack allocation.
>         (riscv_expand_epilogue): consider save-restore in stack deallocation.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/stack_save_restore.c: New test.
> ---
>  gcc/config/riscv/riscv.cc                     | 58 ++++++++++---------
>  .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++
>  2 files changed, 70 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c

I guess with the RISC-V backend still being open for things as big as 
the V port we should probably be taking code like this as well?  I 
wouldn't be opposed to making an exception for the V code and holding 
everything else back, though.

> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 05bdba5ab4d..9e92e729a5f 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -4634,7 +4634,7 @@ riscv_save_libcall_count (unsigned mask)
>     They decrease stack_pointer_rtx but leave frame_pointer_rtx and
>     hard_frame_pointer_rtx unchanged.  */
>
> -static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame);
> +static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size);
>
>  /* Handle stack align for poly_int.  */
>  static poly_int64
> @@ -4663,7 +4663,7 @@ riscv_compute_frame_info (void)
>       save/restore t0.  We check for this before clearing the frame struct.  */
>    if (cfun->machine->interrupt_handler_p)
>      {
> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
>        if (! POLY_SMALL_OPERAND_P ((frame->total_size - step1)))
>  	interrupt_save_prologue_temp = true;
>      }
> @@ -4913,31 +4913,31 @@ riscv_restore_reg (rtx reg, rtx mem)
>     without adding extra instructions.  */
>
>  static HOST_WIDE_INT
> -riscv_first_stack_step (struct riscv_frame_info *frame)
> +riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size)
>  {
> -  HOST_WIDE_INT frame_total_constant_size;
> -  if (!frame->total_size.is_constant ())
> -    frame_total_constant_size
> -      = riscv_stack_align (frame->total_size.coeffs[0])
> -	- riscv_stack_align (frame->total_size.coeffs[1]);
> +  HOST_WIDE_INT remaining_const_size;
> +  if (!remaining_size.is_constant ())
> +    remaining_const_size
> +      = riscv_stack_align (remaining_size.coeffs[0])
> +	- riscv_stack_align (remaining_size.coeffs[1]);

The alignment looks off here, at least in the email.  Worth fixing it up 
if you're touching the lines anyway.

>    else
> -    frame_total_constant_size = frame->total_size.to_constant ();
> +    remaining_const_size = remaining_size.to_constant ();
>
> -  if (SMALL_OPERAND (frame_total_constant_size))
> -    return frame_total_constant_size;
> +  if (SMALL_OPERAND (remaining_const_size))
> +    return remaining_const_size;
>
>    HOST_WIDE_INT min_first_step =
> -    RISCV_STACK_ALIGN ((frame->total_size - frame->frame_pointer_offset).to_constant());
> +    RISCV_STACK_ALIGN ((remaining_size - frame->frame_pointer_offset).to_constant());
>    HOST_WIDE_INT max_first_step = IMM_REACH / 2 - PREFERRED_STACK_BOUNDARY / 8;
> -  HOST_WIDE_INT min_second_step = frame_total_constant_size - max_first_step;
> +  HOST_WIDE_INT min_second_step = remaining_const_size - max_first_step;
>    gcc_assert (min_first_step <= max_first_step);
>
>    /* As an optimization, use the least-significant bits of the total frame
>       size, so that the second adjustment step is just LUI + ADD.  */
>    if (!SMALL_OPERAND (min_second_step)
> -      && frame_total_constant_size % IMM_REACH < IMM_REACH / 2
> -      && frame_total_constant_size % IMM_REACH >= min_first_step)
> -    return frame_total_constant_size % IMM_REACH;
> +      && remaining_const_size % IMM_REACH < IMM_REACH / 2
> +      && remaining_const_size % IMM_REACH >= min_first_step)
> +    return remaining_const_size % IMM_REACH;

Looks like this entire frame->total_size -> remaining_size conversion 
could be done as an independent patch that would change no 
functionality, that's always a nice way to do things as it makes the 
code easier to read.

I spent a bit poking around here and nothing wrong is jumping out, but 
trying to keep all these offset differences in my head is a bit tricky.  
If you have the time to refactor this to be easier to read that'd be 
great, otherwise hopefully I (or someone else) will have the time to 
take a look -- probably not today on my end, though, as I've got some 
Linux backlog to look at.

Thanks!

>    if (TARGET_RVC)
>      {
> @@ -5037,9 +5037,7 @@ riscv_expand_prologue (void)
>    /* Save the registers.  */
>    if ((frame->mask | frame->fmask) != 0)
>      {
> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
> -      if (size.is_constant ())
> -	step1 = MIN (size.to_constant(), step1);
> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, size);
>
>        insn = gen_add3_insn (stack_pointer_rtx,
>  			    stack_pointer_rtx,
> @@ -5142,6 +5140,8 @@ riscv_expand_epilogue (int style)
>    HOST_WIDE_INT step2 = 0;
>    bool use_restore_libcall = ((style == NORMAL_RETURN)
>  			      && riscv_use_save_libcall (frame));
> +  unsigned libcall_size = use_restore_libcall ?
> +                            frame->save_libcall_adjustment : 0;
>    rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
>    rtx insn;
>
> @@ -5212,13 +5212,18 @@ riscv_expand_epilogue (int style)
>        REG_NOTES (insn) = dwarf;
>      }
>
> +  if (use_restore_libcall)
> +    frame->mask = 0; /* Temporarily fib for GPRs.  */
> +
>    /* If we need to restore registers, deallocate as much stack as
>       possible in the second step without going out of range.  */
>    if ((frame->mask | frame->fmask) != 0)
> -    {
> -      step2 = riscv_first_stack_step (frame);
> -      step1 -= step2;
> -    }
> +    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
> +
> +  if (use_restore_libcall)
> +    frame->mask = mask; /* Undo the above fib.  */
> +
> +  step1 -= step2 + libcall_size;
>
>    /* Set TARGET to BASE + STEP1.  */
>    if (known_gt (step1, 0))
> @@ -5272,15 +5277,12 @@ riscv_expand_epilogue (int style)
>      frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
>
>    /* Restore the registers.  */
> -  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
> +  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
> +                            riscv_restore_reg,
>  			    true, style == EXCEPTION_RETURN);
>
>    if (use_restore_libcall)
> -    {
>        frame->mask = mask; /* Undo the above fib.  */
> -      gcc_assert (step2 >= frame->save_libcall_adjustment);
> -      step2 -= frame->save_libcall_adjustment;
> -    }
>
>    if (need_barrier_p)
>      riscv_emit_stack_tie ();
> diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
> new file mode 100644
> index 00000000000..4695ef9469a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +char my_getchar();
> +float getf();
> +
> +/*
> +**bar:
> +**	call	t0,__riscv_save_4
> +**	addi	sp,sp,-2032
> +**	...
> +**	li	t0,-12288
> +**	add	sp,sp,t0
> +**	...
> +**	li	t0,12288
> +**	add	sp,sp,t0
> +**	...
> +**	addi	sp,sp,2032
> +**	tail	__riscv_restore_4
> +*/

The test needs to actually check this, it can't just be manual.

> +int bar()
> +{
> +  float volatile farray[3568];
> +
> +  float sum = 0;
> +  float f1 = getf();
> +  float f2 = getf();
> +  float f3 = getf();
> +  float f4 = getf();
> +
> +  for (int i = 0; i < 3568; i++)
> +  {
> +    farray[i] = my_getchar() * 1.2;
> +    sum += farray[i];
> +  }
> +
> +  return sum + f1 + f2 + f3 + f4;
> +}
> +
  
Fei Gao Dec. 1, 2022, 3:07 a.m. UTC | #2
On 2022-12-01 06:50  Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
>On Wed, 30 Nov 2022 00:37:17 PST (-0800), gaofei@eswincomputing.com wrote:
>> The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
>> This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled,
>> and also a much clear logic for save-restore stack manipulation.
>>
>> before patch:
>> bar:
>> call	t0,__riscv_save_4
>> addi	sp,sp,-64
>> ...
>> li	t0,-12288
>> addi	t0,t0,-1968 # optimized out after patch
>> add	sp,sp,t0 # prologue
>> ...
>> li	t0,12288 # epilogue
>> addi	t0,t0,2000 # optimized out after patch
>> add	sp,sp,t0
>> ...
>> addi	sp,sp,32
>> tail	__riscv_restore_4
>>
>> after patch:
>> bar:
>> call	t0,__riscv_save_4
>> addi	sp,sp,-2032
>> ...
>> li	t0,-12288
>> add	sp,sp,t0 # prologue
>> ...
>> li	t0,12288 # epilogue
>> add	sp,sp,t0
>> ...
>> addi	sp,sp,2032
>> tail	__riscv_restore_4
>>
>> gcc/ChangeLog:
>>
>>         * config/riscv/riscv.cc (riscv_first_stack_step): add a new function parameter remaining_size.
>>         (riscv_compute_frame_info): adapt new riscv_first_stack_step interface.
>>         (riscv_expand_prologue): consider save-restore in stack allocation.
>>         (riscv_expand_epilogue): consider save-restore in stack deallocation.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/riscv/stack_save_restore.c: New test.
>> ---
>>  gcc/config/riscv/riscv.cc                     | 58 ++++++++++---------
>>  .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++
>>  2 files changed, 70 insertions(+), 28 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>
>I guess with the RISC-V backend still being open for things as big as
>the V port we should probably be taking code like this as well?  I
>wouldn't be opposed to making an exception for the V code and holding
>everything else back, though.
>
>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>> index 05bdba5ab4d..9e92e729a5f 100644
>> --- a/gcc/config/riscv/riscv.cc
>> +++ b/gcc/config/riscv/riscv.cc
>> @@ -4634,7 +4634,7 @@ riscv_save_libcall_count (unsigned mask)
>>     They decrease stack_pointer_rtx but leave frame_pointer_rtx and
>>     hard_frame_pointer_rtx unchanged.  */
>>
>> -static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame);
>> +static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size);
>>
>>  /* Handle stack align for poly_int.  */
>>  static poly_int64
>> @@ -4663,7 +4663,7 @@ riscv_compute_frame_info (void)
>>       save/restore t0.  We check for this before clearing the frame struct.  */
>>    if (cfun->machine->interrupt_handler_p)
>>      {
>> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
>> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
>>        if (! POLY_SMALL_OPERAND_P ((frame->total_size - step1)))
>>  interrupt_save_prologue_temp = true;
>>      }
>> @@ -4913,31 +4913,31 @@ riscv_restore_reg (rtx reg, rtx mem)
>>     without adding extra instructions.  */
>>
>>  static HOST_WIDE_INT
>> -riscv_first_stack_step (struct riscv_frame_info *frame)
>> +riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size)
>>  {
>> -  HOST_WIDE_INT frame_total_constant_size;
>> -  if (!frame->total_size.is_constant ())
>> -    frame_total_constant_size
>> -      = riscv_stack_align (frame->total_size.coeffs[0])
>> -	- riscv_stack_align (frame->total_size.coeffs[1]);
>> +  HOST_WIDE_INT remaining_const_size;
>> +  if (!remaining_size.is_constant ())
>> +    remaining_const_size
>> +      = riscv_stack_align (remaining_size.coeffs[0])
>> +	- riscv_stack_align (remaining_size.coeffs[1]);
>
>The alignment looks off here, at least in the email.  Worth fixing it up
>if you're touching the lines anyway. 

Sure, i will change RISCV_STACK_ALIGN into riscv_stack_align.

>
>>    else
>> -    frame_total_constant_size = frame->total_size.to_constant ();
>> +    remaining_const_size = remaining_size.to_constant ();
>>
>> -  if (SMALL_OPERAND (frame_total_constant_size))
>> -    return frame_total_constant_size;
>> +  if (SMALL_OPERAND (remaining_const_size))
>> +    return remaining_const_size;
>>
>>    HOST_WIDE_INT min_first_step =
>> -    RISCV_STACK_ALIGN ((frame->total_size - frame->frame_pointer_offset).to_constant());
>> +    RISCV_STACK_ALIGN ((remaining_size - frame->frame_pointer_offset).to_constant());
>>    HOST_WIDE_INT max_first_step = IMM_REACH / 2 - PREFERRED_STACK_BOUNDARY / 8;
>> -  HOST_WIDE_INT min_second_step = frame_total_constant_size - max_first_step;
>> +  HOST_WIDE_INT min_second_step = remaining_const_size - max_first_step;
>>    gcc_assert (min_first_step <= max_first_step);
>>
>>    /* As an optimization, use the least-significant bits of the total frame
>>       size, so that the second adjustment step is just LUI + ADD.  */
>>    if (!SMALL_OPERAND (min_second_step)
>> -      && frame_total_constant_size % IMM_REACH < IMM_REACH / 2
>> -      && frame_total_constant_size % IMM_REACH >= min_first_step)
>> -    return frame_total_constant_size % IMM_REACH;
>> +      && remaining_const_size % IMM_REACH < IMM_REACH / 2
>> +      && remaining_const_size % IMM_REACH >= min_first_step)
>> +    return remaining_const_size % IMM_REACH;
>
>Looks like this entire frame->total_size -> remaining_size conversion
>could be done as an independent patch that would change no
>functionality, that's always a nice way to do things as it makes the
>code easier to read. 

Sure, i will split this patch.

>
>I spent a bit poking around here and nothing wrong is jumping out, but
>trying to keep all these offset differences in my head is a bit tricky. 
>If you have the time to refactor this to be easier to read that'd be
>great, otherwise hopefully I (or someone else) will have the time to
>take a look -- probably not today on my end, though, as I've got some
>Linux backlog to look at. 

I'll try it. Also i propose to add step0 for pre-allocated stack cases like save-restore lib call, 
and the future Zcmp in Zc* extension if needed. 
So we have a clear logic of manipulate stack step by step.

>
>Thanks!
>
>>    if (TARGET_RVC)
>>      {
>> @@ -5037,9 +5037,7 @@ riscv_expand_prologue (void)
>>    /* Save the registers.  */
>>    if ((frame->mask | frame->fmask) != 0)
>>      {
>> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
>> -      if (size.is_constant ())
>> -	step1 = MIN (size.to_constant(), step1);
>> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, size);
>>
>>        insn = gen_add3_insn (stack_pointer_rtx,
>>      stack_pointer_rtx,
>> @@ -5142,6 +5140,8 @@ riscv_expand_epilogue (int style)
>>    HOST_WIDE_INT step2 = 0;
>>    bool use_restore_libcall = ((style == NORMAL_RETURN)
>>        && riscv_use_save_libcall (frame));
>> +  unsigned libcall_size = use_restore_libcall ?
>> +                            frame->save_libcall_adjustment : 0;
>>    rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
>>    rtx insn;
>>
>> @@ -5212,13 +5212,18 @@ riscv_expand_epilogue (int style)
>>        REG_NOTES (insn) = dwarf;
>>      }
>>
>> +  if (use_restore_libcall)
>> +    frame->mask = 0; /* Temporarily fib for GPRs.  */
>> +
>>    /* If we need to restore registers, deallocate as much stack as
>>       possible in the second step without going out of range.  */
>>    if ((frame->mask | frame->fmask) != 0)
>> -    {
>> -      step2 = riscv_first_stack_step (frame);
>> -      step1 -= step2;
>> -    }
>> +    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
>> +
>> +  if (use_restore_libcall)
>> +    frame->mask = mask; /* Undo the above fib.  */
>> +
>> +  step1 -= step2 + libcall_size;
>>
>>    /* Set TARGET to BASE + STEP1.  */
>>    if (known_gt (step1, 0))
>> @@ -5272,15 +5277,12 @@ riscv_expand_epilogue (int style)
>>      frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
>>
>>    /* Restore the registers.  */
>> -  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
>> +  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
>> +                            riscv_restore_reg,
>>      true, style == EXCEPTION_RETURN);
>>
>>    if (use_restore_libcall)
>> -    {
>>        frame->mask = mask; /* Undo the above fib.  */
>> -      gcc_assert (step2 >= frame->save_libcall_adjustment);
>> -      step2 -= frame->save_libcall_adjustment;
>> -    }
>>
>>    if (need_barrier_p)
>>      riscv_emit_stack_tie ();
>> diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>> new file mode 100644
>> index 00000000000..4695ef9469a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>> @@ -0,0 +1,40 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +char my_getchar();
>> +float getf();
>> +
>> +/*
>> +**bar:
>> +**	call	t0,__riscv_save_4
>> +**	addi	sp,sp,-2032
>> +**	...
>> +**	li	t0,-12288
>> +**	add	sp,sp,t0
>> +**	...
>> +**	li	t0,12288
>> +**	add	sp,sp,t0
>> +**	...
>> +**	addi	sp,sp,2032
>> +**	tail	__riscv_restore_4
>> +*/
>
>The test needs to actually check this, it can't just be manual. 

I didn't get your point. 
The { dg-final { check-function-bodies "**" "" } } matches the output with the expected result above automatically. 
Please let me know your idea. 

Thanks & BR, 
Fei

>
>> +int bar()
>> +{
>> +  float volatile farray[3568];
>> +
>> +  float sum = 0;
>> +  float f1 = getf();
>> +  float f2 = getf();
>> +  float f3 = getf();
>> +  float f4 = getf();
>> +
>> +  for (int i = 0; i < 3568; i++)
>> +  {
>> +    farray[i] = my_getchar() * 1.2;
>> +    sum += farray[i];
>> +  }
>> +
>> +  return sum + f1 + f2 + f3 + f4;
>> +}
>> +
  
Fei Gao Dec. 6, 2022, 1:13 a.m. UTC | #3
Hi Palmer and all, 

I have split the patches and triggerred a new thread.
https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg297206.html

Could you please review at your convenience?

Thanks & BR, 
Fei

On 2022-12-01 11:07  Fei Gao <gaofei@eswincomputing.com> wrote:
>
>On 2022-12-01 06:50  Palmer Dabbelt <palmer@dabbelt.com> wrote:
>>
>>On Wed, 30 Nov 2022 00:37:17 PST (-0800), gaofei@eswincomputing.com wrote:
>>> The stack that save-restore reserves is not well accumulated in stack allocation and deallocation.
>>> This patch allows less instructions to be used in stack allocation and deallocation if save-restore enabled,
>>> and also a much clear logic for save-restore stack manipulation.
>>>
>>> before patch:
>>> bar:
>>> call	t0,__riscv_save_4
>>> addi	sp,sp,-64
>>> ...
>>> li	t0,-12288
>>> addi	t0,t0,-1968 # optimized out after patch
>>> add	sp,sp,t0 # prologue
>>> ...
>>> li	t0,12288 # epilogue
>>> addi	t0,t0,2000 # optimized out after patch
>>> add	sp,sp,t0
>>> ...
>>> addi	sp,sp,32
>>> tail	__riscv_restore_4
>>>
>>> after patch:
>>> bar:
>>> call	t0,__riscv_save_4
>>> addi	sp,sp,-2032
>>> ...
>>> li	t0,-12288
>>> add	sp,sp,t0 # prologue
>>> ...
>>> li	t0,12288 # epilogue
>>> add	sp,sp,t0
>>> ...
>>> addi	sp,sp,2032
>>> tail	__riscv_restore_4
>>>
>>> gcc/ChangeLog:
>>>
>>>         * config/riscv/riscv.cc (riscv_first_stack_step): add a new function parameter remaining_size.
>>>         (riscv_compute_frame_info): adapt new riscv_first_stack_step interface.
>>>         (riscv_expand_prologue): consider save-restore in stack allocation.
>>>         (riscv_expand_epilogue): consider save-restore in stack deallocation.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>>         * gcc.target/riscv/stack_save_restore.c: New test.
>>> ---
>>>  gcc/config/riscv/riscv.cc                     | 58 ++++++++++---------
>>>  .../gcc.target/riscv/stack_save_restore.c     | 40 +++++++++++++
>>>  2 files changed, 70 insertions(+), 28 deletions(-)
>>>  create mode 100644 gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>>
>>I guess with the RISC-V backend still being open for things as big as
>>the V port we should probably be taking code like this as well?  I
>>wouldn't be opposed to making an exception for the V code and holding
>>everything else back, though.
>>
>>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>>> index 05bdba5ab4d..9e92e729a5f 100644
>>> --- a/gcc/config/riscv/riscv.cc
>>> +++ b/gcc/config/riscv/riscv.cc
>>> @@ -4634,7 +4634,7 @@ riscv_save_libcall_count (unsigned mask)
>>>     They decrease stack_pointer_rtx but leave frame_pointer_rtx and
>>>     hard_frame_pointer_rtx unchanged.  */
>>>
>>> -static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame);
>>> +static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size);
>>>
>>>  /* Handle stack align for poly_int.  */
>>>  static poly_int64
>>> @@ -4663,7 +4663,7 @@ riscv_compute_frame_info (void)
>>>       save/restore t0.  We check for this before clearing the frame struct.  */
>>>    if (cfun->machine->interrupt_handler_p)
>>>      {
>>> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
>>> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
>>>        if (! POLY_SMALL_OPERAND_P ((frame->total_size - step1)))
>>>  interrupt_save_prologue_temp = true;
>>>      }
>>> @@ -4913,31 +4913,31 @@ riscv_restore_reg (rtx reg, rtx mem)
>>>     without adding extra instructions.  */
>>>
>>>  static HOST_WIDE_INT
>>> -riscv_first_stack_step (struct riscv_frame_info *frame)
>>> +riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size)
>>>  {
>>> -  HOST_WIDE_INT frame_total_constant_size;
>>> -  if (!frame->total_size.is_constant ())
>>> -    frame_total_constant_size
>>> -      = riscv_stack_align (frame->total_size.coeffs[0])
>>> -	- riscv_stack_align (frame->total_size.coeffs[1]);
>>> +  HOST_WIDE_INT remaining_const_size;
>>> +  if (!remaining_size.is_constant ())
>>> +    remaining_const_size
>>> +      = riscv_stack_align (remaining_size.coeffs[0])
>>> +	- riscv_stack_align (remaining_size.coeffs[1]);
>>
>>The alignment looks off here, at least in the email.  Worth fixing it up
>>if you're touching the lines anyway.
>
>Sure, i will change RISCV_STACK_ALIGN into riscv_stack_align.
>
>>
>>>    else
>>> -    frame_total_constant_size = frame->total_size.to_constant ();
>>> +    remaining_const_size = remaining_size.to_constant ();
>>>
>>> -  if (SMALL_OPERAND (frame_total_constant_size))
>>> -    return frame_total_constant_size;
>>> +  if (SMALL_OPERAND (remaining_const_size))
>>> +    return remaining_const_size;
>>>
>>>    HOST_WIDE_INT min_first_step =
>>> -    RISCV_STACK_ALIGN ((frame->total_size - frame->frame_pointer_offset).to_constant());
>>> +    RISCV_STACK_ALIGN ((remaining_size - frame->frame_pointer_offset).to_constant());
>>>    HOST_WIDE_INT max_first_step = IMM_REACH / 2 - PREFERRED_STACK_BOUNDARY / 8;
>>> -  HOST_WIDE_INT min_second_step = frame_total_constant_size - max_first_step;
>>> +  HOST_WIDE_INT min_second_step = remaining_const_size - max_first_step;
>>>    gcc_assert (min_first_step <= max_first_step);
>>>
>>>    /* As an optimization, use the least-significant bits of the total frame
>>>       size, so that the second adjustment step is just LUI + ADD.  */
>>>    if (!SMALL_OPERAND (min_second_step)
>>> -      && frame_total_constant_size % IMM_REACH < IMM_REACH / 2
>>> -      && frame_total_constant_size % IMM_REACH >= min_first_step)
>>> -    return frame_total_constant_size % IMM_REACH;
>>> +      && remaining_const_size % IMM_REACH < IMM_REACH / 2
>>> +      && remaining_const_size % IMM_REACH >= min_first_step)
>>> +    return remaining_const_size % IMM_REACH;
>>
>>Looks like this entire frame->total_size -> remaining_size conversion
>>could be done as an independent patch that would change no
>>functionality, that's always a nice way to do things as it makes the
>>code easier to read.
>
>Sure, i will split this patch.
>
>>
>>I spent a bit poking around here and nothing wrong is jumping out, but
>>trying to keep all these offset differences in my head is a bit tricky. 
>>If you have the time to refactor this to be easier to read that'd be
>>great, otherwise hopefully I (or someone else) will have the time to
>>take a look -- probably not today on my end, though, as I've got some
>>Linux backlog to look at.
>
>I'll try it. Also i propose to add step0 for pre-allocated stack cases like save-restore lib call,
>and the future Zcmp in Zc* extension if needed.
>So we have a clear logic of manipulate stack step by step.
>
>>
>>Thanks!
>>
>>>    if (TARGET_RVC)
>>>      {
>>> @@ -5037,9 +5037,7 @@ riscv_expand_prologue (void)
>>>    /* Save the registers.  */
>>>    if ((frame->mask | frame->fmask) != 0)
>>>      {
>>> -      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
>>> -      if (size.is_constant ())
>>> -	step1 = MIN (size.to_constant(), step1);
>>> +      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, size);
>>>
>>>        insn = gen_add3_insn (stack_pointer_rtx,
>>>      stack_pointer_rtx,
>>> @@ -5142,6 +5140,8 @@ riscv_expand_epilogue (int style)
>>>    HOST_WIDE_INT step2 = 0;
>>>    bool use_restore_libcall = ((style == NORMAL_RETURN)
>>>        && riscv_use_save_libcall (frame));
>>> +  unsigned libcall_size = use_restore_libcall ?
>>> +                            frame->save_libcall_adjustment : 0;
>>>    rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
>>>    rtx insn;
>>>
>>> @@ -5212,13 +5212,18 @@ riscv_expand_epilogue (int style)
>>>        REG_NOTES (insn) = dwarf;
>>>      }
>>>
>>> +  if (use_restore_libcall)
>>> +    frame->mask = 0; /* Temporarily fib for GPRs.  */
>>> +
>>>    /* If we need to restore registers, deallocate as much stack as
>>>       possible in the second step without going out of range.  */
>>>    if ((frame->mask | frame->fmask) != 0)
>>> -    {
>>> -      step2 = riscv_first_stack_step (frame);
>>> -      step1 -= step2;
>>> -    }
>>> +    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
>>> +
>>> +  if (use_restore_libcall)
>>> +    frame->mask = mask; /* Undo the above fib.  */
>>> +
>>> +  step1 -= step2 + libcall_size;
>>>
>>>    /* Set TARGET to BASE + STEP1.  */
>>>    if (known_gt (step1, 0))
>>> @@ -5272,15 +5277,12 @@ riscv_expand_epilogue (int style)
>>>      frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
>>>
>>>    /* Restore the registers.  */
>>> -  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
>>> +  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
>>> +                            riscv_restore_reg,
>>>      true, style == EXCEPTION_RETURN);
>>>
>>>    if (use_restore_libcall)
>>> -    {
>>>        frame->mask = mask; /* Undo the above fib.  */
>>> -      gcc_assert (step2 >= frame->save_libcall_adjustment);
>>> -      step2 -= frame->save_libcall_adjustment;
>>> -    }
>>>
>>>    if (need_barrier_p)
>>>      riscv_emit_stack_tie ();
>>> diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>>> new file mode 100644
>>> index 00000000000..4695ef9469a
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
>>> @@ -0,0 +1,40 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +
>>> +char my_getchar();
>>> +float getf();
>>> +
>>> +/*
>>> +**bar:
>>> +**	call	t0,__riscv_save_4
>>> +**	addi	sp,sp,-2032
>>> +**	...
>>> +**	li	t0,-12288
>>> +**	add	sp,sp,t0
>>> +**	...
>>> +**	li	t0,12288
>>> +**	add	sp,sp,t0
>>> +**	...
>>> +**	addi	sp,sp,2032
>>> +**	tail	__riscv_restore_4
>>> +*/
>>
>>The test needs to actually check this, it can't just be manual.
>
>I didn't get your point.
>The { dg-final { check-function-bodies "**" "" } } matches the output with the expected result above automatically. 
>Please let me know your idea. 
>
>Thanks & BR, 
>Fei
>
>>
>>> +int bar()
>>> +{
>>> +  float volatile farray[3568];
>>> +
>>> +  float sum = 0;
>>> +  float f1 = getf();
>>> +  float f2 = getf();
>>> +  float f3 = getf();
>>> +  float f4 = getf();
>>> +
>>> +  for (int i = 0; i < 3568; i++)
>>> +  {
>>> +    farray[i] = my_getchar() * 1.2;
>>> +    sum += farray[i];
>>> +  }
>>> +
>>> +  return sum + f1 + f2 + f3 + f4;
>>> +}
>>> +
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 05bdba5ab4d..9e92e729a5f 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4634,7 +4634,7 @@  riscv_save_libcall_count (unsigned mask)
    They decrease stack_pointer_rtx but leave frame_pointer_rtx and
    hard_frame_pointer_rtx unchanged.  */
 
-static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame);
+static HOST_WIDE_INT riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size);
 
 /* Handle stack align for poly_int.  */
 static poly_int64
@@ -4663,7 +4663,7 @@  riscv_compute_frame_info (void)
      save/restore t0.  We check for this before clearing the frame struct.  */
   if (cfun->machine->interrupt_handler_p)
     {
-      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
+      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, frame->total_size);
       if (! POLY_SMALL_OPERAND_P ((frame->total_size - step1)))
 	interrupt_save_prologue_temp = true;
     }
@@ -4913,31 +4913,31 @@  riscv_restore_reg (rtx reg, rtx mem)
    without adding extra instructions.  */
 
 static HOST_WIDE_INT
-riscv_first_stack_step (struct riscv_frame_info *frame)
+riscv_first_stack_step (struct riscv_frame_info *frame, poly_int64 remaining_size)
 {
-  HOST_WIDE_INT frame_total_constant_size;
-  if (!frame->total_size.is_constant ())
-    frame_total_constant_size
-      = riscv_stack_align (frame->total_size.coeffs[0])
-	- riscv_stack_align (frame->total_size.coeffs[1]);
+  HOST_WIDE_INT remaining_const_size;
+  if (!remaining_size.is_constant ())
+    remaining_const_size
+      = riscv_stack_align (remaining_size.coeffs[0])
+	- riscv_stack_align (remaining_size.coeffs[1]);
   else
-    frame_total_constant_size = frame->total_size.to_constant ();
+    remaining_const_size = remaining_size.to_constant ();
 
-  if (SMALL_OPERAND (frame_total_constant_size))
-    return frame_total_constant_size;
+  if (SMALL_OPERAND (remaining_const_size))
+    return remaining_const_size;
 
   HOST_WIDE_INT min_first_step =
-    RISCV_STACK_ALIGN ((frame->total_size - frame->frame_pointer_offset).to_constant());
+    RISCV_STACK_ALIGN ((remaining_size - frame->frame_pointer_offset).to_constant());
   HOST_WIDE_INT max_first_step = IMM_REACH / 2 - PREFERRED_STACK_BOUNDARY / 8;
-  HOST_WIDE_INT min_second_step = frame_total_constant_size - max_first_step;
+  HOST_WIDE_INT min_second_step = remaining_const_size - max_first_step;
   gcc_assert (min_first_step <= max_first_step);
 
   /* As an optimization, use the least-significant bits of the total frame
      size, so that the second adjustment step is just LUI + ADD.  */
   if (!SMALL_OPERAND (min_second_step)
-      && frame_total_constant_size % IMM_REACH < IMM_REACH / 2
-      && frame_total_constant_size % IMM_REACH >= min_first_step)
-    return frame_total_constant_size % IMM_REACH;
+      && remaining_const_size % IMM_REACH < IMM_REACH / 2
+      && remaining_const_size % IMM_REACH >= min_first_step)
+    return remaining_const_size % IMM_REACH;
 
   if (TARGET_RVC)
     {
@@ -5037,9 +5037,7 @@  riscv_expand_prologue (void)
   /* Save the registers.  */
   if ((frame->mask | frame->fmask) != 0)
     {
-      HOST_WIDE_INT step1 = riscv_first_stack_step (frame);
-      if (size.is_constant ())
-	step1 = MIN (size.to_constant(), step1);
+      HOST_WIDE_INT step1 = riscv_first_stack_step (frame, size);
 
       insn = gen_add3_insn (stack_pointer_rtx,
 			    stack_pointer_rtx,
@@ -5142,6 +5140,8 @@  riscv_expand_epilogue (int style)
   HOST_WIDE_INT step2 = 0;
   bool use_restore_libcall = ((style == NORMAL_RETURN)
 			      && riscv_use_save_libcall (frame));
+  unsigned libcall_size = use_restore_libcall ?
+                            frame->save_libcall_adjustment : 0;
   rtx ra = gen_rtx_REG (Pmode, RETURN_ADDR_REGNUM);
   rtx insn;
 
@@ -5212,13 +5212,18 @@  riscv_expand_epilogue (int style)
       REG_NOTES (insn) = dwarf;
     }
 
+  if (use_restore_libcall)
+    frame->mask = 0; /* Temporarily fib for GPRs.  */
+
   /* If we need to restore registers, deallocate as much stack as
      possible in the second step without going out of range.  */
   if ((frame->mask | frame->fmask) != 0)
-    {
-      step2 = riscv_first_stack_step (frame);
-      step1 -= step2;
-    }
+    step2 = riscv_first_stack_step (frame, frame->total_size - libcall_size);
+
+  if (use_restore_libcall)
+    frame->mask = mask; /* Undo the above fib.  */
+
+  step1 -= step2 + libcall_size;
 
   /* Set TARGET to BASE + STEP1.  */
   if (known_gt (step1, 0))
@@ -5272,15 +5277,12 @@  riscv_expand_epilogue (int style)
     frame->mask = 0; /* Temporarily fib that we need not save GPRs.  */
 
   /* Restore the registers.  */
-  riscv_for_each_saved_reg (frame->total_size - step2, riscv_restore_reg,
+  riscv_for_each_saved_reg (frame->total_size - step2 - libcall_size,
+                            riscv_restore_reg,
 			    true, style == EXCEPTION_RETURN);
 
   if (use_restore_libcall)
-    {
       frame->mask = mask; /* Undo the above fib.  */
-      gcc_assert (step2 >= frame->save_libcall_adjustment);
-      step2 -= frame->save_libcall_adjustment;
-    }
 
   if (need_barrier_p)
     riscv_emit_stack_tie ();
diff --git a/gcc/testsuite/gcc.target/riscv/stack_save_restore.c b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
new file mode 100644
index 00000000000..4695ef9469a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/stack_save_restore.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=rv32imafc -mabi=ilp32f -msave-restore -O2 -fno-schedule-insns -fno-schedule-insns2 -fno-unroll-loops -fno-peel-loops" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+char my_getchar();
+float getf();
+
+/*
+**bar:
+**	call	t0,__riscv_save_4
+**	addi	sp,sp,-2032
+**	...
+**	li	t0,-12288
+**	add	sp,sp,t0
+**	...
+**	li	t0,12288
+**	add	sp,sp,t0
+**	...
+**	addi	sp,sp,2032
+**	tail	__riscv_restore_4
+*/
+int bar()
+{
+  float volatile farray[3568];
+
+  float sum = 0;
+  float f1 = getf();
+  float f2 = getf();
+  float f3 = getf();
+  float f4 = getf();
+
+  for (int i = 0; i < 3568; i++)
+  {
+    farray[i] = my_getchar() * 1.2;
+    sum += farray[i];
+  }
+
+  return sum + f1 + f2 + f3 + f4;
+}
+