Enable shrink wrapping for the RISC-V target.

Message ID 20220906103902.2719585-1-manolis.tsamis@vrull.eu
State New, archived
Headers
Series Enable shrink wrapping for the RISC-V target. |

Commit Message

Manolis Tsamis Sept. 6, 2022, 10:39 a.m. UTC
  This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
enable separate shrink wrapping for function prologues/epilogues in
RISC-V.

Tested against SPEC CPU 2017, this change always has a net-positive
effect on the dynamic instruction count.  See the following table for
the breakdown on how this reduces the number of dynamic instructions
per workload on a like-for-like (i.e., same config file; suppressing
shrink-wrapping with -fno-shrink-wrap):

                             # dynamic instructions
                w/o shrink-wrap   w/ shrink-wrap      reduction
500.perlbench_r   1265716786593    1262156218578     3560568015   0.28%
500.perlbench_r    779224795689     765337009025    13887786664   1.78%
500.perlbench_r    724087331471     711307152522    12780178949   1.77%
502.gcc_r          204259864844     194517006339     9742858505   4.77%
502.gcc_r          244047794302     231555834722    12491959580   5.12%
502.gcc_r          230896069400     221877703011     9018366389   3.91%
502.gcc_r          192130616624     183856450605     8274166019   4.31%
502.gcc_r          258875074079     247756203226    11118870853   4.30%
505.mcf_r          662653430325     660678680547     1974749778   0.30%
520.omnetpp_r      985114167068     934191310154    50922856914   5.17%
523.xalancbmk_r    927037633578     921688937650     5348695928   0.58%
525.x264_r         490953958454     490565583447      388375007   0.08%
525.x264_r        1994662294421    1993171932425     1490361996   0.07%
525.x264_r        1897617120450    1896062750609     1554369841   0.08%
531.deepsjeng_r   1695189878907    1669304130411    25885748496   1.53%
541.leela_r       1925941222222    1897900861198    28040361024   1.46%
548.exchange2_r   2073816227944    2073816226729           1215   0.00%
557.xz_r           379572090003     379057409041      514680962   0.14%
557.xz_r           953117469352     952680431430      437037922   0.05%
557.xz_r           536859579650     536456690164      402889486   0.08%
                 18421773405376   18223938521833   197834883543   1.07%  totals

Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>

gcc/ChangeLog:

	* config/riscv/riscv.cc (struct machine_function): Add array to store
	register wrapping information.
	(riscv_for_each_saved_reg): Skip registers that are wrapped separetely.
	(riscv_get_separate_components): New function.
	(riscv_components_for_bb): Likewise.
	(riscv_disqualify_components): Likewise.
	(riscv_process_components): Likewise.
	(riscv_emit_prologue_components): Likewise.
	(riscv_emit_epilogue_components): Likewise.
	(riscv_set_handled_components): Likewise.
	(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
	(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
	(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
	(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/shrink-wrap-1.c: New test.

---

 gcc/config/riscv/riscv.cc                     | 187 +++++++++++++++++-
 .../gcc.target/riscv/shrink-wrap-1.c          |  25 +++
 2 files changed, 210 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
  

Comments

Palmer Dabbelt Oct. 2, 2022, 8:32 p.m. UTC | #1
On Tue, 06 Sep 2022 03:39:02 PDT (-0700), manolis.tsamis@vrull.eu wrote:
> This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
> enable separate shrink wrapping for function prologues/epilogues in
> RISC-V.
>
> Tested against SPEC CPU 2017, this change always has a net-positive
> effect on the dynamic instruction count.  See the following table for
> the breakdown on how this reduces the number of dynamic instructions
> per workload on a like-for-like (i.e., same config file; suppressing
> shrink-wrapping with -fno-shrink-wrap):

Does this also pass the regression tests?

(there's also some comments on the code in-line)

>
>                              # dynamic instructions
>                 w/o shrink-wrap   w/ shrink-wrap      reduction
> 500.perlbench_r   1265716786593    1262156218578     3560568015   0.28%
> 500.perlbench_r    779224795689     765337009025    13887786664   1.78%
> 500.perlbench_r    724087331471     711307152522    12780178949   1.77%
> 502.gcc_r          204259864844     194517006339     9742858505   4.77%
> 502.gcc_r          244047794302     231555834722    12491959580   5.12%
> 502.gcc_r          230896069400     221877703011     9018366389   3.91%
> 502.gcc_r          192130616624     183856450605     8274166019   4.31%
> 502.gcc_r          258875074079     247756203226    11118870853   4.30%
> 505.mcf_r          662653430325     660678680547     1974749778   0.30%
> 520.omnetpp_r      985114167068     934191310154    50922856914   5.17%
> 523.xalancbmk_r    927037633578     921688937650     5348695928   0.58%
> 525.x264_r         490953958454     490565583447      388375007   0.08%
> 525.x264_r        1994662294421    1993171932425     1490361996   0.07%
> 525.x264_r        1897617120450    1896062750609     1554369841   0.08%
> 531.deepsjeng_r   1695189878907    1669304130411    25885748496   1.53%
> 541.leela_r       1925941222222    1897900861198    28040361024   1.46%
> 548.exchange2_r   2073816227944    2073816226729           1215   0.00%
> 557.xz_r           379572090003     379057409041      514680962   0.14%
> 557.xz_r           953117469352     952680431430      437037922   0.05%
> 557.xz_r           536859579650     536456690164      402889486   0.08%
>                  18421773405376   18223938521833   197834883543   1.07%  totals
>
> Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
>
> gcc/ChangeLog:
>
> 	* config/riscv/riscv.cc (struct machine_function): Add array to store
> 	register wrapping information.
> 	(riscv_for_each_saved_reg): Skip registers that are wrapped separetely.
> 	(riscv_get_separate_components): New function.
> 	(riscv_components_for_bb): Likewise.
> 	(riscv_disqualify_components): Likewise.
> 	(riscv_process_components): Likewise.
> 	(riscv_emit_prologue_components): Likewise.
> 	(riscv_emit_epilogue_components): Likewise.
> 	(riscv_set_handled_components): Likewise.
> 	(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> 	(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> 	(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> 	(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/riscv/shrink-wrap-1.c: New test.
>
> ---
>
>  gcc/config/riscv/riscv.cc                     | 187 +++++++++++++++++-
>  .../gcc.target/riscv/shrink-wrap-1.c          |  25 +++
>  2 files changed, 210 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5a0adffb5ce..3b633149a9a 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "config.h"
>  #include "system.h"
>  #include "coretypes.h"
> +#include "backend.h"
>  #include "tm.h"
>  #include "rtl.h"
>  #include "regs.h"
> @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "optabs.h"
>  #include "bitmap.h"
>  #include "df.h"
> +#include "function-abi.h"
>  #include "diagnostic.h"
>  #include "builtins.h"
>  #include "predict.h"
> @@ -147,6 +149,11 @@ struct GTY(())  machine_function {
>
>    /* The current frame information, calculated by riscv_compute_frame_info.  */
>    struct riscv_frame_info frame;
> +
> +  /* The components already handled by separate shrink-wrapping, which should
> +     not be considered by the prologue and epilogue.  */
> +  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
> +
>  };
>
>  /* Information about a single argument.  */
> @@ -4209,7 +4216,7 @@ riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
>    for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
>      if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
>        {
> -	bool handle_reg = TRUE;
> +	bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
>
>  	/* If this is a normal return in a function that calls the eh_return
>  	   builtin, then do not restore the eh return data registers as that
> @@ -4240,9 +4247,11 @@ riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
>    for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
>      if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
>        {
> +	bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
>  	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
>
> -	riscv_save_restore_reg (mode, regno, offset, fn);
> +	if (handle_reg)
> +	  riscv_save_restore_reg (mode, regno, offset, fn);
>  	offset -= GET_MODE_SIZE (mode);
>        }
>  }
> @@ -4667,6 +4676,156 @@ riscv_epilogue_uses (unsigned int regno)
>    return false;
>  }
>
> +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
> +
> +static sbitmap
> +riscv_get_separate_components (void)
> +{
> +  HOST_WIDE_INT offset;
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +
> +  if (riscv_use_save_libcall (&cfun->machine->frame)
> +      || cfun->machine->interrupt_handler_p)

riscv_use_save_libcall() already checks interrupt_handler_p, so that's 
redundant.  That said, I'm not sure riscv_use_save_libcall() is the 
right check here as unless I'm missing something we don't have all those 
other constraints when shrink-wrapping.

Either way, we'll also need to avoid saving for naked functions.

> +    return components;
> +
> +  offset = cfun->machine->frame.gp_sp_offset;
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	if (SMALL_OPERAND (offset))
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
> +	if (SMALL_OPERAND (offset))
> +	  bitmap_set_bit (components, regno);
> +
> +	offset -= GET_MODE_SIZE (mode);
> +      }

It seems kind of clunky to have two copies of all these loops (and we'll 
need a third to make this work with the V stuff), but we've got that 
issue elsewhere in the port so I don't think you need to fix it here 
(though the V stuff will be there for the v2, so you'll need the third 
copy of each loop).

> +
> +  /* Don't mess with the hard frame pointer.  */
> +  if (frame_pointer_needed)
> +    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
> +
> +  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
> +
> +  return components;
> +}
> +
> +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
> +
> +static sbitmap
> +riscv_components_for_bb (basic_block bb)
> +{
> +  bitmap in = DF_LIVE_IN (bb);
> +  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
> +  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
> +
> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> +  bitmap_clear (components);
> +
> +  function_abi_aggregator callee_abis;
> +  rtx_insn *insn;
> +  FOR_BB_INSNS (bb, insn)
> +    if (CALL_P (insn))
> +      callee_abis.note_callee_abi (insn_callee_abi (insn));
> +  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
> +
> +  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +       && !crtl->abi->clobbers_full_reg_p (regno)
> +       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
> +	   || bitmap_bit_p (in, regno)
> +	   || bitmap_bit_p (gen, regno)
> +	   || bitmap_bit_p (kill, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (!fixed_regs[regno]
> +       && !crtl->abi->clobbers_full_reg_p (regno)
> +       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
> +	   || bitmap_bit_p (in, regno)
> +	   || bitmap_bit_p (gen, regno)
> +	   || bitmap_bit_p (kill, regno)))
> +      bitmap_set_bit (components, regno);
> +
> +  return components;
> +}
> +
> +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  */
> +
> +static void
> +riscv_disqualify_components (sbitmap, edge, sbitmap, bool)
> +{
> +  /* Nothing to do for riscv.  */

I think that's true, but when poking around I couldn't figure out how 
saving registers to long stack frames works: we need a temporary register 
to generate the long stack offset, which is easy when we're in the 
pro/epilogue but I haven't figured out if it's handled properly when 
shrink wrapping.

It looks like the aarch64 port side steps the issue by only allowing 
registers that fall within a short offset to be shrink wrapped.  That 
seems like a pretty straight-forward way to handle the problem, but I'm 
not sure how it'd impact performance -- we've already got a lot of 
issues with large stack frames, though, so maybe it's OK to just punt on 
them for now?

Either way, this deserves a test case.  I think it should be possible to 
write one by introducing some register pressure around a 
shrink-wrappable block that needs a long stack offset and making sure 
in-flight registers don't get trashed.

> +}
> +
> +static void
> +riscv_process_components (sbitmap components, bool prologue_p)
> +{
> +  HOST_WIDE_INT offset;
> +  riscv_save_restore_fn fn = prologue_p? riscv_save_reg : riscv_restore_reg;
> +
> +  offset = cfun->machine->frame.gp_sp_offset;
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> +      {
> +	if (bitmap_bit_p (components, regno))
> +	  riscv_save_restore_reg (word_mode, regno, offset, fn);
> +
> +	offset -= UNITS_PER_WORD;
> +      }
> +
> +  offset = cfun->machine->frame.fp_sp_offset;
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> +      {
> +	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> +
> +	if (bitmap_bit_p (components, regno))
> +	  riscv_save_restore_reg (mode, regno, offset, fn);
> +
> +	offset -= GET_MODE_SIZE (mode);
> +      }
> +}
> +
> +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
> +
> +static void
> +riscv_emit_prologue_components (sbitmap components)
> +{
> +  riscv_process_components (components, true);
> +}
> +
> +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
> +
> +static void
> +riscv_emit_epilogue_components (sbitmap components)
> +{
> +  riscv_process_components (components, false);
> +}
> +
> +static void
> +riscv_set_handled_components (sbitmap components)
> +{
> +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> +    if (bitmap_bit_p (components, regno))
> +      cfun->machine->reg_is_wrapped_separately[regno] = true;
> +
> +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> +    if (bitmap_bit_p (components, regno))
> +      cfun->machine->reg_is_wrapped_separately[regno] = true;
> +}
> +
>  /* Return nonzero if this function is known to have a null epilogue.
>     This allows the optimizer to omit jumps to jumps if no stack
>     was created.  */
> @@ -5713,6 +5872,30 @@ riscv_asan_shadow_offset (void)
>  #undef TARGET_FUNCTION_ARG_BOUNDARY
>  #define TARGET_FUNCTION_ARG_BOUNDARY riscv_function_arg_boundary
>
> +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
> +  riscv_get_separate_components
> +
> +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
> +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
> +  riscv_components_for_bb
> +
> +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
> +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
> +  riscv_disqualify_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
> +  riscv_emit_prologue_components
> +
> +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
> +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
> +  riscv_emit_epilogue_components
> +
> +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
> +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
> +  riscv_set_handled_components
> +
>  /* The generic ELF target does not always have TLS support.  */
>  #ifdef HAVE_AS_TLS
>  #undef TARGET_HAVE_TLS
> diff --git a/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
> new file mode 100644
> index 00000000000..b05745968e6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-fshrink-wrap" } */
> +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" } } */
> +
> +void g(void);
> +
> +void f(int x)
> +{
> +	if (x)
> +    {
> +      // Those only need to be saved if x is non-zero; without
> +      // separate shrink-wrapping it would however be saved in all cases.
> +      // Force saving of callee-saved registers
> +      register int s2 asm("18") = x;
> +      register int s3 asm("19") = x;
> +      register int s4 asm("20") = x;
> +      asm("" : : "r"(s2));
> +      asm("" : : "r"(s3));
> +      asm("" : : "r"(s4));
> +      g();
> +    }
> +}
> +
> +// The resulting code should do nothing if x == 0
> +/* { dg-final { scan-assembler "bne\ta0,zero,.*\n.*ret" } } */
  
Manolis Tsamis Oct. 18, 2022, 3:18 p.m. UTC | #2
On Sun, Oct 2, 2022 at 11:32 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 06 Sep 2022 03:39:02 PDT (-0700), manolis.tsamis@vrull.eu wrote:
> > This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
> > enable separate shrink wrapping for function prologues/epilogues in
> > RISC-V.
> >
> > Tested against SPEC CPU 2017, this change always has a net-positive
> > effect on the dynamic instruction count.  See the following table for
> > the breakdown on how this reduces the number of dynamic instructions
> > per workload on a like-for-like (i.e., same config file; suppressing
> > shrink-wrapping with -fno-shrink-wrap):
>
> Does this also pass the regression tests?
>

I've re-ran the testsuite on both rv32/rv64 and could not see any regressions.
However I observed one test case failing on LTO builds because the LTO
section that is generated
during the compilation contains a "t2" string and the testcase tests
for the absence of "t2" as a register (hence a false positive).

> (there's also some comments on the code in-line)
>
> >
> >                              # dynamic instructions
> >                 w/o shrink-wrap   w/ shrink-wrap      reduction
> > 500.perlbench_r   1265716786593    1262156218578     3560568015   0.28%
> > 500.perlbench_r    779224795689     765337009025    13887786664   1.78%
> > 500.perlbench_r    724087331471     711307152522    12780178949   1.77%
> > 502.gcc_r          204259864844     194517006339     9742858505   4.77%
> > 502.gcc_r          244047794302     231555834722    12491959580   5.12%
> > 502.gcc_r          230896069400     221877703011     9018366389   3.91%
> > 502.gcc_r          192130616624     183856450605     8274166019   4.31%
> > 502.gcc_r          258875074079     247756203226    11118870853   4.30%
> > 505.mcf_r          662653430325     660678680547     1974749778   0.30%
> > 520.omnetpp_r      985114167068     934191310154    50922856914   5.17%
> > 523.xalancbmk_r    927037633578     921688937650     5348695928   0.58%
> > 525.x264_r         490953958454     490565583447      388375007   0.08%
> > 525.x264_r        1994662294421    1993171932425     1490361996   0.07%
> > 525.x264_r        1897617120450    1896062750609     1554369841   0.08%
> > 531.deepsjeng_r   1695189878907    1669304130411    25885748496   1.53%
> > 541.leela_r       1925941222222    1897900861198    28040361024   1.46%
> > 548.exchange2_r   2073816227944    2073816226729           1215   0.00%
> > 557.xz_r           379572090003     379057409041      514680962   0.14%
> > 557.xz_r           953117469352     952680431430      437037922   0.05%
> > 557.xz_r           536859579650     536456690164      402889486   0.08%
> >                  18421773405376   18223938521833   197834883543   1.07%  totals
> >
> > Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> >
> > gcc/ChangeLog:
> >
> >       * config/riscv/riscv.cc (struct machine_function): Add array to store
> >       register wrapping information.
> >       (riscv_for_each_saved_reg): Skip registers that are wrapped separetely.
> >       (riscv_get_separate_components): New function.
> >       (riscv_components_for_bb): Likewise.
> >       (riscv_disqualify_components): Likewise.
> >       (riscv_process_components): Likewise.
> >       (riscv_emit_prologue_components): Likewise.
> >       (riscv_emit_epilogue_components): Likewise.
> >       (riscv_set_handled_components): Likewise.
> >       (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> >       (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> >       (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> >       (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> >       (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> >       (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/riscv/shrink-wrap-1.c: New test.
> >
> > ---
> >
> >  gcc/config/riscv/riscv.cc                     | 187 +++++++++++++++++-
> >  .../gcc.target/riscv/shrink-wrap-1.c          |  25 +++
> >  2 files changed, 210 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
> >
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index 5a0adffb5ce..3b633149a9a 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "config.h"
> >  #include "system.h"
> >  #include "coretypes.h"
> > +#include "backend.h"
> >  #include "tm.h"
> >  #include "rtl.h"
> >  #include "regs.h"
> > @@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "optabs.h"
> >  #include "bitmap.h"
> >  #include "df.h"
> > +#include "function-abi.h"
> >  #include "diagnostic.h"
> >  #include "builtins.h"
> >  #include "predict.h"
> > @@ -147,6 +149,11 @@ struct GTY(())  machine_function {
> >
> >    /* The current frame information, calculated by riscv_compute_frame_info.  */
> >    struct riscv_frame_info frame;
> > +
> > +  /* The components already handled by separate shrink-wrapping, which should
> > +     not be considered by the prologue and epilogue.  */
> > +  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
> > +
> >  };
> >
> >  /* Information about a single argument.  */
> > @@ -4209,7 +4216,7 @@ riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
> >    for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> >      if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> >        {
> > -     bool handle_reg = TRUE;
> > +     bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
> >
> >       /* If this is a normal return in a function that calls the eh_return
> >          builtin, then do not restore the eh return data registers as that
> > @@ -4240,9 +4247,11 @@ riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
> >    for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> >      if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> >        {
> > +     bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
> >       machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> >
> > -     riscv_save_restore_reg (mode, regno, offset, fn);
> > +     if (handle_reg)
> > +       riscv_save_restore_reg (mode, regno, offset, fn);
> >       offset -= GET_MODE_SIZE (mode);
> >        }
> >  }
> > @@ -4667,6 +4676,156 @@ riscv_epilogue_uses (unsigned int regno)
> >    return false;
> >  }
> >
> > +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
> > +
> > +static sbitmap
> > +riscv_get_separate_components (void)
> > +{
> > +  HOST_WIDE_INT offset;
> > +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> > +  bitmap_clear (components);
> > +
> > +  if (riscv_use_save_libcall (&cfun->machine->frame)
> > +      || cfun->machine->interrupt_handler_p)
>
> riscv_use_save_libcall() already checks interrupt_handler_p, so that's
> redundant.  That said, I'm not sure riscv_use_save_libcall() is the
> right check here as unless I'm missing something we don't have all those
> other constraints when shrink-wrapping.
>

riscv_use_save_libcall returns false when interrupt_handler_p is true, so the
check for interrupt_handler_p in the branch is not redundant in this case.

I encountered some issues when shrink wrapping and libcall was used in the same
function. Thinking that libcall replaces the prologue/epilogue I didn't see a
reason to have both at the same time and hence I opted to disable
shrink wrapping
in that case. From my understanding this should be harmless?

> Either way, we'll also need to avoid saving for naked functions.
>

Do naked functions need any additional logic for that?
The shrink-wrapping logic uses cfun->machine->frame.(f)mask
which is set accordingly to naked_p. I believe naked functions
are handled properly.

> > +    return components;
> > +
> > +  offset = cfun->machine->frame.gp_sp_offset;
> > +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> > +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> > +      {
> > +     if (SMALL_OPERAND (offset))
> > +       bitmap_set_bit (components, regno);
> > +
> > +     offset -= UNITS_PER_WORD;
> > +      }
> > +
> > +  offset = cfun->machine->frame.fp_sp_offset;
> > +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> > +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> > +      {
> > +     machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> > +
> > +     if (SMALL_OPERAND (offset))
> > +       bitmap_set_bit (components, regno);
> > +
> > +     offset -= GET_MODE_SIZE (mode);
> > +      }
>
> It seems kind of clunky to have two copies of all these loops (and we'll
> need a third to make this work with the V stuff), but we've got that
> issue elsewhere in the port so I don't think you need to fix it here
> (though the V stuff will be there for the v2, so you'll need the third
> copy of each loop).
>

Indeed, I was following the other ports here. Do you think it would be
better to refactor this when the code for the V extension is added?
By taking into account what code will be needed for V, a proper refactored
function could be made to handle all cases.

> > +
> > +  /* Don't mess with the hard frame pointer.  */
> > +  if (frame_pointer_needed)
> > +    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
> > +
> > +  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
> > +
> > +  return components;
> > +}
> > +
> > +/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
> > +
> > +static sbitmap
> > +riscv_components_for_bb (basic_block bb)
> > +{
> > +  bitmap in = DF_LIVE_IN (bb);
> > +  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
> > +  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
> > +
> > +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> > +  bitmap_clear (components);
> > +
> > +  function_abi_aggregator callee_abis;
> > +  rtx_insn *insn;
> > +  FOR_BB_INSNS (bb, insn)
> > +    if (CALL_P (insn))
> > +      callee_abis.note_callee_abi (insn_callee_abi (insn));
> > +  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
> > +
> > +  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
> > +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> > +    if (!fixed_regs[regno]
> > +       && !crtl->abi->clobbers_full_reg_p (regno)
> > +       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
> > +        || bitmap_bit_p (in, regno)
> > +        || bitmap_bit_p (gen, regno)
> > +        || bitmap_bit_p (kill, regno)))
> > +      bitmap_set_bit (components, regno);
> > +
> > +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> > +    if (!fixed_regs[regno]
> > +       && !crtl->abi->clobbers_full_reg_p (regno)
> > +       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
> > +        || bitmap_bit_p (in, regno)
> > +        || bitmap_bit_p (gen, regno)
> > +        || bitmap_bit_p (kill, regno)))
> > +      bitmap_set_bit (components, regno);
> > +
> > +  return components;
> > +}
> > +
> > +/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  */
> > +
> > +static void
> > +riscv_disqualify_components (sbitmap, edge, sbitmap, bool)
> > +{
> > +  /* Nothing to do for riscv.  */
>
> I think that's true, but when poking around I couldn't figure out how
> saving registers to long stack frames works: we need a temporary register
> to generate the long stack offset, which is easy when we're in the
> pro/epilogue but I haven't figured out if it's handled properly when
> shrink wrapping.
>
> It looks like the aarch64 port side steps the issue by only allowing
> registers that fall within a short offset to be shrink wrapped.  That
> seems like a pretty straight-forward way to handle the problem, but I'm
> not sure how it'd impact performance -- we've already got a lot of
> issues with large stack frames, though, so maybe it's OK to just punt on
> them for now?
>

The same is done in this implementation by checking if SMALL_OPERAND (offset)
is true to shrink wrap a component. I think this should be enough here.

> Either way, this deserves a test case.  I think it should be possible to
> write one by introducing some register pressure around a
> shrink-wrappable block that needs a long stack offset and making sure
> in-flight registers don't get trashed.
>

I tried to think of some way to introduce a test like that but couldn't and
I don't see how it would be done. Shrink wrapping only affects saved registers
so there are always available temporaries that are not affected by
shrink wrapping.
(Register pressure should be irrelevant in this case if I understand correctly).
Also the implementation checks for SMALL_OPERAND (offset) shrink wrapping
should be unaffected from long stack offsets. If you see some way to write
a test for that based on what I explained please explain how I could do that.

A rebased v2 of this patch that resolves some conflicts with master
and minor formatting
issue in the testcase has been also submitted:
https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603822.html

Manolis

> > +}
> > +
> > +static void
> > +riscv_process_components (sbitmap components, bool prologue_p)
> > +{
> > +  HOST_WIDE_INT offset;
> > +  riscv_save_restore_fn fn = prologue_p? riscv_save_reg : riscv_restore_reg;
> > +
> > +  offset = cfun->machine->frame.gp_sp_offset;
> > +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> > +    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
> > +      {
> > +     if (bitmap_bit_p (components, regno))
> > +       riscv_save_restore_reg (word_mode, regno, offset, fn);
> > +
> > +     offset -= UNITS_PER_WORD;
> > +      }
> > +
> > +  offset = cfun->machine->frame.fp_sp_offset;
> > +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> > +    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
> > +      {
> > +     machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
> > +
> > +     if (bitmap_bit_p (components, regno))
> > +       riscv_save_restore_reg (mode, regno, offset, fn);
> > +
> > +     offset -= GET_MODE_SIZE (mode);
> > +      }
> > +}
> > +
> > +/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
> > +
> > +static void
> > +riscv_emit_prologue_components (sbitmap components)
> > +{
> > +  riscv_process_components (components, true);
> > +}
> > +
> > +/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
> > +
> > +static void
> > +riscv_emit_epilogue_components (sbitmap components)
> > +{
> > +  riscv_process_components (components, false);
> > +}
> > +
> > +static void
> > +riscv_set_handled_components (sbitmap components)
> > +{
> > +  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
> > +    if (bitmap_bit_p (components, regno))
> > +      cfun->machine->reg_is_wrapped_separately[regno] = true;
> > +
> > +  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
> > +    if (bitmap_bit_p (components, regno))
> > +      cfun->machine->reg_is_wrapped_separately[regno] = true;
> > +}
> > +
> >  /* Return nonzero if this function is known to have a null epilogue.
> >     This allows the optimizer to omit jumps to jumps if no stack
> >     was created.  */
> > @@ -5713,6 +5872,30 @@ riscv_asan_shadow_offset (void)
> >  #undef TARGET_FUNCTION_ARG_BOUNDARY
> >  #define TARGET_FUNCTION_ARG_BOUNDARY riscv_function_arg_boundary
> >
> > +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
> > +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
> > +  riscv_get_separate_components
> > +
> > +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
> > +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
> > +  riscv_components_for_bb
> > +
> > +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
> > +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
> > +  riscv_disqualify_components
> > +
> > +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
> > +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
> > +  riscv_emit_prologue_components
> > +
> > +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
> > +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
> > +  riscv_emit_epilogue_components
> > +
> > +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
> > +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
> > +  riscv_set_handled_components
> > +
> >  /* The generic ELF target does not always have TLS support.  */
> >  #ifdef HAVE_AS_TLS
> >  #undef TARGET_HAVE_TLS
> > diff --git a/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
> > new file mode 100644
> > index 00000000000..b05745968e6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-fshrink-wrap" } */
> > +/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" } } */
> > +
> > +void g(void);
> > +
> > +void f(int x)
> > +{
> > +     if (x)
> > +    {
> > +      // Those only need to be saved if x is non-zero; without
> > +      // separate shrink-wrapping it would however be saved in all cases.
> > +      // Force saving of callee-saved registers
> > +      register int s2 asm("18") = x;
> > +      register int s3 asm("19") = x;
> > +      register int s4 asm("20") = x;
> > +      asm("" : : "r"(s2));
> > +      asm("" : : "r"(s3));
> > +      asm("" : : "r"(s4));
> > +      g();
> > +    }
> > +}
> > +
> > +// The resulting code should do nothing if x == 0
> > +/* { dg-final { scan-assembler "bne\ta0,zero,.*\n.*ret" } } */
  
Jeff Law Oct. 18, 2022, 3:57 p.m. UTC | #3
Just a couple more comments in-line.

On 10/18/22 09:18, Manolis Tsamis wrote:
>
>>> +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
>>> +
>>> +static sbitmap
>>> +riscv_get_separate_components (void)
>>> +{
>>> +  HOST_WIDE_INT offset;
>>> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
>>> +  bitmap_clear (components);
>>> +
>>> +  if (riscv_use_save_libcall (&cfun->machine->frame)
>>> +      || cfun->machine->interrupt_handler_p)
>> riscv_use_save_libcall() already checks interrupt_handler_p, so that's
>> redundant.  That said, I'm not sure riscv_use_save_libcall() is the
>> right check here as unless I'm missing something we don't have all those
>> other constraints when shrink-wrapping.
>>
> riscv_use_save_libcall returns false when interrupt_handler_p is true, so the
> check for interrupt_handler_p in the branch is not redundant in this case.
>
> I encountered some issues when shrink wrapping and libcall was used in the same
> function. Thinking that libcall replaces the prologue/epilogue I didn't see a
> reason to have both at the same time and hence I opted to disable
> shrink wrapping in that case. From my understanding this should be harmless?

I would have expected things to work fine with libcalls, perhaps with 
the exception of the save/restore libcalls.  So that needs deeper 
investigation.


>>
>> It seems kind of clunky to have two copies of all these loops (and we'll
>> need a third to make this work with the V stuff), but we've got that
>> issue elsewhere in the port so I don't think you need to fix it here
>> (though the V stuff will be there for the v2, so you'll need the third
>> copy of each loop).
>>
> Indeed, I was following the other ports here. Do you think it would be
> better to refactor this when the code for the V extension is added?
> By taking into account what code will be needed for V, a proper refactored
> function could be made to handle all cases.

I think refactoring when V gets added would be fine.  While we could 
probably refactor it correctly now (it isn't terribly complex code after 
all), but we're more likely to get it right with the least amount of 
work if we do it when V is submitted.


>> Either way, this deserves a test case.  I think it should be possible to
>> write one by introducing some register pressure around a
>> shrink-wrappable block that needs a long stack offset and making sure
>> in-flight registers don't get trashed.
>>
> I tried to think of some way to introduce a test like that but couldn't and
> I don't see how it would be done. Shrink wrapping only affects saved registers
> so there are always available temporaries that are not affected by
> shrink wrapping.
> (Register pressure should be irrelevant in this case if I understand correctly).
> Also the implementation checks for SMALL_OPERAND (offset) shrink wrapping
> should be unaffected from long stack offsets. If you see some way to write
> a test for that based on what I explained please explain how I could do that.

I think the register pressure was just to ensure that some saves were 
needed to trigger an attempt to shrink wrap something.  You'd also need 
something to eat stack space (local array which gets referenced as an 
asm operand, but where the asm doesn't generate any code perhaps)?  
Whether or not that works depends on stack layout though which I don't 
know well enough for riscv.


Generally looks pretty good though.


Jeff
  
Palmer Dabbelt Oct. 18, 2022, 5:35 p.m. UTC | #4
On Tue, 18 Oct 2022 08:57:37 PDT (-0700), jlaw@ventanamicro.com wrote:
>
> Just a couple more comments in-line.
>
> On 10/18/22 09:18, Manolis Tsamis wrote:
>>
>>>> +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
>>>> +
>>>> +static sbitmap
>>>> +riscv_get_separate_components (void)
>>>> +{
>>>> +  HOST_WIDE_INT offset;
>>>> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
>>>> +  bitmap_clear (components);
>>>> +
>>>> +  if (riscv_use_save_libcall (&cfun->machine->frame)
>>>> +      || cfun->machine->interrupt_handler_p)
>>> riscv_use_save_libcall() already checks interrupt_handler_p, so that's
>>> redundant.  That said, I'm not sure riscv_use_save_libcall() is the
>>> right check here as unless I'm missing something we don't have all those
>>> other constraints when shrink-wrapping.
>>>
>> riscv_use_save_libcall returns false when interrupt_handler_p is true, so the
>> check for interrupt_handler_p in the branch is not redundant in this case.
>>
>> I encountered some issues when shrink wrapping and libcall was used in the same
>> function. Thinking that libcall replaces the prologue/epilogue I didn't see a
>> reason to have both at the same time and hence I opted to disable
>> shrink wrapping in that case. From my understanding this should be harmless?
>
> I would have expected things to work fine with libcalls, perhaps with
> the exception of the save/restore libcalls.  So that needs deeper
> investigation.

The save/restore libcalls only support saving/restoring a handful of 
register configurations (just the saved X registers in the order they're 
usually saved in by GCC).  It should be OK for correctness to over-save 
registers, but it kind of just un-does the shrink wrapping so not sure 
it's worth worrying about at that point.

There's also some oddness around the save/restore libcall ABI, it's not 
the standard function ABI but instead a GCC-internal one.  IIRC it just 
uses the alternate link register (ie, t0 instead of ra) but I may have 
forgotten something else.

>>> It seems kind of clunky to have two copies of all these loops (and we'll
>>> need a third to make this work with the V stuff), but we've got that
>>> issue elsewhere in the port so I don't think you need to fix it here
>>> (though the V stuff will be there for the v2, so you'll need the third
>>> copy of each loop).
>>>
>> Indeed, I was following the other ports here. Do you think it would be
>> better to refactor this when the code for the V extension is added?
>> By taking into account what code will be needed for V, a proper refactored
>> function could be made to handle all cases.
>
> I think refactoring when V gets added would be fine.  While we could
> probably refactor it correctly now (it isn't terribly complex code after
> all), but we're more likely to get it right with the least amount of
> work if we do it when V is submitted.

Some of the V register blocks are already there, but ya I agree we can 
just wait.  There's going to be a bunch of V-related churn for a bit, 
juggling those patches is already enough of a headache ;)

>>> Either way, this deserves a test case.  I think it should be possible to
>>> write one by introducing some register pressure around a
>>> shrink-wrappable block that needs a long stack offset and making sure
>>> in-flight registers don't get trashed.
>>>
>> I tried to think of some way to introduce a test like that but couldn't and
>> I don't see how it would be done. Shrink wrapping only affects saved registers
>> so there are always available temporaries that are not affected by
>> shrink wrapping.
>> (Register pressure should be irrelevant in this case if I understand correctly).
>> Also the implementation checks for SMALL_OPERAND (offset) shrink wrapping
>> should be unaffected from long stack offsets. If you see some way to write
>> a test for that based on what I explained please explain how I could do that.
>
> I think the register pressure was just to ensure that some saves were
> needed to trigger an attempt to shrink wrap something.  You'd also need
> something to eat stack space (local array which gets referenced as an
> asm operand, but where the asm doesn't generate any code perhaps)? 
> Whether or not that works depends on stack layout though which I don't
> know well enough for riscv.

Sorry for being a bit vague, but it's because I always find it takes a 
bit of time to write up tests like this.  I think something like this 
might do it, but that almost certainly won't work as-is:

    // Some extern bits to try and trip up the optimizer.
    extern long helper(long *sa, long a, long b, long c, ...);
    extern long glob_array[1024];
    
    // The function takes a bunch of arguments to fill up the A 
    // registers.
    long func(long a, long b, long c, ...) {
      long stack_array[1024]; // should be big enough to make SP offsets 
                              // take a temporary
    
      // Here I'm loading up all the T registers with something that 
      // must be saved to the stack, since those helper calls below may 
      // change the value of glob_array.  We probably need to fill the S 
      // registers too?
      long t0 = glob_array[arg + 0];
      long t1 = glob_array[arg + 1];
      long t2 = ...
    
      // This should trigger shrink-wrapping, as the function calls will 
      // need a bunch of register state saved.  The idea is to make sure 
      // all the other registers are somehow in flight at this point and 
      // could only be recovered via their saved copies, but due to the 
      // large stack we'll need a temporary to construct the slot 
      // addresses.
      if (a) {
        t0 = helper(stack_array, a, b, ...);
      }
    
      // Go re-use those temporaries, again indirect to avoid 
      // optimization.
      a += glob_array[t0];
      b += glob_array[t1];
      c ...;
      return helper(stack_array, a, b, c, ...);
    }

At that point you should end up with the save/restore code in that if 
block needing to do something like

    lui  TMP, IMM_HI
    addi TMP, sp, IMM_LO
    sd REG0, 0(t0)
    sd REG1, 8(t0)
    ...

and if one of those registers it's trying to save is TMP then we're in 
trouble.

> Generally looks pretty good though.

Ya, and a useful optimization so thanks for the help ;)

>
>
> Jeff
  
Jeff Law Oct. 19, 2022, 5:15 p.m. UTC | #5
On 10/18/22 11:35, Palmer Dabbelt wrote:
>
>> I would have expected things to work fine with libcalls, perhaps with
>> the exception of the save/restore libcalls.  So that needs deeper
>> investigation.
>
> The save/restore libcalls only support saving/restoring a handful of 
> register configurations (just the saved X registers in the order 
> they're usually saved in by GCC).  It should be OK for correctness to 
> over-save registers, but it kind of just un-does the shrink wrapping 
> so not sure it's worth worrying about at that point.
>
> There's also some oddness around the save/restore libcall ABI, it's 
> not the standard function ABI but instead a GCC-internal one.  IIRC it 
> just uses the alternate link register (ie, t0 instead of ra) but I may 
> have forgotten something else.

I hadn't really dug into it -- I was pretty sure they weren't following 
the standard ABI based on its name and how I've used similar routines to 
save space on some targets in the past.  So if we're having problems 
with shrink-wrapping and libcalls, those two might be worth investigating.


But I think the most important takeaway is that shrink wrapping should 
work with libcalls, there's nothing radically different about libcalls 
that would make them inherently interact poorly with shrink-wrapping.  
So that aspect of the shrink-wrapping patch needs deeper investigation.

Jeff
  
Manolis Tsamis Oct. 20, 2022, 7:35 a.m. UTC | #6
On Tue, Oct 18, 2022 at 8:35 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 18 Oct 2022 08:57:37 PDT (-0700), jlaw@ventanamicro.com wrote:
> >
> > Just a couple more comments in-line.
> >
> > On 10/18/22 09:18, Manolis Tsamis wrote:
> >>
> >>>> +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
> >>>> +
> >>>> +static sbitmap
> >>>> +riscv_get_separate_components (void)
> >>>> +{
> >>>> +  HOST_WIDE_INT offset;
> >>>> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> >>>> +  bitmap_clear (components);
> >>>> +
> >>>> +  if (riscv_use_save_libcall (&cfun->machine->frame)
> >>>> +      || cfun->machine->interrupt_handler_p)
> >>> riscv_use_save_libcall() already checks interrupt_handler_p, so that's
> >>> redundant.  That said, I'm not sure riscv_use_save_libcall() is the
> >>> right check here as unless I'm missing something we don't have all those
> >>> other constraints when shrink-wrapping.
> >>>
> >> riscv_use_save_libcall returns false when interrupt_handler_p is true, so the
> >> check for interrupt_handler_p in the branch is not redundant in this case.
> >>
> >> I encountered some issues when shrink wrapping and libcall was used in the same
> >> function. Thinking that libcall replaces the prologue/epilogue I didn't see a
> >> reason to have both at the same time and hence I opted to disable
> >> shrink wrapping in that case. From my understanding this should be harmless?
> >
> > I would have expected things to work fine with libcalls, perhaps with
> > the exception of the save/restore libcalls.  So that needs deeper
> > investigation.
>
> The save/restore libcalls only support saving/restoring a handful of
> register configurations (just the saved X registers in the order they're
> usually saved in by GCC).  It should be OK for correctness to over-save
> registers, but it kind of just un-does the shrink wrapping so not sure
> it's worth worrying about at that point.
>
> There's also some oddness around the save/restore libcall ABI, it's not
> the standard function ABI but instead a GCC-internal one.  IIRC it just
> uses the alternate link register (ie, t0 instead of ra) but I may have
> forgotten something else.
>
> >>> It seems kind of clunky to have two copies of all these loops (and we'll
> >>> need a third to make this work with the V stuff), but we've got that
> >>> issue elsewhere in the port so I don't think you need to fix it here
> >>> (though the V stuff will be there for the v2, so you'll need the third
> >>> copy of each loop).
> >>>
> >> Indeed, I was following the other ports here. Do you think it would be
> >> better to refactor this when the code for the V extension is added?
> >> By taking into account what code will be needed for V, a proper refactored
> >> function could be made to handle all cases.
> >
> > I think refactoring when V gets added would be fine.  While we could
> > probably refactor it correctly now (it isn't terribly complex code after
> > all), but we're more likely to get it right with the least amount of
> > work if we do it when V is submitted.
>
> Some of the V register blocks are already there, but ya I agree we can
> just wait.  There's going to be a bunch of V-related churn for a bit,
> juggling those patches is already enough of a headache ;)
>
> >>> Either way, this deserves a test case.  I think it should be possible to
> >>> write one by introducing some register pressure around a
> >>> shrink-wrappable block that needs a long stack offset and making sure
> >>> in-flight registers don't get trashed.
> >>>
> >> I tried to think of some way to introduce a test like that but couldn't and
> >> I don't see how it would be done. Shrink wrapping only affects saved registers
> >> so there are always available temporaries that are not affected by
> >> shrink wrapping.
> >> (Register pressure should be irrelevant in this case if I understand correctly).
> >> Also the implementation checks for SMALL_OPERAND (offset) shrink wrapping
> >> should be unaffected from long stack offsets. If you see some way to write
> >> a test for that based on what I explained please explain how I could do that.
> >
> > I think the register pressure was just to ensure that some saves were
> > needed to trigger an attempt to shrink wrap something.  You'd also need
> > something to eat stack space (local array which gets referenced as an
> > asm operand, but where the asm doesn't generate any code perhaps)?
> > Whether or not that works depends on stack layout though which I don't
> > know well enough for riscv.
>
> Sorry for being a bit vague, but it's because I always find it takes a
> bit of time to write up tests like this.  I think something like this
> might do it, but that almost certainly won't work as-is:
>
>     // Some extern bits to try and trip up the optimizer.
>     extern long helper(long *sa, long a, long b, long c, ...);
>     extern long glob_array[1024];
>
>     // The function takes a bunch of arguments to fill up the A
>     // registers.
>     long func(long a, long b, long c, ...) {
>       long stack_array[1024]; // should be big enough to make SP offsets
>                               // take a temporary
>
>       // Here I'm loading up all the T registers with something that
>       // must be saved to the stack, since those helper calls below may
>       // change the value of glob_array.  We probably need to fill the S
>       // registers too?
>       long t0 = glob_array[arg + 0];
>       long t1 = glob_array[arg + 1];
>       long t2 = ...
>
>       // This should trigger shrink-wrapping, as the function calls will
>       // need a bunch of register state saved.  The idea is to make sure
>       // all the other registers are somehow in flight at this point and
>       // could only be recovered via their saved copies, but due to the
>       // large stack we'll need a temporary to construct the slot
>       // addresses.
>       if (a) {
>         t0 = helper(stack_array, a, b, ...);
>       }
>
>       // Go re-use those temporaries, again indirect to avoid
>       // optimization.
>       a += glob_array[t0];
>       b += glob_array[t1];
>       c ...;
>       return helper(stack_array, a, b, c, ...);
>     }
>
> At that point you should end up with the save/restore code in that if
> block needing to do something like
>
>     lui  TMP, IMM_HI
>     addi TMP, sp, IMM_LO
>     sd REG0, 0(t0)
>     sd REG1, 8(t0)
>     ...
>
> and if one of those registers it's trying to save is TMP then we're in
> trouble.
>

Thanks for the additional details and rough sketch of the testcase.
I'll try to make something based on that.

Manolis

> > Generally looks pretty good though.
>
> Ya, and a useful optimization so thanks for the help ;)
>
> >
> >
> > Jeff
  
Manolis Tsamis Oct. 20, 2022, 7:42 a.m. UTC | #7
On Wed, Oct 19, 2022 at 8:16 PM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On 10/18/22 11:35, Palmer Dabbelt wrote:
> >
> >> I would have expected things to work fine with libcalls, perhaps with
> >> the exception of the save/restore libcalls.  So that needs deeper
> >> investigation.
> >
> > The save/restore libcalls only support saving/restoring a handful of
> > register configurations (just the saved X registers in the order
> > they're usually saved in by GCC).  It should be OK for correctness to
> > over-save registers, but it kind of just un-does the shrink wrapping
> > so not sure it's worth worrying about at that point.
> >
> > There's also some oddness around the save/restore libcall ABI, it's
> > not the standard function ABI but instead a GCC-internal one.  IIRC it
> > just uses the alternate link register (ie, t0 instead of ra) but I may
> > have forgotten something else.
>
> I hadn't really dug into it -- I was pretty sure they weren't following
> the standard ABI based on its name and how I've used similar routines to
> save space on some targets in the past.  So if we're having problems
> with shrink-wrapping and libcalls, those two might be worth investigating.
>
>
> But I think the most important takeaway is that shrink wrapping should
> work with libcalls, there's nothing radically different about libcalls
> that would make them inherently interact poorly with shrink-wrapping.
> So that aspect of the shrink-wrapping patch needs deeper investigation.
>

Based on the feedback from both of you that shrink wrapping and
libcall save/restore
should work fine I'll try to lift that restriction again and
investigate further what happens.

Manolis

> Jeff
  
Manolis Tsamis Nov. 2, 2022, 1:54 p.m. UTC | #8
On Tue, Oct 18, 2022 at 8:35 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 18 Oct 2022 08:57:37 PDT (-0700), jlaw@ventanamicro.com wrote:
> >
> > Just a couple more comments in-line.
> >
> > On 10/18/22 09:18, Manolis Tsamis wrote:
> >>
> >>>> +/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
> >>>> +
> >>>> +static sbitmap
> >>>> +riscv_get_separate_components (void)
> >>>> +{
> >>>> +  HOST_WIDE_INT offset;
> >>>> +  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
> >>>> +  bitmap_clear (components);
> >>>> +
> >>>> +  if (riscv_use_save_libcall (&cfun->machine->frame)
> >>>> +      || cfun->machine->interrupt_handler_p)
> >>> riscv_use_save_libcall() already checks interrupt_handler_p, so that's
> >>> redundant.  That said, I'm not sure riscv_use_save_libcall() is the
> >>> right check here as unless I'm missing something we don't have all those
> >>> other constraints when shrink-wrapping.
> >>>
> >> riscv_use_save_libcall returns false when interrupt_handler_p is true, so the
> >> check for interrupt_handler_p in the branch is not redundant in this case.
> >>
> >> I encountered some issues when shrink wrapping and libcall was used in the same
> >> function. Thinking that libcall replaces the prologue/epilogue I didn't see a
> >> reason to have both at the same time and hence I opted to disable
> >> shrink wrapping in that case. From my understanding this should be harmless?
> >
> > I would have expected things to work fine with libcalls, perhaps with
> > the exception of the save/restore libcalls.  So that needs deeper
> > investigation.
>
> The save/restore libcalls only support saving/restoring a handful of
> register configurations (just the saved X registers in the order they're
> usually saved in by GCC).  It should be OK for correctness to over-save
> registers, but it kind of just un-does the shrink wrapping so not sure
> it's worth worrying about at that point.
>
> There's also some oddness around the save/restore libcall ABI, it's not
> the standard function ABI but instead a GCC-internal one.  IIRC it just
> uses the alternate link register (ie, t0 instead of ra) but I may have
> forgotten something else.
>
> >>> It seems kind of clunky to have two copies of all these loops (and we'll
> >>> need a third to make this work with the V stuff), but we've got that
> >>> issue elsewhere in the port so I don't think you need to fix it here
> >>> (though the V stuff will be there for the v2, so you'll need the third
> >>> copy of each loop).
> >>>
> >> Indeed, I was following the other ports here. Do you think it would be
> >> better to refactor this when the code for the V extension is added?
> >> By taking into account what code will be needed for V, a proper refactored
> >> function could be made to handle all cases.
> >
> > I think refactoring when V gets added would be fine.  While we could
> > probably refactor it correctly now (it isn't terribly complex code after
> > all), but we're more likely to get it right with the least amount of
> > work if we do it when V is submitted.
>
> Some of the V register blocks are already there, but ya I agree we can
> just wait.  There's going to be a bunch of V-related churn for a bit,
> juggling those patches is already enough of a headache ;)
>
> >>> Either way, this deserves a test case.  I think it should be possible to
> >>> write one by introducing some register pressure around a
> >>> shrink-wrappable block that needs a long stack offset and making sure
> >>> in-flight registers don't get trashed.
> >>>
> >> I tried to think of some way to introduce a test like that but couldn't and
> >> I don't see how it would be done. Shrink wrapping only affects saved registers
> >> so there are always available temporaries that are not affected by
> >> shrink wrapping.
> >> (Register pressure should be irrelevant in this case if I understand correctly).
> >> Also the implementation checks for SMALL_OPERAND (offset) shrink wrapping
> >> should be unaffected from long stack offsets. If you see some way to write
> >> a test for that based on what I explained please explain how I could do that.
> >
> > I think the register pressure was just to ensure that some saves were
> > needed to trigger an attempt to shrink wrap something.  You'd also need
> > something to eat stack space (local array which gets referenced as an
> > asm operand, but where the asm doesn't generate any code perhaps)?
> > Whether or not that works depends on stack layout though which I don't
> > know well enough for riscv.
>
> Sorry for being a bit vague, but it's because I always find it takes a
> bit of time to write up tests like this.  I think something like this
> might do it, but that almost certainly won't work as-is:
>
>     // Some extern bits to try and trip up the optimizer.
>     extern long helper(long *sa, long a, long b, long c, ...);
>     extern long glob_array[1024];
>
>     // The function takes a bunch of arguments to fill up the A
>     // registers.
>     long func(long a, long b, long c, ...) {
>       long stack_array[1024]; // should be big enough to make SP offsets
>                               // take a temporary
>
>       // Here I'm loading up all the T registers with something that
>       // must be saved to the stack, since those helper calls below may
>       // change the value of glob_array.  We probably need to fill the S
>       // registers too?
>       long t0 = glob_array[arg + 0];
>       long t1 = glob_array[arg + 1];
>       long t2 = ...
>
>       // This should trigger shrink-wrapping, as the function calls will
>       // need a bunch of register state saved.  The idea is to make sure
>       // all the other registers are somehow in flight at this point and
>       // could only be recovered via their saved copies, but due to the
>       // large stack we'll need a temporary to construct the slot
>       // addresses.
>       if (a) {
>         t0 = helper(stack_array, a, b, ...);
>       }
>
>       // Go re-use those temporaries, again indirect to avoid
>       // optimization.
>       a += glob_array[t0];
>       b += glob_array[t1];
>       c ...;
>       return helper(stack_array, a, b, c, ...);
>     }
>
> At that point you should end up with the save/restore code in that if
> block needing to do something like
>
>     lui  TMP, IMM_HI
>     addi TMP, sp, IMM_LO
>     sd REG0, 0(t0)
>     sd REG1, 8(t0)
>     ...
>
> and if one of those registers it's trying to save is TMP then we're in
> trouble.
>

I've revisited this testcase and I think it's not possible to make it
work with the current implementation.
It's not possible to trigger shrink wrapping in this case since the
wrapping of registers is guarded by
 if (SMALL_OPERAND (offset)) { bitmap_set_bit (components, regno); }
Hence if a long stack is generated we get no shrink wrapping.

I also tried to remove that restriction but it looks like it can't
work because we can't create
pseudo-registers during shrink wrapping and shrink wrapping can't work either.

I believe this means that shrink wrapping cannot interfere with a long
stack frame
so there is nothing to test against in this case?

Manolis

> > Generally looks pretty good though.
>
> Ya, and a useful optimization so thanks for the help ;)
>
> >
> >
> > Jeff
  
Manolis Tsamis Nov. 2, 2022, 2:12 p.m. UTC | #9
On Wed, Oct 19, 2022 at 8:16 PM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On 10/18/22 11:35, Palmer Dabbelt wrote:
> >
> >> I would have expected things to work fine with libcalls, perhaps with
> >> the exception of the save/restore libcalls.  So that needs deeper
> >> investigation.
> >
> > The save/restore libcalls only support saving/restoring a handful of
> > register configurations (just the saved X registers in the order
> > they're usually saved in by GCC).  It should be OK for correctness to
> > over-save registers, but it kind of just un-does the shrink wrapping
> > so not sure it's worth worrying about at that point.
> >
> > There's also some oddness around the save/restore libcall ABI, it's
> > not the standard function ABI but instead a GCC-internal one.  IIRC it
> > just uses the alternate link register (ie, t0 instead of ra) but I may
> > have forgotten something else.
>
> I hadn't really dug into it -- I was pretty sure they weren't following
> the standard ABI based on its name and how I've used similar routines to
> save space on some targets in the past.  So if we're having problems
> with shrink-wrapping and libcalls, those two might be worth investigating.
>
>
> But I think the most important takeaway is that shrink wrapping should
> work with libcalls, there's nothing radically different about libcalls
> that would make them inherently interact poorly with shrink-wrapping.
> So that aspect of the shrink-wrapping patch needs deeper investigation.
>
> Jeff

I think I miscommunicated the issue previously because my understanding
of libcalls wasn't very solid. The guard is against the save/restore libcalls
specifically; other than that shrink wrapping and libcalls are fine.I think it
makes sense to leave this check because the prologue/epilogue does
something similar when using libcall save/restore:
  frame->mask = 0; /* Temporarily fib that we need not save GPRs. */

Since shrink wrap components are marked by testing frame->mask then
no registers should be wrapped with the libcall save/restore if I understand
correctly.

Nonetheless, I tested what happens if this guard condition is removed
and the result is that a RISCV test fails (riscv/pr95252.c). In that case
a unnecessary save/restore of a register is emitted together with
inconsistent cfi notes that make dwarf2cfi abort.

To conclude, I believe that this makes the code in the commit fine since
it only guards against the libcall save/restore case. But I may be still
missing something about this.

Manolis
  
Jeff Law Nov. 2, 2022, 3:02 p.m. UTC | #10
On 11/2/22 08:12, Manolis Tsamis wrote:
> On Wed, Oct 19, 2022 at 8:16 PM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> On 10/18/22 11:35, Palmer Dabbelt wrote:
>>>> I would have expected things to work fine with libcalls, perhaps with
>>>> the exception of the save/restore libcalls.  So that needs deeper
>>>> investigation.
>>> The save/restore libcalls only support saving/restoring a handful of
>>> register configurations (just the saved X registers in the order
>>> they're usually saved in by GCC).  It should be OK for correctness to
>>> over-save registers, but it kind of just un-does the shrink wrapping
>>> so not sure it's worth worrying about at that point.
>>>
>>> There's also some oddness around the save/restore libcall ABI, it's
>>> not the standard function ABI but instead a GCC-internal one.  IIRC it
>>> just uses the alternate link register (ie, t0 instead of ra) but I may
>>> have forgotten something else.
>> I hadn't really dug into it -- I was pretty sure they weren't following
>> the standard ABI based on its name and how I've used similar routines to
>> save space on some targets in the past.  So if we're having problems
>> with shrink-wrapping and libcalls, those two might be worth investigating.
>>
>>
>> But I think the most important takeaway is that shrink wrapping should
>> work with libcalls, there's nothing radically different about libcalls
>> that would make them inherently interact poorly with shrink-wrapping.
>> So that aspect of the shrink-wrapping patch needs deeper investigation.
>>
>> Jeff
> I think I miscommunicated the issue previously because my understanding
> of libcalls wasn't very solid. The guard is against the save/restore libcalls
> specifically; other than that shrink wrapping and libcalls are fine.I think it
> makes sense to leave this check because the prologue/epilogue does
> something similar when using libcall save/restore:
>    frame->mask = 0; /* Temporarily fib that we need not save GPRs. */

Looking more closely, yea, it might have been a miscommunication between 
us WRT libcalls.

  You're testing riscv_use_save_libcall, which is only going to kick in 
when we're using that special function to do register saves/restores.  
While we could, in theory, shrink-wrap that as well, I don't think it's 
worth the additional headache.

>
> Since shrink wrap components are marked by testing frame->mask then
> no registers should be wrapped with the libcall save/restore if I understand
> correctly.

That's my understanding as well after looking at the code more closely.  
Essentially the mask is set to zero for the duration of the call to 
riscv_for_each_saved_regs which will effectively avoid shrink wrapping 
in that case.


>
> Nonetheless, I tested what happens if this guard condition is removed
> and the result is that a RISCV test fails (riscv/pr95252.c). In that case
> a unnecessary save/restore of a register is emitted together with
> inconsistent cfi notes that make dwarf2cfi abort.

ACK.  This is the kind of thing I was referring to above when I said we 
probably could shrink wrap the call to save/restore registers, but it's 
probably not worth the headache right now. I could see someone in the 
embedded space one day trying to tackle this problem, but I don't think 
we need to for this patch to go forward.


>
> To conclude, I believe that this makes the code in the commit fine since
> it only guards against the libcall save/restore case. But I may be still
> missing something about this.

I think you've got it figured out reasonably well.


So the final conclusion is libcalls are resolved to my satisfaction.



Jeff
  
Jeff Law Nov. 2, 2022, 3:06 p.m. UTC | #11
On 11/2/22 07:54, Manolis Tsamis wrote:
>
> I've revisited this testcase and I think it's not possible to make it
> work with the current implementation.
> It's not possible to trigger shrink wrapping in this case since the
> wrapping of registers is guarded by
>   if (SMALL_OPERAND (offset)) { bitmap_set_bit (components, regno); }
> Hence if a long stack is generated we get no shrink wrapping.
>
> I also tried to remove that restriction but it looks like it can't
> work because we can't create
> pseudo-registers during shrink wrapping and shrink wrapping can't work either.
>
> I believe this means that shrink wrapping cannot interfere with a long
> stack frame
> so there is nothing to test against in this case?

It'd be marginally better to have such a test case to ensure we don't 
shrink wrap it -- that would ensure that someone doesn't accidentally 
introduce shrink wrapping with large offsets.   Just a bit of future 
proofing.


Jeff
  
Palmer Dabbelt Nov. 3, 2022, 12:26 a.m. UTC | #12
On Wed, 02 Nov 2022 08:06:36 PDT (-0700), jlaw@ventanamicro.com wrote:
>
> On 11/2/22 07:54, Manolis Tsamis wrote:
>>
>> I've revisited this testcase and I think it's not possible to make it
>> work with the current implementation.
>> It's not possible to trigger shrink wrapping in this case since the
>> wrapping of registers is guarded by
>>   if (SMALL_OPERAND (offset)) { bitmap_set_bit (components, regno); }
>> Hence if a long stack is generated we get no shrink wrapping.

Ah, sorry, I must have just missed that when reading the code.  In that 
case we're essentailly just doing what the other port was, so we're 
safe.

>> I also tried to remove that restriction but it looks like it can't
>> work because we can't create
>> pseudo-registers during shrink wrapping and shrink wrapping can't work either.
>>
>> I believe this means that shrink wrapping cannot interfere with a long
>> stack frame
>> so there is nothing to test against in this case?
>
> It'd be marginally better to have such a test case to ensure we don't
> shrink wrap it -- that would ensure that someone doesn't accidentally
> introduce shrink wrapping with large offsets.   Just a bit of future
> proofing.

If there's passing test cases that fail with that check removed then 
it's probably good enough, though I think in this case just having a 
comment there saying why the short-stack check is necessary should be 
fine.
  
Jeff Law Nov. 3, 2022, 10:23 p.m. UTC | #13
On 11/2/22 18:26, Palmer Dabbelt wrote:
>
>>> I also tried to remove that restriction but it looks like it can't
>>> work because we can't create
>>> pseudo-registers during shrink wrapping and shrink wrapping can't 
>>> work either.
>>>
>>> I believe this means that shrink wrapping cannot interfere with a long
>>> stack frame
>>> so there is nothing to test against in this case?
>>
>> It'd be marginally better to have such a test case to ensure we don't
>> shrink wrap it -- that would ensure that someone doesn't accidentally
>> introduce shrink wrapping with large offsets.   Just a bit of future
>> proofing.
>
> If there's passing test cases that fail with that check removed then 
> it's probably good enough, though I think in this case just having a 
> comment there saying why the short-stack check is necessary should be 
> fine.

I can live with this.

jeff
  
Palmer Dabbelt Nov. 7, 2022, 10:07 p.m. UTC | #14
On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com wrote:
>
> On 11/2/22 18:26, Palmer Dabbelt wrote:
>>
>>>> I also tried to remove that restriction but it looks like it can't
>>>> work because we can't create
>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
>>>> work either.
>>>>
>>>> I believe this means that shrink wrapping cannot interfere with a long
>>>> stack frame
>>>> so there is nothing to test against in this case?
>>>
>>> It'd be marginally better to have such a test case to ensure we don't
>>> shrink wrap it -- that would ensure that someone doesn't accidentally
>>> introduce shrink wrapping with large offsets.   Just a bit of future
>>> proofing.
>>
>> If there's passing test cases that fail with that check removed then
>> it's probably good enough, though I think in this case just having a
>> comment there saying why the short-stack check is necessary should be
>> fine.
>
> I can live with this.

Which one (or either)?  I'm fine with either option, just trying to 
avoid another re-spin as this one is a bit vague.
  
Jeff Law Nov. 13, 2022, 1:32 a.m. UTC | #15
On 11/7/22 15:07, Palmer Dabbelt wrote:
> On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com wrote:
>>
>> On 11/2/22 18:26, Palmer Dabbelt wrote:
>>>
>>>>> I also tried to remove that restriction but it looks like it can't
>>>>> work because we can't create
>>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
>>>>> work either.
>>>>>
>>>>> I believe this means that shrink wrapping cannot interfere with a 
>>>>> long
>>>>> stack frame
>>>>> so there is nothing to test against in this case?
>>>>
>>>> It'd be marginally better to have such a test case to ensure we don't
>>>> shrink wrap it -- that would ensure that someone doesn't accidentally
>>>> introduce shrink wrapping with large offsets.   Just a bit of future
>>>> proofing.
>>>
>>> If there's passing test cases that fail with that check removed then
>>> it's probably good enough, though I think in this case just having a
>>> comment there saying why the short-stack check is necessary should be
>>> fine.
>>
>> I can live with this.
>
> Which one (or either)?  I'm fine with either option, just trying to 
> avoid another re-spin as this one is a bit vague.

Sorry I wasn't clear.  Either is fine with me.


Jeff
  
Manolis Tsamis Nov. 16, 2022, 10:26 a.m. UTC | #16
On Sun, Nov 13, 2022 at 3:33 AM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
> On 11/7/22 15:07, Palmer Dabbelt wrote:
> > On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com wrote:
> >>
> >> On 11/2/22 18:26, Palmer Dabbelt wrote:
> >>>
> >>>>> I also tried to remove that restriction but it looks like it can't
> >>>>> work because we can't create
> >>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
> >>>>> work either.
> >>>>>
> >>>>> I believe this means that shrink wrapping cannot interfere with a
> >>>>> long
> >>>>> stack frame
> >>>>> so there is nothing to test against in this case?
> >>>>
> >>>> It'd be marginally better to have such a test case to ensure we don't
> >>>> shrink wrap it -- that would ensure that someone doesn't accidentally
> >>>> introduce shrink wrapping with large offsets.   Just a bit of future
> >>>> proofing.
> >>>
> >>> If there's passing test cases that fail with that check removed then
> >>> it's probably good enough, though I think in this case just having a
> >>> comment there saying why the short-stack check is necessary should be
> >>> fine.
> >>
> >> I can live with this.
> >
> > Which one (or either)?  I'm fine with either option, just trying to
> > avoid another re-spin as this one is a bit vague.
>
> Sorry I wasn't clear.  Either is fine with me.
>

Since all issues/concerns around this are resolved, is the v2 of this patch
good for merging?

Link for v2 patch is
https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603822.html

Manolis

>
> Jeff
>
  
Jeff Law Nov. 17, 2022, 2:09 a.m. UTC | #17
On 11/16/22 03:26, Manolis Tsamis wrote:
> On Sun, Nov 13, 2022 at 3:33 AM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> On 11/7/22 15:07, Palmer Dabbelt wrote:
>>> On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com wrote:
>>>> On 11/2/22 18:26, Palmer Dabbelt wrote:
>>>>>>> I also tried to remove that restriction but it looks like it can't
>>>>>>> work because we can't create
>>>>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
>>>>>>> work either.
>>>>>>>
>>>>>>> I believe this means that shrink wrapping cannot interfere with a
>>>>>>> long
>>>>>>> stack frame
>>>>>>> so there is nothing to test against in this case?
>>>>>> It'd be marginally better to have such a test case to ensure we don't
>>>>>> shrink wrap it -- that would ensure that someone doesn't accidentally
>>>>>> introduce shrink wrapping with large offsets.   Just a bit of future
>>>>>> proofing.
>>>>> If there's passing test cases that fail with that check removed then
>>>>> it's probably good enough, though I think in this case just having a
>>>>> comment there saying why the short-stack check is necessary should be
>>>>> fine.
>>>> I can live with this.
>>> Which one (or either)?  I'm fine with either option, just trying to
>>> avoid another re-spin as this one is a bit vague.
>> Sorry I wasn't clear.  Either is fine with me.
>>
> Since all issues/concerns around this are resolved, is the v2 of this patch
> good for merging?
>
> Link for v2 patch is
> https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603822.html

You just need to add a comment to get_separate_components indicating 
that the SMALL_OPERAND_P check is required as we do not support 
shrink-wrapping with large stack frames.


OK with that comment.  Just post the final version and commit, no need 
to wait for another review.


jeff
  
Manolis Tsamis Nov. 17, 2022, 10:54 a.m. UTC | #18
On Thu, Nov 17, 2022 at 4:09 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
> On 11/16/22 03:26, Manolis Tsamis wrote:
> > On Sun, Nov 13, 2022 at 3:33 AM Jeff Law via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> On 11/7/22 15:07, Palmer Dabbelt wrote:
> >>> On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com wrote:
> >>>> On 11/2/22 18:26, Palmer Dabbelt wrote:
> >>>>>>> I also tried to remove that restriction but it looks like it can't
> >>>>>>> work because we can't create
> >>>>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
> >>>>>>> work either.
> >>>>>>>
> >>>>>>> I believe this means that shrink wrapping cannot interfere with a
> >>>>>>> long
> >>>>>>> stack frame
> >>>>>>> so there is nothing to test against in this case?
> >>>>>> It'd be marginally better to have such a test case to ensure we don't
> >>>>>> shrink wrap it -- that would ensure that someone doesn't accidentally
> >>>>>> introduce shrink wrapping with large offsets.   Just a bit of future
> >>>>>> proofing.
> >>>>> If there's passing test cases that fail with that check removed then
> >>>>> it's probably good enough, though I think in this case just having a
> >>>>> comment there saying why the short-stack check is necessary should be
> >>>>> fine.
> >>>> I can live with this.
> >>> Which one (or either)?  I'm fine with either option, just trying to
> >>> avoid another re-spin as this one is a bit vague.
> >> Sorry I wasn't clear.  Either is fine with me.
> >>
> > Since all issues/concerns around this are resolved, is the v2 of this patch
> > good for merging?
> >
> > Link for v2 patch is
> > https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603822.html
>
> You just need to add a comment to get_separate_components indicating
> that the SMALL_OPERAND_P check is required as we do not support
> shrink-wrapping with large stack frames.
>
>
> OK with that comment.  Just post the final version and commit, no need
> to wait for another review.
>
Final version (v3) for commiting is here
https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606523.html

Thanks
>
> jeff
>
  
Philipp Tomsich Nov. 17, 2022, 11:59 a.m. UTC | #19
v3 committed to master. Thanks!
Philipp.

On Thu, 17 Nov 2022 at 11:55, Manolis Tsamis <manolis.tsamis@vrull.eu>
wrote:

> On Thu, Nov 17, 2022 at 4:09 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> >
> >
> > On 11/16/22 03:26, Manolis Tsamis wrote:
> > > On Sun, Nov 13, 2022 at 3:33 AM Jeff Law via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> On 11/7/22 15:07, Palmer Dabbelt wrote:
> > >>> On Thu, 03 Nov 2022 15:23:28 PDT (-0700), jlaw@ventanamicro.com
> wrote:
> > >>>> On 11/2/22 18:26, Palmer Dabbelt wrote:
> > >>>>>>> I also tried to remove that restriction but it looks like it
> can't
> > >>>>>>> work because we can't create
> > >>>>>>> pseudo-registers during shrink wrapping and shrink wrapping can't
> > >>>>>>> work either.
> > >>>>>>>
> > >>>>>>> I believe this means that shrink wrapping cannot interfere with a
> > >>>>>>> long
> > >>>>>>> stack frame
> > >>>>>>> so there is nothing to test against in this case?
> > >>>>>> It'd be marginally better to have such a test case to ensure we
> don't
> > >>>>>> shrink wrap it -- that would ensure that someone doesn't
> accidentally
> > >>>>>> introduce shrink wrapping with large offsets.   Just a bit of
> future
> > >>>>>> proofing.
> > >>>>> If there's passing test cases that fail with that check removed
> then
> > >>>>> it's probably good enough, though I think in this case just having
> a
> > >>>>> comment there saying why the short-stack check is necessary should
> be
> > >>>>> fine.
> > >>>> I can live with this.
> > >>> Which one (or either)?  I'm fine with either option, just trying to
> > >>> avoid another re-spin as this one is a bit vague.
> > >> Sorry I wasn't clear.  Either is fine with me.
> > >>
> > > Since all issues/concerns around this are resolved, is the v2 of this
> patch
> > > good for merging?
> > >
> > > Link for v2 patch is
> > > https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603822.html
> >
> > You just need to add a comment to get_separate_components indicating
> > that the SMALL_OPERAND_P check is required as we do not support
> > shrink-wrapping with large stack frames.
> >
> >
> > OK with that comment.  Just post the final version and commit, no need
> > to wait for another review.
> >
> Final version (v3) for commiting is here
> https://gcc.gnu.org/pipermail/gcc-patches/2022-November/606523.html
>
> Thanks
> >
> > jeff
> >
>
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5a0adffb5ce..3b633149a9a 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -25,6 +25,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
+#include "backend.h"
 #include "tm.h"
 #include "rtl.h"
 #include "regs.h"
@@ -52,6 +53,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "optabs.h"
 #include "bitmap.h"
 #include "df.h"
+#include "function-abi.h"
 #include "diagnostic.h"
 #include "builtins.h"
 #include "predict.h"
@@ -147,6 +149,11 @@  struct GTY(())  machine_function {
 
   /* The current frame information, calculated by riscv_compute_frame_info.  */
   struct riscv_frame_info frame;
+
+  /* The components already handled by separate shrink-wrapping, which should
+     not be considered by the prologue and epilogue.  */
+  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
+
 };
 
 /* Information about a single argument.  */
@@ -4209,7 +4216,7 @@  riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
   for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
       {
-	bool handle_reg = TRUE;
+	bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
 
 	/* If this is a normal return in a function that calls the eh_return
 	   builtin, then do not restore the eh return data registers as that
@@ -4240,9 +4247,11 @@  riscv_for_each_saved_reg (HOST_WIDE_INT sp_offset, riscv_save_restore_fn fn,
   for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
       {
+	bool handle_reg = !cfun->machine->reg_is_wrapped_separately[regno];
 	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
 
-	riscv_save_restore_reg (mode, regno, offset, fn);
+	if (handle_reg)
+	  riscv_save_restore_reg (mode, regno, offset, fn);
 	offset -= GET_MODE_SIZE (mode);
       }
 }
@@ -4667,6 +4676,156 @@  riscv_epilogue_uses (unsigned int regno)
   return false;
 }
 
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
+
+static sbitmap
+riscv_get_separate_components (void)
+{
+  HOST_WIDE_INT offset;
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+
+  if (riscv_use_save_libcall (&cfun->machine->frame)
+      || cfun->machine->interrupt_handler_p)
+    return components;
+
+  offset = cfun->machine->frame.gp_sp_offset;
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	if (SMALL_OPERAND (offset))
+	  bitmap_set_bit (components, regno);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+
+	if (SMALL_OPERAND (offset))
+	  bitmap_set_bit (components, regno);
+
+	offset -= GET_MODE_SIZE (mode);
+      }
+
+  /* Don't mess with the hard frame pointer.  */
+  if (frame_pointer_needed)
+    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
+
+  return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
+
+static sbitmap
+riscv_components_for_bb (basic_block bb)
+{
+  bitmap in = DF_LIVE_IN (bb);
+  bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
+  bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+
+  function_abi_aggregator callee_abis;
+  rtx_insn *insn;
+  FOR_BB_INSNS (bb, insn)
+    if (CALL_P (insn))
+      callee_abis.note_callee_abi (insn_callee_abi (insn));
+  HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
+
+  /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+       && !crtl->abi->clobbers_full_reg_p (regno)
+       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+	   || bitmap_bit_p (in, regno)
+	   || bitmap_bit_p (gen, regno)
+	   || bitmap_bit_p (kill, regno)))
+      bitmap_set_bit (components, regno);
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+       && !crtl->abi->clobbers_full_reg_p (regno)
+       && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+	   || bitmap_bit_p (in, regno)
+	   || bitmap_bit_p (gen, regno)
+	   || bitmap_bit_p (kill, regno)))
+      bitmap_set_bit (components, regno);
+
+  return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  */
+
+static void
+riscv_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+  /* Nothing to do for riscv.  */
+}
+
+static void
+riscv_process_components (sbitmap components, bool prologue_p)
+{
+  HOST_WIDE_INT offset;
+  riscv_save_restore_fn fn = prologue_p? riscv_save_reg : riscv_restore_reg;
+
+  offset = cfun->machine->frame.gp_sp_offset;
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	if (bitmap_bit_p (components, regno))
+	  riscv_save_restore_reg (word_mode, regno, offset, fn);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+
+	if (bitmap_bit_p (components, regno))
+	  riscv_save_restore_reg (mode, regno, offset, fn);
+
+	offset -= GET_MODE_SIZE (mode);
+      }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
+
+static void
+riscv_emit_prologue_components (sbitmap components)
+{
+  riscv_process_components (components, true);
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
+
+static void
+riscv_emit_epilogue_components (sbitmap components)
+{
+  riscv_process_components (components, false);
+}
+
+static void
+riscv_set_handled_components (sbitmap components)
+{
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (bitmap_bit_p (components, regno))
+      cfun->machine->reg_is_wrapped_separately[regno] = true;
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (bitmap_bit_p (components, regno))
+      cfun->machine->reg_is_wrapped_separately[regno] = true;
+}
+
 /* Return nonzero if this function is known to have a null epilogue.
    This allows the optimizer to omit jumps to jumps if no stack
    was created.  */
@@ -5713,6 +5872,30 @@  riscv_asan_shadow_offset (void)
 #undef TARGET_FUNCTION_ARG_BOUNDARY
 #define TARGET_FUNCTION_ARG_BOUNDARY riscv_function_arg_boundary
 
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
+  riscv_get_separate_components
+
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
+  riscv_components_for_bb
+
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
+  riscv_disqualify_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+  riscv_emit_prologue_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+  riscv_emit_epilogue_components
+
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
+  riscv_set_handled_components
+
 /* The generic ELF target does not always have TLS support.  */
 #ifdef HAVE_AS_TLS
 #undef TARGET_HAVE_TLS
diff --git a/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
new file mode 100644
index 00000000000..b05745968e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/shrink-wrap-1.c
@@ -0,0 +1,25 @@ 
+/* { dg-do compile } */
+/* { dg-options "-fshrink-wrap" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Os" "-Oz" } } */
+
+void g(void);
+
+void f(int x)
+{
+	if (x)
+    {
+      // Those only need to be saved if x is non-zero; without
+      // separate shrink-wrapping it would however be saved in all cases.
+      // Force saving of callee-saved registers
+      register int s2 asm("18") = x;
+      register int s3 asm("19") = x;
+      register int s4 asm("20") = x;
+      asm("" : : "r"(s2));
+      asm("" : : "r"(s3));
+      asm("" : : "r"(s4));
+      g();
+    }
+}
+
+// The resulting code should do nothing if x == 0
+/* { dg-final { scan-assembler "bne\ta0,zero,.*\n.*ret" } } */