gcn: Add __builtin_gcn_{get_stack_limit,first_call_this_thread_p}

Message ID 1bec26d6-e2c5-3408-4f61-0fb17e730b3e@codesourcery.com
State Unresolved
Headers
Series gcn: Add __builtin_gcn_{get_stack_limit,first_call_this_thread_p} |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

Tobias Burnus Nov. 18, 2022, 5:20 p.m. UTC
  This patch adds two builtins (getting end-of-stack pointer and
a Boolean answer whether it was the first call to the builtin on this thread).

The idea is to replace some hard-coded values in newlib, permitting to move
later to a manually allocated stack on the compiler side without the need to
modify newlib again. The GCC patch matches what newlib did in reent; I could
imagine that we change this later on.

Lightly tested (especially by visual inspection).
Currently doing a final regtest, OK when it passes?

Any  comments to this patch - or the attached newlib patch?*

Tobias

(*) I also included a patch to newlib to see where were are heading
+ to actually use them for regtesting ...
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Comments

Andrew Stubbs Nov. 18, 2022, 5:49 p.m. UTC | #1
On 18/11/2022 17:20, Tobias Burnus wrote:
> This patch adds two builtins (getting end-of-stack pointer and
> a Boolean answer whether it was the first call to the builtin on this 
> thread).
> 
> The idea is to replace some hard-coded values in newlib, permitting to move
> later to a manually allocated stack on the compiler side without the 
> need to
> modify newlib again. The GCC patch matches what newlib did in reent; I 
> could
> imagine that we change this later on.
> 
> Lightly tested (especially by visual inspection).
> Currently doing a final regtest, OK when it passes?
> 
> Any  comments to this patch - or the attached newlib patch?*
> 
> Tobias
> 
> (*) I also included a patch to newlib to see where were are heading
> + to actually use them for regtesting ...

This looks wrong:

> +	/* stackbase = (stack_segment_decr & 0x0000ffffffffffff)
> +			+ stack_wave_offset);
> +	   seg_size = dispatch_ptr->private_segment_size;
> +	   stacklimit = stackbase + seg_size*64;
> +	   with segsize = dispatch_ptr + 6*sizeof(int16_t) + 3*sizeof(int32_t);
> +	   cf. struct hsa_kernel_dispatch_packet_s in the HSA doc.  */
> +	rtx ptr;
> +	if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
> +	    && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
> +	  {
> +	    rtx size_rtx = gen_rtx_REG (DImode,
> +					cfun->machine->args.reg[DISPATCH_PTR_ARG]);
> +	    size_rtx = gen_rtx_MEM (DImode,
> +				    gen_rtx_PLUS (DImode, size_rtx,
> +						  GEN_INT (6*16 + 3*32)));
> +	    size_rtx = gen_rtx_MULT (DImode, size_rtx, GEN_INT (64));
> +

seg_size is calculated from the private_segment_size loaded from the 
dispatch_ptr, not calculated from the dispatch_ptr itself.

Andrew
  
Tobias Burnus Nov. 19, 2022, 10:46 a.m. UTC | #2
On 18.11.22 18:49, Andrew Stubbs wrote:
> On 18/11/2022 17:20, Tobias Burnus wrote:
>
> This looks wrong:
>
>> +    /* stackbase = (stack_segment_decr & 0x0000ffffffffffff)
>> +            + stack_wave_offset);
>> +       seg_size = dispatch_ptr->private_segment_size;
>> +       stacklimit = stackbase + seg_size*64;
(this should be '*seg_size' not 'seg_size' and the name should be
s/seg_size/seg_size_ptr/.)
>> +       with segsize = dispatch_ptr + 6*sizeof(int16_t) +
>> 3*sizeof(int32_t);
>> +       cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. */
>> +    rtx ptr;
>> +    if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
>> +        && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
>> +      {
>> +        rtx size_rtx = gen_rtx_REG (DImode,
>> + cfun->machine->args.reg[DISPATCH_PTR_ARG]);
>> +        size_rtx = gen_rtx_MEM (DImode,
>> +                    gen_rtx_PLUS (DImode, size_rtx,
>> +                          GEN_INT (6*16 + 3*32)));
>> +        size_rtx = gen_rtx_MULT (DImode, size_rtx, GEN_INT (64));
>> +
(Reading it, I think it should be '..._MEM(SImode,' and
'..._MULT(SImode' instead of DImode.)
> seg_size is calculated from the private_segment_size loaded from the
> dispatch_ptr, not calculated from the dispatch_ptr itself.

Isn't this what thee code tries to do? Namely:


My understanding is that

dispatch_ptr->private_segment_size == *((char*)dispatch_ptr + 192)

And the latter is what I attempt to do. I have a very limited knowledge
of insn/rtx/RTL and of GCN assemply; thus, I likely have done something
stupid. Having said this, Here is what I get:

(Where asm("s4") == dispatch_ptr)

         s_add_u32       s2, s4, 192
         s_addc_u32      s3, s5, 0
         v_writelane_b32 v4, s2, 0
         v_writelane_b32 v5, s3, 0
         s_mov_b64       exec, 1
         flat_load_dwordx2       v[4:5], v[4:5]
         s_waitcnt       0
         v_lshlrev_b64   v[4:5], 6, v[4:5]
         v_readlane_b32  s2, v4, 0
         v_readlane_b32  s3, v5, 0

Not that I really understand every line, but at a glance it
looks okay.

The 192 is because of (quoting newlib/libc/machine/amdgcn/getreent.c):

typedef struct hsa_kernel_dispatch_packet_s {
   uint16_t header ;
   uint16_t setup;
   uint16_t workgroup_size_x ;
   uint16_t workgroup_size_y ;
   uint16_t workgroup_size_z;
   uint16_t reserved0;
   uint32_t grid_size_x ;
   uint32_t grid_size_y ;
   uint32_t grid_size_z;
   uint32_t private_segment_size;

i.e. 6*16 + 3*32 = 192 – and we want to read a 32bit unsigned int.

  * * *

Admittedly, there is probably something not quite right as I see with gfx908

   # of expected passes            27476
   # of unexpected failures        317

where 317 FAIL comes from 88 testcase files.

That's not a a very high number but more than the usual fails, which shows that
something is not quite right.

  * * *

I am pretty sure that I missed something - but the question is what.
I hope you can help me pinpoint the place where it goes wrong.

Thanks,

Tobias

-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  
Andrew Stubbs Nov. 20, 2022, 12:23 a.m. UTC | #3
On 19/11/2022 10:46, Tobias Burnus wrote:
> On 18.11.22 18:49, Andrew Stubbs wrote:
>> On 18/11/2022 17:20, Tobias Burnus wrote:
>>
>> This looks wrong:
>>
>>> +    /* stackbase = (stack_segment_decr & 0x0000ffffffffffff)
>>> +            + stack_wave_offset);
>>> +       seg_size = dispatch_ptr->private_segment_size;
>>> +       stacklimit = stackbase + seg_size*64;
> (this should be '*seg_size' not 'seg_size' and the name should be 
> s/seg_size/seg_size_ptr/.)

Yes, looking again I think the comment is misleading, but the code has 
the MEM so the dereference is there.

>>> +       with segsize = dispatch_ptr + 6*sizeof(int16_t) + 
>>> 3*sizeof(int32_t);
>>> +       cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. */
>>> +    rtx ptr;
>>> +    if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
>>> +        && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
>>> +      {
>>> +        rtx size_rtx = gen_rtx_REG (DImode,
>>> + cfun->machine->args.reg[DISPATCH_PTR_ARG]);
>>> +        size_rtx = gen_rtx_MEM (DImode,
>>> +                    gen_rtx_PLUS (DImode, size_rtx,
>>> +                          GEN_INT (6*16 + 3*32)));
>>> +        size_rtx = gen_rtx_MULT (DImode, size_rtx, GEN_INT (64));
>>> +
> (Reading it, I think it should be '..._MEM(SImode,' and 
> '..._MULT(SImode' instead of DImode.)

Yes, I think you're right; the field is uint32.

> Admittedly, there is probably something not quite right as I see with 
> gfx908
> 
>    # of expected passes            27476
>    # of unexpected failures        317
> 
> where 317 FAIL comes from 88 testcase files.
> 
> That's not a a very high number but more than the usual fails, which 
> shows that
> something is not quite right.
> 
>   * * *
> 
> I am pretty sure that I missed something - but the question is what.
> I hope you can help me pinpoint the place where it goes wrong.

This might be it:

> +	    if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
> +	      {
> +		rtx off;
> +		off = gen_rtx_REG (SImode,
> +		      cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
> +		ptr = gen_rtx_PLUS (DImode, ptr, off);
> +	      }

I think "off" needs to be zero-extended before you can add the SImode to 
DImode (same for the segment size, of course).

Andrew
  
Tobias Burnus Nov. 21, 2022, 1:41 p.m. UTC | #4
On 19.11.22 11:46, Tobias Burnus wrote:
>> +       stacklimit = stackbase + seg_size*64;
> (this should be '*seg_size' not 'seg_size' and the name should be
> s/seg_size/seg_size_ptr/.)
I have updated the comment and ...
> (Reading it, I think it should be '..._MEM(SImode,' and
> '..._MULT(SImode' instead of DImode.)
Additionally, there was a problem of bytes vs. bits in:
> My understanding is that
> dispatch_ptr->private_segment_size == *((char*)dispatch_ptr + 192)

which is wrong - its 192 bits but only 24 bytes!

Finally, in the first_call_this_thread_p() call, I mixed up EQ vs. NE at one place.

BTW: It seems as if there is no problem with zero extension, if I look at the assembler result.

Updated version. Consists of: GCC patch adding the builtins,
the newlib patch using those (unchanged; used for testing + to be submitted), and
a 'test.c' using the builtins and its dump produced with amdgcn's
'cc1 -O2' to show the resulting assembly.

Tested with libgomp on gfx908 offloading and getting only the known fails:
(libgomp.c-c++-common/teams-2.c, libgomp.fortran/async_io_*.f90,
libgomp.oacc-c-c++-common/{deep-copy-10.c,static-variable-1.c,vprop.c})

OK for mainline?

Tobias
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
.amdgcn_target "amdgcn-unknown-amdhsa--gfx803"
	.text
	.align	4
	.globl	foo


	.type	foo,@function
foo:
.LFB0:
	; using flat addressing in function
	; frame pointer needed: true
	; lr needs saving: false
	; outgoing args size: 0
	; pretend size: 0
	; local vars size: 0
	; callee save size: 8
	v_writelane_b32 v6, s14, 0
	v_writelane_b32 v6, s15, 1
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_store_dword	v[4:5], v6
	s_add_u32	s16, s16, 8
	s_addc_u32	s17, s17, 0
.LCFI0:
	s_add_u32	s14, s16, 0
.LCFI1:
	s_addc_u32	s15, s17, 0
	s_mov_b32	m0, 65536
; 5 "test.c" 1
	; one
; 0 "" 2
	s_mov_b32	s2, -1
	s_mov_b32	s3, 65535
	s_and_b64	s[2:3], s[0:1], s[2:3]
	s_add_u32	s12, s4, 24
	s_addc_u32	s13, s5, 0
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	s_mov_b64	exec, 1
	flat_load_dword	v0, v[4:5]
	s_waitcnt	0
	v_lshlrev_b32	v0, 6, v0
	v_readlane_b32	s12, v0, 0
	s_mov_b32	s13, 0
	s_add_u32	s2, s2, s12
	s_addc_u32	s3, s3, s13
	s_mov_b32	s13, 0
	s_add_u32	s2, s2, s11
	s_addc_u32	s3, s3, s13
; 7 "test.c" 1
	; two
; 0 "" 2
	s_getpc_b64	s[12:13]
	s_add_u32	s12, s12, ptr@rel32@lo+4
	s_addc_u32	s13, s13, ptr@rel32@hi+4
	s_add_u32	s2, s2, 1234
	s_addc_u32	s3, s3, 0
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	v_writelane_b32	v6, s2, 0
	v_writelane_b32	v7, s3, 0
	flat_store_dwordx2	v[4:5], v[6:7]
	s_sub_u32	s16, s14, 8
	s_subb_u32	s17, s15, 0
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_load_dword	v6, v[4:5]
	s_waitcnt	0
	v_readlane_b32 s14, v6, 0
	v_readlane_b32 s15, v6, 1
	s_setpc_b64	s[18:19]
.LFE0:
	.size	foo, .-foo
	.align	4
	.globl	bar


	.type	bar,@function
bar:
.LFB1:
	; using flat addressing in function
	; frame pointer needed: true
	; lr needs saving: false
	; outgoing args size: 0
	; pretend size: 0
	; local vars size: 0
	; callee save size: 8
	v_writelane_b32 v6, s14, 0
	v_writelane_b32 v6, s15, 1
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_store_dword	v[4:5], v6
	s_add_u32	s16, s16, 8
	s_addc_u32	s17, s17, 0
.LCFI2:
	s_add_u32	s14, s16, 0
.LCFI3:
	s_addc_u32	s15, s17, 0
	s_mov_b32	m0, 65536
; 15 "test.c" 1
	; three
; 0 "" 2
	s_lshr_b64	s[2:3], s[0:1], 48
	s_cmp_lg_u64	s[2:3], 12345
	s_mov_b32	s2, scc
	s_mov_b32	vcc_lo, scc
	s_mov_b32	vcc_hi, 0
	s_cbranch_vccz	.L4
	v_writelane_b32	v4, s0, 0
	v_writelane_b32	v5, s1, 0
	s_mov_b64	exec, 1
	v_and_b32	v4, -1, v4
	v_and_b32	v5, 65535, v5
	v_or_b32	v4, 0, v4
	v_or_b32	v5, 809041920, v5
.L4:
	s_lshl_b32	s2, s2, 31
	s_lshr_b32	s2, s2, 31
; 17 "test.c" 1
	; four
; 0 "" 2
	s_getpc_b64	s[12:13]
	s_add_u32	s12, s12, b@rel32@lo+4
	s_addc_u32	s13, s13, b@rel32@hi+4
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	v_writelane_b32	v0, s2, 0
	s_mov_b64	exec, 1
	flat_store_dword	v[4:5], v0 glc
; 19 "test.c" 1
	; five
; 0 "" 2
	s_cmp_eq_u32	s2, 0
	s_cbranch_scc1	.L5
; 21 "test.c" 1
	;true
; 0 "" 2
.L3:
	s_sub_u32	s16, s14, 8
	s_subb_u32	s17, s15, 0
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_load_dword	v6, v[4:5]
	s_waitcnt	0
	v_readlane_b32 s14, v6, 0
	v_readlane_b32 s15, v6, 1
	s_setpc_b64	s[18:19]
.L5:
; 23 "test.c" 1
	;false
; 0 "" 2
	s_branch	.L3
.LFE1:
	.size	bar, .-bar
	.globl	b
	.bss
	.align	16
	.type	b, @object
	.size	b, 4
b:
	.zero	4
	.globl	ptr
	.align	16
	.type	ptr, @object
	.size	ptr, 8
ptr:
	.zero	8
	.section	.debug_frame,"",@progbits
.Lframe0:
	.4byte	.LECIE0-.LSCIE0
.LSCIE0:
	.4byte	0xffffffff
	.byte	0x3
	.string	""
	.byte	0x1
	.byte	0x4
	.byte	0x10
	.byte	0xf
	.byte	0xa
	.byte	0x92
	.byte	0x31
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x30
	.byte	0
	.byte	0x22
	.byte	0x10
	.byte	0x10
	.byte	0xa
	.byte	0x92
	.byte	0x33
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x32
	.byte	0
	.byte	0x22
	.align	8
.LECIE0:
.LSFDE0:
	.4byte	.LEFDE0-.LASFDE0
.LASFDE0:
	.4byte	.Lframe0
	.8byte	.LFB0
	.8byte	.LFE0-.LFB0
	.byte	0x4
	.4byte	.LCFI0-.LFB0
	.byte	0xae
	.byte	0
	.byte	0xaf
	.byte	0x1
	.byte	0x4
	.4byte	.LCFI1-.LCFI0
	.byte	0xf
	.byte	0xc
	.byte	0x92
	.byte	0x2f
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x2e
	.byte	0
	.byte	0x22
	.byte	0x38
	.byte	0x1c
	.align	8
.LEFDE0:
.LSFDE2:
	.4byte	.LEFDE2-.LASFDE2
.LASFDE2:
	.4byte	.Lframe0
	.8byte	.LFB1
	.8byte	.LFE1-.LFB1
	.byte	0x4
	.4byte	.LCFI2-.LFB1
	.byte	0xae
	.byte	0
	.byte	0xaf
	.byte	0x1
	.byte	0x4
	.4byte	.LCFI3-.LCFI2
	.byte	0xf
	.byte	0xc
	.byte	0x92
	.byte	0x2f
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x2e
	.byte	0
	.byte	0x22
	.byte	0x38
	.byte	0x1c
	.align	8
.LEFDE2:
	.ident	"GCC: (GNU) 13.0.0 20221121 (experimental)"
  
Li, Pan2 via Gcc-patches Nov. 21, 2022, 2:58 p.m. UTC | #5
On 21/11/2022 13:41, Tobias Burnus wrote:
> On 19.11.22 11:46, Tobias Burnus wrote:
>>> +       stacklimit = stackbase + seg_size*64;
>> (this should be '*seg_size' not 'seg_size' and the name should be
>> s/seg_size/seg_size_ptr/.)
> I have updated the comment and ...
>> (Reading it, I think it should be '..._MEM(SImode,' and
>> '..._MULT(SImode' instead of DImode.)
> Additionally, there was a problem of bytes vs. bits in:
>> My understanding is that
>> dispatch_ptr->private_segment_size == *((char*)dispatch_ptr + 192)
> 
> which is wrong - its 192 bits but only 24 bytes!
> 
> Finally, in the first_call_this_thread_p() call, I mixed up EQ vs. NE at 
> one place.
> 
> BTW: It seems as if there is no problem with zero extension, if I look 
> at the assembler result.
> 
> Updated version. Consists of: GCC patch adding the builtins,
> the newlib patch using those (unchanged; used for testing + to be 
> submitted), and
> a 'test.c' using the builtins and its dump produced with amdgcn's
> 'cc1 -O2' to show the resulting assembly.
> 
> Tested with libgomp on gfx908 offloading and getting only the known fails:
> (libgomp.c-c++-common/teams-2.c, libgomp.fortran/async_io_*.f90,
> libgomp.oacc-c-c++-common/{deep-copy-10.c,static-variable-1.c,vprop.c})
> 
> OK for mainline?

OK, provided it has been tested in both stand-alone and offload modes, 
and the newlib tests too.

Andrew
  

Patch

amdgcn: Use __builtin_gcn_ in libc/machine/amdgcn/getreent.c

Call __builtin_gcn_get_stack_limit and __builtin_gcn_first_call_this_thread_p
to reduce dependency on some register/layout assumptions by using the new
GCC mainline (GCC 13) builtins, if they are available. If not, the existing
code is used.

 newlib/libc/machine/amdgcn/getreent.c | 38 ++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/newlib/libc/machine/amdgcn/getreent.c b/newlib/libc/machine/amdgcn/getreent.c
index be7d2edc9..ef731f649 100644
--- a/newlib/libc/machine/amdgcn/getreent.c
+++ b/newlib/libc/machine/amdgcn/getreent.c
@@ -29,22 +29,42 @@  typedef struct hsa_kernel_dispatch_packet_s {
 struct _reent *
 __getreent (void)
 {
-  /* Place the reent data at the top of the stack allocation.
-     s[0:1] contains a 48-bit private segment base address.
+  /* Place the reent data at the top of the stack allocation.  */
+  struct data {
+    int marker;
+    struct _reent reent;
+  } *data;
+
+#if defined(__has_builtin) \
+    && __has_builtin(__builtin_gcn_get_stack_limit) \
+    && __has_builtin(__builtin_gcn_first_call_this_thread_p)
+  unsigned long addr = (((unsigned long) __builtin_gcn_get_stack_limit()
+			 - sizeof(struct data)) & ~7);
+  data = (struct data *)addr;
+
+  register long sp asm("s16");
+
+  if (sp >= addr)
+    goto stackoverflow;
+  if (__builtin_gcn_first_call_this_thread_p())
+    {
+      data->marker = 12345;
+      __builtin_memset (&data->reent, 0, sizeof(struct _reent));
+      _REENT_INIT_PTR_ZEROED (&data->reent);
+    }
+  else if (data->marker != 12345)
+    goto stackoverflow;
+#else
+  /* s[0:1] contains a 48-bit private segment base address.
      s11 contains the offset to the base of the stack.
      s[4:5] contains the dispatch pointer.
-     
+
      WARNING: this code will break if s[0:1] is ever used for anything!  */
   const register unsigned long buffer_descriptor asm("s0");
   unsigned long private_segment = buffer_descriptor & 0x0000ffffffffffff;
   const register unsigned int stack_offset asm("s11");
   const register hsa_kernel_dispatch_packet_t *dispatch_ptr asm("s4");
 
-  struct data {
-    int marker;
-    struct _reent reent;
-  } *data;
-
   unsigned long stack_base = private_segment + stack_offset;
   unsigned long stack_end = stack_base + dispatch_ptr->private_segment_size * 64;
   unsigned long addr = (stack_end - sizeof(struct data)) & ~7;
@@ -69,7 +89,7 @@  __getreent (void)
     }
   else if (data->marker != 12345)
     goto stackoverflow;
-
+#endif
 
   return &data->reent;