libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation (was: [Patch] libgomp/nvptx: Prepare for reverse-offload callback handling)
Checks
Commit Message
Hi!
On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
> This patch adds initial [OpenMP reverse offload] support for nvptx.
> CUDA does lockup when trying to copy data from the currently running
> stream; hence, a new stream is generated to do the memory copying.
As part of other work, where I had to touch those special code paths, I
found that we may reduce complexity a little bit "by using the existing
'goacc_asyncqueue' instead of re-coding parts of it". OK to push
"libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
(still testing), see attached?
Grüße
Thomas
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Comments
Hi Thomas,
On 21.03.23 16:53, Thomas Schwinge wrote:
> On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com>
> wrote:
>> This patch adds initial [OpenMP reverse offload] support for nvptx.
>> CUDA does lockup when trying to copy data from the currently running
>> stream; hence, a new stream is generated to do the memory copying.
> As part of other work, where I had to touch those special code paths, I
> found that we may reduce complexity a little bit "by using the existing
> 'goacc_asyncqueue' instead of re-coding parts of it". OK to push
> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
> (still testing), see attached?
I don't think that just calling "exit (EXIT_FAILURE);" is the proper
way – I think that should be GOMP_PLUGIN_fatal in the plugin and
gomp_fatal in target.c.
Otherwise, it LGTM.
Tobias
> Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
> memory copy implementation
>
> ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
>
> Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
> "libgomp/nvptx: Prepare for reverse-offload callback handling",
> and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
> "libgomp: Handle OpenMP's reverse offloads".
>
> libgomp/
> * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
> 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
> * libgomp.h (gomp_target_rev): Adjust.
> * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
> * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
> * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
> * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
> (rev_off_host_to_dev_cpy): Remove.
> (GOMP_OFFLOAD_run): Adjust.
> ---
> libgomp/libgomp-plugin.c | 7 +--
> libgomp/libgomp-plugin.h | 6 +-
> libgomp/libgomp.h | 5 +-
> libgomp/plugin/plugin-gcn.c | 2 +-
> libgomp/plugin/plugin-nvptx.c | 77 ++++++++++++++-----------
> libgomp/target.c | 102 +++++++++++++++-------------------
> 6 files changed, 96 insertions(+), 103 deletions(-)
>
> diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
> index 27e7c94ba9b..d696515eeb6 100644
> --- a/libgomp/libgomp-plugin.c
> +++ b/libgomp/libgomp-plugin.c
> @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
> void
> GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
> - void (*dev_to_host_cpy) (void *, const void *, size_t,
> - void *),
> - void (*host_to_dev_cpy) (void *, const void *, size_t,
> - void *), void *token)
> + struct goacc_asyncqueue *aq)
> {
> gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
> - dev_to_host_cpy, host_to_dev_cpy, token);
> + aq);
> }
> diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
> index 28267f75f7a..42ee3d6c7f9 100644
> --- a/libgomp/libgomp-plugin.h
> +++ b/libgomp/libgomp-plugin.h
> @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
> __attribute__ ((noreturn, format (printf, 1, 2)));
>
> extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
> - uint64_t, int,
> - void (*) (void *, const void *, size_t,
> - void *),
> - void (*) (void *, const void *, size_t,
> - void *), void *);
> + uint64_t, int, struct goacc_asyncqueue *);
>
> /* Prototypes for functions implemented by libgomp plugins. */
> extern const char *GOMP_OFFLOAD_get_name (void);
> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
> index ba8fe348aba..4d2bfab4b71 100644
> --- a/libgomp/libgomp.h
> +++ b/libgomp/libgomp.h
> @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
> extern int gomp_get_num_devices (void);
> extern bool gomp_target_task_fn (void *);
> extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
> - int,
> - void (*) (void *, const void *, size_t, void *),
> - void (*) (void *, const void *, size_t, void *),
> - void *);
> + int, struct goacc_asyncqueue *);
>
> /* Splay tree definitions. */
> typedef struct splay_tree_node_s *splay_tree_node;
> diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
> index 347803762eb..2181bf0235f 100644
> --- a/libgomp/plugin/plugin-gcn.c
> +++ b/libgomp/plugin/plugin-gcn.c
> @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
> {
> int dev_num = dev_num64;
> GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
> - NULL, NULL, NULL);
> + NULL);
> }
>
> /* Output any data written to console output from the kernel. It is expected
> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
> index 5bd5a419e0e..4a710851ee5 100644
> --- a/libgomp/plugin/plugin-nvptx.c
> +++ b/libgomp/plugin/plugin-nvptx.c
> @@ -56,6 +56,7 @@
> #include <unistd.h>
> #include <assert.h>
> #include <errno.h>
> +#include <stdlib.h>
>
> /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
> block to cache between kernel invocations. For soft-stacks blocks bigger
> @@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
> return 1;
> }
>
> -struct goacc_asyncqueue *
> -GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
> +static struct goacc_asyncqueue *
> +nvptx_goacc_asyncqueue_construct (unsigned int flags)
> {
> CUstream stream = NULL;
> - CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
> + CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
>
> struct goacc_asyncqueue *aq
> = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
> @@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
> return aq;
> }
>
> -bool
> -GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
> +struct goacc_asyncqueue *
> +GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
> +{
> + return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
> +}
> +
> +static bool
> +nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
> {
> CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
> free (aq);
> return true;
> }
>
> +bool
> +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
> +{
> + return nvptx_goacc_asyncqueue_destruct (aq);
> +}
> +
> int
> GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
> {
> @@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
> return -1;
> }
>
> -bool
> -GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
> +static bool
> +nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
> {
> CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
> return true;
> }
>
> +bool
> +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
> +{
> + return nvptx_goacc_asyncqueue_synchronize (aq);
> +}
> +
> bool
> GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
> struct goacc_asyncqueue *aq2)
> @@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
> }
>
>
> -void
> -rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
> - CUstream stream)
> -{
> - CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
> - CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
> -}
> -
> -void
> -rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
> - CUstream stream)
> -{
> - CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
> - CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
> -}
> -
> void
> GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
> {
> @@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
> }
> nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
>
> - size_t stack_size = nvptx_stacks_size ();
> bool reverse_offload = ptx_dev->rev_data != NULL;
> - CUstream copy_stream = NULL;
> + struct goacc_asyncqueue *reverse_offload_aq = NULL;
> + if (reverse_offload)
> + {
> + reverse_offload_aq
> + = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
> + if (!reverse_offload_aq)
> + exit (EXIT_FAILURE);
> + }
> +
> + size_t stack_size = nvptx_stacks_size ();
>
> pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
> void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
> @@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
> GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
> " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
> __FUNCTION__, fn_name, teams, threads);
> - if (reverse_offload)
> - CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
> r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
> 32, threads, 1, 0, NULL, NULL, config);
> if (r != CUDA_SUCCESS)
> @@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
> GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
> rev_data->addrs, rev_data->sizes,
> rev_data->kinds, rev_data->dev_num,
> - rev_off_dev_to_host_cpy,
> - rev_off_host_to_dev_cpy, copy_stream);
> - CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
> + reverse_offload_aq);
> + if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
> + exit (EXIT_FAILURE);
> __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
> }
> usleep (1);
> }
> else
> r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
> - if (reverse_offload)
> - CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
> if (r == CUDA_ERROR_LAUNCH_FAILED)
> GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
> maybe_abort_msg);
> @@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
> GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
>
> pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
> +
> + if (reverse_offload)
> + {
> + if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
> + exit (EXIT_FAILURE);
> + }
> }
>
> /* TODO: Implement GOMP_OFFLOAD_async_run. */
> diff --git a/libgomp/target.c b/libgomp/target.c
> index 79ed64a5dc3..e02188cf7e1 100644
> --- a/libgomp/target.c
> +++ b/libgomp/target.c
> @@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
> void
> gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
> - void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
> - void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
> - void *token)
> + struct goacc_asyncqueue *aq)
> {
> /* Return early if there is no offload code. */
> if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
> @@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
> sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
> kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
> - if (dev_to_host_cpy)
> - {
> - dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
> - mapnum * sizeof (uint64_t), token);
> - dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
> - mapnum * sizeof (uint64_t), token);
> - dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
> - mapnum * sizeof (unsigned short), token);
> - }
> - else
> - {
> - gomp_copy_dev2host (devicep, NULL, devaddrs,
> - (const void *) (uintptr_t) devaddrs_ptr,
> - mapnum * sizeof (uint64_t));
> - gomp_copy_dev2host (devicep, NULL, sizes,
> - (const void *) (uintptr_t) sizes_ptr,
> - mapnum * sizeof (uint64_t));
> - gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
> - mapnum * sizeof (unsigned short));
> - }
> + gomp_copy_dev2host (devicep, aq, devaddrs,
> + (const void *) (uintptr_t) devaddrs_ptr,
> + mapnum * sizeof (uint64_t));
> + gomp_copy_dev2host (devicep, aq, sizes,
> + (const void *) (uintptr_t) sizes_ptr,
> + mapnum * sizeof (uint64_t));
> + gomp_copy_dev2host (devicep, aq, kinds,
> + (const void *) (uintptr_t) kinds_ptr,
> + mapnum * sizeof (unsigned short));
> + if (aq && !devicep->openacc.async.synchronize_func (aq))
> + exit (EXIT_FAILURE);
> }
>
> size_t tgt_align = 0, tgt_size = 0;
> @@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
> (size_t) sizes[i]);
> - else if (dev_to_host_cpy)
> - dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
> - (size_t) sizes[i], token);
> else
> - gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
> - (void *) (uintptr_t) devaddrs[i],
> - (size_t) sizes[i]);
> + {
> + gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
> + (void *) (uintptr_t) devaddrs[i],
> + (size_t) sizes[i]);
> + if (aq && !devicep->openacc.async.synchronize_func (aq))
> + exit (EXIT_FAILURE);
> + }
> devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
> tgt_size = tgt_size + sizes[i];
> if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> @@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> || kind == GOMP_MAP_ALWAYS_TO
> || kind == GOMP_MAP_ALWAYS_TOFROM)
> {
> - if (dev_to_host_cpy)
> - dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
> - (void *) (uintptr_t) cdata[i].devaddr,
> - sizes[i], token);
> - else
> - gomp_copy_dev2host (devicep, NULL,
> - (void *) (uintptr_t) devaddrs[i],
> - (void *) (uintptr_t) cdata[i].devaddr,
> - sizes[i]);
> + gomp_copy_dev2host (devicep, aq,
> + (void *) (uintptr_t) devaddrs[i],
> + (void *) (uintptr_t) cdata[i].devaddr,
> + sizes[i]);
> + if (aq && !devicep->openacc.async.synchronize_func (aq))
> + {
> + gomp_mutex_unlock (&devicep->lock);
> + exit (EXIT_FAILURE);
> + }
> }
> if (struct_cpy)
> struct_cpy--;
> @@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> devaddrs[i]
> = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
> sizes[i]);
> - if (dev_to_host_cpy)
> - dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
> - (void *) (uintptr_t) cdata[i].devaddr,
> - sizes[i], token);
> - else
> - gomp_copy_dev2host (devicep, NULL,
> - (void *) (uintptr_t) devaddrs[i],
> - (void *) (uintptr_t) cdata[i].devaddr,
> - sizes[i]);
> + gomp_copy_dev2host (devicep, aq,
> + (void *) (uintptr_t) devaddrs[i],
> + (void *) (uintptr_t) cdata[i].devaddr,
> + sizes[i]);
> + if (aq && !devicep->openacc.async.synchronize_func (aq))
> + {
> + gomp_mutex_unlock (&devicep->lock);
> + exit (EXIT_FAILURE);
> + }
> }
> for (j = i + 1; j < mapnum; j++)
> {
> @@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
> /* FALLTHRU */
> case GOMP_MAP_FROM:
> case GOMP_MAP_TOFROM:
> - if (copy && host_to_dev_cpy)
> - host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
> - (void *) (uintptr_t) devaddrs[i],
> - sizes[i], token);
> - else if (copy)
> - gomp_copy_host2dev (devicep, NULL,
> - (void *) (uintptr_t) cdata[i].devaddr,
> - (void *) (uintptr_t) devaddrs[i],
> - sizes[i], false, NULL);
> + if (copy)
> + {
> + gomp_copy_host2dev (devicep, aq,
> + (void *) (uintptr_t) cdata[i].devaddr,
> + (void *) (uintptr_t) devaddrs[i],
> + sizes[i], false, NULL);
> + if (aq && !devicep->openacc.async.synchronize_func (aq))
> + exit (EXIT_FAILURE);
> + }
> default:
> break;
> }
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Hi Tobias!
On 2023-04-28T10:48:31+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
> On 21.03.23 16:53, Thomas Schwinge wrote:
>> On 2022-08-26T11:07:28+0200, Tobias Burnus <tobias@codesourcery.com>
>> wrote:
>>> This patch adds initial [OpenMP reverse offload] support for nvptx.
>>> CUDA does lockup when trying to copy data from the currently running
>>> stream; hence, a new stream is generated to do the memory copying.
>> As part of other work, where I had to touch those special code paths, I
>> found that we may reduce complexity a little bit "by using the existing
>> 'goacc_asyncqueue' instead of re-coding parts of it". OK to push
>> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy implementation"
>> (still testing), see attached?
>
> I don't think that just calling "exit (EXIT_FAILURE);" is the proper
> way
The point is, when we run into such an 'exit', we've already issued an
error (in the plugin, via 'GOMP_PLUGIN_fatal'), and then (to replicate
what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after
unlocking. The latter is the reason why we can't just do this:
> – I think that should be GOMP_PLUGIN_fatal in the plugin and
> gomp_fatal in target.c.
..., because we'd dead-lock due to 'atexit' shutdown of devices etc.,
while still having devices etc. locked.
(Resolving all this differently/"properly" is for another day.)
> Otherwise, it LGTM.
Thanks. OK to push then, given the rationale above?
Grüße
Thomas
>> Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
>> memory copy implementation
>>
>> ... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
>>
>> Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
>> "libgomp/nvptx: Prepare for reverse-offload callback handling",
>> and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
>> "libgomp: Handle OpenMP's reverse offloads".
>>
>> libgomp/
>> * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
>> 'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
>> * libgomp.h (gomp_target_rev): Adjust.
>> * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
>> * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
>> * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
>> * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
>> (rev_off_host_to_dev_cpy): Remove.
>> (GOMP_OFFLOAD_run): Adjust.
>> ---
>> libgomp/libgomp-plugin.c | 7 +--
>> libgomp/libgomp-plugin.h | 6 +-
>> libgomp/libgomp.h | 5 +-
>> libgomp/plugin/plugin-gcn.c | 2 +-
>> libgomp/plugin/plugin-nvptx.c | 77 ++++++++++++++-----------
>> libgomp/target.c | 102 +++++++++++++++-------------------
>> 6 files changed, 96 insertions(+), 103 deletions(-)
>>
>> diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
>> index 27e7c94ba9b..d696515eeb6 100644
>> --- a/libgomp/libgomp-plugin.c
>> +++ b/libgomp/libgomp-plugin.c
>> @@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
>> void
>> GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
>> - void (*dev_to_host_cpy) (void *, const void *, size_t,
>> - void *),
>> - void (*host_to_dev_cpy) (void *, const void *, size_t,
>> - void *), void *token)
>> + struct goacc_asyncqueue *aq)
>> {
>> gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
>> - dev_to_host_cpy, host_to_dev_cpy, token);
>> + aq);
>> }
>> diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
>> index 28267f75f7a..42ee3d6c7f9 100644
>> --- a/libgomp/libgomp-plugin.h
>> +++ b/libgomp/libgomp-plugin.h
>> @@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
>> __attribute__ ((noreturn, format (printf, 1, 2)));
>>
>> extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
>> - uint64_t, int,
>> - void (*) (void *, const void *, size_t,
>> - void *),
>> - void (*) (void *, const void *, size_t,
>> - void *), void *);
>> + uint64_t, int, struct goacc_asyncqueue *);
>>
>> /* Prototypes for functions implemented by libgomp plugins. */
>> extern const char *GOMP_OFFLOAD_get_name (void);
>> diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
>> index ba8fe348aba..4d2bfab4b71 100644
>> --- a/libgomp/libgomp.h
>> +++ b/libgomp/libgomp.h
>> @@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
>> extern int gomp_get_num_devices (void);
>> extern bool gomp_target_task_fn (void *);
>> extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
>> - int,
>> - void (*) (void *, const void *, size_t, void *),
>> - void (*) (void *, const void *, size_t, void *),
>> - void *);
>> + int, struct goacc_asyncqueue *);
>>
>> /* Splay tree definitions. */
>> typedef struct splay_tree_node_s *splay_tree_node;
>> diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
>> index 347803762eb..2181bf0235f 100644
>> --- a/libgomp/plugin/plugin-gcn.c
>> +++ b/libgomp/plugin/plugin-gcn.c
>> @@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
>> {
>> int dev_num = dev_num64;
>> GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
>> - NULL, NULL, NULL);
>> + NULL);
>> }
>>
>> /* Output any data written to console output from the kernel. It is expected
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 5bd5a419e0e..4a710851ee5 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -56,6 +56,7 @@
>> #include <unistd.h>
>> #include <assert.h>
>> #include <errno.h>
>> +#include <stdlib.h>
>>
>> /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
>> block to cache between kernel invocations. For soft-stacks blocks bigger
>> @@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
>> return 1;
>> }
>>
>> -struct goacc_asyncqueue *
>> -GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>> +static struct goacc_asyncqueue *
>> +nvptx_goacc_asyncqueue_construct (unsigned int flags)
>> {
>> CUstream stream = NULL;
>> - CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
>> + CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
>>
>> struct goacc_asyncqueue *aq
>> = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
>> @@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>> return aq;
>> }
>>
>> -bool
>> -GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
>> +struct goacc_asyncqueue *
>> +GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
>> +{
>> + return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
>> +}
>> +
>> +static bool
>> +nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
>> {
>> CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
>> free (aq);
>> return true;
>> }
>>
>> +bool
>> +GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
>> +{
>> + return nvptx_goacc_asyncqueue_destruct (aq);
>> +}
>> +
>> int
>> GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>> {
>> @@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
>> return -1;
>> }
>>
>> -bool
>> -GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
>> +static bool
>> +nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
>> {
>> CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
>> return true;
>> }
>>
>> +bool
>> +GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
>> +{
>> + return nvptx_goacc_asyncqueue_synchronize (aq);
>> +}
>> +
>> bool
>> GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
>> struct goacc_asyncqueue *aq2)
>> @@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
>> }
>>
>>
>> -void
>> -rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
>> - CUstream stream)
>> -{
>> - CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
>> - CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
>> -}
>> -
>> -void
>> -rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
>> - CUstream stream)
>> -{
>> - CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
>> - CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
>> -}
>> -
>> void
>> GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>> {
>> @@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>> }
>> nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
>>
>> - size_t stack_size = nvptx_stacks_size ();
>> bool reverse_offload = ptx_dev->rev_data != NULL;
>> - CUstream copy_stream = NULL;
>> + struct goacc_asyncqueue *reverse_offload_aq = NULL;
>> + if (reverse_offload)
>> + {
>> + reverse_offload_aq
>> + = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
>> + if (!reverse_offload_aq)
>> + exit (EXIT_FAILURE);
>> + }
>> +
>> + size_t stack_size = nvptx_stacks_size ();
>>
>> pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
>> void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
>> @@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>> GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
>> " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
>> __FUNCTION__, fn_name, teams, threads);
>> - if (reverse_offload)
>> - CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
>> r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
>> 32, threads, 1, 0, NULL, NULL, config);
>> if (r != CUDA_SUCCESS)
>> @@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>> GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
>> rev_data->addrs, rev_data->sizes,
>> rev_data->kinds, rev_data->dev_num,
>> - rev_off_dev_to_host_cpy,
>> - rev_off_host_to_dev_cpy, copy_stream);
>> - CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
>> + reverse_offload_aq);
>> + if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
>> + exit (EXIT_FAILURE);
>> __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
>> }
>> usleep (1);
>> }
>> else
>> r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
>> - if (reverse_offload)
>> - CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
>> if (r == CUDA_ERROR_LAUNCH_FAILED)
>> GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
>> maybe_abort_msg);
>> @@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
>> GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
>>
>> pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
>> +
>> + if (reverse_offload)
>> + {
>> + if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
>> + exit (EXIT_FAILURE);
>> + }
>> }
>>
>> /* TODO: Implement GOMP_OFFLOAD_async_run. */
>> diff --git a/libgomp/target.c b/libgomp/target.c
>> index 79ed64a5dc3..e02188cf7e1 100644
>> --- a/libgomp/target.c
>> +++ b/libgomp/target.c
>> @@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
>> void
>> gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
>> - void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
>> - void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
>> - void *token)
>> + struct goacc_asyncqueue *aq)
>> {
>> /* Return early if there is no offload code. */
>> if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
>> @@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>> sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
>> kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
>> - if (dev_to_host_cpy)
>> - {
>> - dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
>> - mapnum * sizeof (uint64_t), token);
>> - dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
>> - mapnum * sizeof (uint64_t), token);
>> - dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
>> - mapnum * sizeof (unsigned short), token);
>> - }
>> - else
>> - {
>> - gomp_copy_dev2host (devicep, NULL, devaddrs,
>> - (const void *) (uintptr_t) devaddrs_ptr,
>> - mapnum * sizeof (uint64_t));
>> - gomp_copy_dev2host (devicep, NULL, sizes,
>> - (const void *) (uintptr_t) sizes_ptr,
>> - mapnum * sizeof (uint64_t));
>> - gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
>> - mapnum * sizeof (unsigned short));
>> - }
>> + gomp_copy_dev2host (devicep, aq, devaddrs,
>> + (const void *) (uintptr_t) devaddrs_ptr,
>> + mapnum * sizeof (uint64_t));
>> + gomp_copy_dev2host (devicep, aq, sizes,
>> + (const void *) (uintptr_t) sizes_ptr,
>> + mapnum * sizeof (uint64_t));
>> + gomp_copy_dev2host (devicep, aq, kinds,
>> + (const void *) (uintptr_t) kinds_ptr,
>> + mapnum * sizeof (unsigned short));
>> + if (aq && !devicep->openacc.async.synchronize_func (aq))
>> + exit (EXIT_FAILURE);
>> }
>>
>> size_t tgt_align = 0, tgt_size = 0;
>> @@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>> memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
>> (size_t) sizes[i]);
>> - else if (dev_to_host_cpy)
>> - dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
>> - (size_t) sizes[i], token);
>> else
>> - gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
>> - (void *) (uintptr_t) devaddrs[i],
>> - (size_t) sizes[i]);
>> + {
>> + gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
>> + (void *) (uintptr_t) devaddrs[i],
>> + (size_t) sizes[i]);
>> + if (aq && !devicep->openacc.async.synchronize_func (aq))
>> + exit (EXIT_FAILURE);
>> + }
>> devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
>> tgt_size = tgt_size + sizes[i];
>> if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
>> @@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> || kind == GOMP_MAP_ALWAYS_TO
>> || kind == GOMP_MAP_ALWAYS_TOFROM)
>> {
>> - if (dev_to_host_cpy)
>> - dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
>> - (void *) (uintptr_t) cdata[i].devaddr,
>> - sizes[i], token);
>> - else
>> - gomp_copy_dev2host (devicep, NULL,
>> - (void *) (uintptr_t) devaddrs[i],
>> - (void *) (uintptr_t) cdata[i].devaddr,
>> - sizes[i]);
>> + gomp_copy_dev2host (devicep, aq,
>> + (void *) (uintptr_t) devaddrs[i],
>> + (void *) (uintptr_t) cdata[i].devaddr,
>> + sizes[i]);
>> + if (aq && !devicep->openacc.async.synchronize_func (aq))
>> + {
>> + gomp_mutex_unlock (&devicep->lock);
>> + exit (EXIT_FAILURE);
>> + }
>> }
>> if (struct_cpy)
>> struct_cpy--;
>> @@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> devaddrs[i]
>> = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
>> sizes[i]);
>> - if (dev_to_host_cpy)
>> - dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
>> - (void *) (uintptr_t) cdata[i].devaddr,
>> - sizes[i], token);
>> - else
>> - gomp_copy_dev2host (devicep, NULL,
>> - (void *) (uintptr_t) devaddrs[i],
>> - (void *) (uintptr_t) cdata[i].devaddr,
>> - sizes[i]);
>> + gomp_copy_dev2host (devicep, aq,
>> + (void *) (uintptr_t) devaddrs[i],
>> + (void *) (uintptr_t) cdata[i].devaddr,
>> + sizes[i]);
>> + if (aq && !devicep->openacc.async.synchronize_func (aq))
>> + {
>> + gomp_mutex_unlock (&devicep->lock);
>> + exit (EXIT_FAILURE);
>> + }
>> }
>> for (j = i + 1; j < mapnum; j++)
>> {
>> @@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
>> /* FALLTHRU */
>> case GOMP_MAP_FROM:
>> case GOMP_MAP_TOFROM:
>> - if (copy && host_to_dev_cpy)
>> - host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
>> - (void *) (uintptr_t) devaddrs[i],
>> - sizes[i], token);
>> - else if (copy)
>> - gomp_copy_host2dev (devicep, NULL,
>> - (void *) (uintptr_t) cdata[i].devaddr,
>> - (void *) (uintptr_t) devaddrs[i],
>> - sizes[i], false, NULL);
>> + if (copy)
>> + {
>> + gomp_copy_host2dev (devicep, aq,
>> + (void *) (uintptr_t) cdata[i].devaddr,
>> + (void *) (uintptr_t) devaddrs[i],
>> + sizes[i], false, NULL);
>> + if (aq && !devicep->openacc.async.synchronize_func (aq))
>> + exit (EXIT_FAILURE);
>> + }
>> default:
>> break;
>> }
> -----------------
> Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
On 28.04.23 11:31, Thomas Schwinge wrote:
> On 2023-04-28T10:48:31+0200, Tobias Burnus <tobias@codesourcery.com> wrote:
>> I don't think that just calling "exit (EXIT_FAILURE);" is the proper
>> way
> The point is, when we run into such an 'exit', we've already issued an
> error (in the plugin, via 'GOMP_PLUGIN_fatal'),
you meant: GOMP_PLUGIN_error.
> and then (to replicate
> what 'GOMP_PLUGIN_fatal'/'gomp_fatal' do) we just need to 'exit' -- after
> unlocking. The latter is the reason why we can't just do this:
>
>> – I think that should be GOMP_PLUGIN_fatal in the plugin and
>> gomp_fatal in target.c.
> ..., because we'd dead-lock due to 'atexit' shutdown of devices etc.,
> while still having devices etc. locked.
>
> (Resolving all this differently/"properly" is for another day.)
→ https://gcc.gnu.org/PR109664
>> Otherwise, it LGTM.
> Thanks. OK to push then, given the rationale above?
OK.
Tobias
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
From 65636e924f69a146e571e7a7009304803e24ca1a Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <thomas@codesourcery.com>
Date: Tue, 21 Mar 2023 16:14:16 +0100
Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
memory copy implementation
... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.
Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling",
and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".
libgomp/
* target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
* libgomp.h (gomp_target_rev): Adjust.
* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
* plugin/plugin-gcn.c (process_reverse_offload): Adjust.
* plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
(rev_off_host_to_dev_cpy): Remove.
(GOMP_OFFLOAD_run): Adjust.
---
libgomp/libgomp-plugin.c | 7 +--
libgomp/libgomp-plugin.h | 6 +-
libgomp/libgomp.h | 5 +-
libgomp/plugin/plugin-gcn.c | 2 +-
libgomp/plugin/plugin-nvptx.c | 77 ++++++++++++++-----------
libgomp/target.c | 102 +++++++++++++++-------------------
6 files changed, 96 insertions(+), 103 deletions(-)
@@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
void
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- void (*dev_to_host_cpy) (void *, const void *, size_t,
- void *),
- void (*host_to_dev_cpy) (void *, const void *, size_t,
- void *), void *token)
+ struct goacc_asyncqueue *aq)
{
gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
- dev_to_host_cpy, host_to_dev_cpy, token);
+ aq);
}
@@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
- uint64_t, int,
- void (*) (void *, const void *, size_t,
- void *),
- void (*) (void *, const void *, size_t,
- void *), void *);
+ uint64_t, int, struct goacc_asyncqueue *);
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);
@@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
- int,
- void (*) (void *, const void *, size_t, void *),
- void (*) (void *, const void *, size_t, void *),
- void *);
+ int, struct goacc_asyncqueue *);
/* Splay tree definitions. */
typedef struct splay_tree_node_s *splay_tree_node;
@@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
{
int dev_num = dev_num64;
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
- NULL, NULL, NULL);
+ NULL);
}
/* Output any data written to console output from the kernel. It is expected
@@ -56,6 +56,7 @@
#include <unistd.h>
#include <assert.h>
#include <errno.h>
+#include <stdlib.h>
/* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
block to cache between kernel invocations. For soft-stacks blocks bigger
@@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
return 1;
}
-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
{
CUstream stream = NULL;
- CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+ CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
struct goacc_asyncqueue *aq
= GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
return aq;
}
-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+ return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
free (aq);
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
int
GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
{
@@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
return -1;
}
-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
{
CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
return true;
}
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+ return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
bool
GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
struct goacc_asyncqueue *aq2)
@@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
}
-void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
- CUstream stream)
-{
- CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
@@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
}
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
- size_t stack_size = nvptx_stacks_size ();
bool reverse_offload = ptx_dev->rev_data != NULL;
- CUstream copy_stream = NULL;
+ struct goacc_asyncqueue *reverse_offload_aq = NULL;
+ if (reverse_offload)
+ {
+ reverse_offload_aq
+ = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+ if (!reverse_offload_aq)
+ exit (EXIT_FAILURE);
+ }
+
+ size_t stack_size = nvptx_stacks_size ();
pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
__FUNCTION__, fn_name, teams, threads);
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
32, threads, 1, 0, NULL, NULL, config);
if (r != CUDA_SUCCESS)
@@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
rev_data->addrs, rev_data->sizes,
rev_data->kinds, rev_data->dev_num,
- rev_off_dev_to_host_cpy,
- rev_off_host_to_dev_cpy, copy_stream);
- CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+ reverse_offload_aq);
+ if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+ exit (EXIT_FAILURE);
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
}
usleep (1);
}
else
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
- if (reverse_offload)
- CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
@@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+ if (reverse_offload)
+ {
+ if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+ exit (EXIT_FAILURE);
+ }
}
/* TODO: Implement GOMP_OFFLOAD_async_run. */
@@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
void
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
- void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
- void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
- void *token)
+ struct goacc_asyncqueue *aq)
{
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
- if (dev_to_host_cpy)
- {
- dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
- mapnum * sizeof (uint64_t), token);
- dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
- mapnum * sizeof (uint64_t), token);
- dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
- mapnum * sizeof (unsigned short), token);
- }
- else
- {
- gomp_copy_dev2host (devicep, NULL, devaddrs,
- (const void *) (uintptr_t) devaddrs_ptr,
- mapnum * sizeof (uint64_t));
- gomp_copy_dev2host (devicep, NULL, sizes,
- (const void *) (uintptr_t) sizes_ptr,
- mapnum * sizeof (uint64_t));
- gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) kinds_ptr,
- mapnum * sizeof (unsigned short));
- }
+ gomp_copy_dev2host (devicep, aq, devaddrs,
+ (const void *) (uintptr_t) devaddrs_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, aq, sizes,
+ (const void *) (uintptr_t) sizes_ptr,
+ mapnum * sizeof (uint64_t));
+ gomp_copy_dev2host (devicep, aq, kinds,
+ (const void *) (uintptr_t) kinds_ptr,
+ mapnum * sizeof (unsigned short));
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
}
size_t tgt_align = 0, tgt_size = 0;
@@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
(size_t) sizes[i]);
- else if (dev_to_host_cpy)
- dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
- (size_t) sizes[i], token);
else
- gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
- (void *) (uintptr_t) devaddrs[i],
- (size_t) sizes[i]);
+ {
+ gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
+ (void *) (uintptr_t) devaddrs[i],
+ (size_t) sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
+ }
devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
tgt_size = tgt_size + sizes[i];
if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
|| kind == GOMP_MAP_ALWAYS_TO
|| kind == GOMP_MAP_ALWAYS_TOFROM)
{
- if (dev_to_host_cpy)
- dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i], token);
- else
- gomp_copy_dev2host (devicep, NULL,
- (void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i]);
+ gomp_copy_dev2host (devicep, aq,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
}
if (struct_cpy)
struct_cpy--;
@@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
devaddrs[i]
= (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
sizes[i]);
- if (dev_to_host_cpy)
- dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i], token);
- else
- gomp_copy_dev2host (devicep, NULL,
- (void *) (uintptr_t) devaddrs[i],
- (void *) (uintptr_t) cdata[i].devaddr,
- sizes[i]);
+ gomp_copy_dev2host (devicep, aq,
+ (void *) (uintptr_t) devaddrs[i],
+ (void *) (uintptr_t) cdata[i].devaddr,
+ sizes[i]);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ {
+ gomp_mutex_unlock (&devicep->lock);
+ exit (EXIT_FAILURE);
+ }
}
for (j = i + 1; j < mapnum; j++)
{
@@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
/* FALLTHRU */
case GOMP_MAP_FROM:
case GOMP_MAP_TOFROM:
- if (copy && host_to_dev_cpy)
- host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
- (void *) (uintptr_t) devaddrs[i],
- sizes[i], token);
- else if (copy)
- gomp_copy_host2dev (devicep, NULL,
- (void *) (uintptr_t) cdata[i].devaddr,
- (void *) (uintptr_t) devaddrs[i],
- sizes[i], false, NULL);
+ if (copy)
+ {
+ gomp_copy_host2dev (devicep, aq,
+ (void *) (uintptr_t) cdata[i].devaddr,
+ (void *) (uintptr_t) devaddrs[i],
+ sizes[i], false, NULL);
+ if (aq && !devicep->openacc.async.synchronize_func (aq))
+ exit (EXIT_FAILURE);
+ }
default:
break;
}
--
2.25.1