@@ -56,6 +56,10 @@
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
free (((void)(MEMSPACE), (void)(SIZE), (ADDR)))
#endif
+#ifndef MEMSPACE_VALIDATE
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+ (((void)(MEMSPACE), (void)(ACCESS), 1))
+#endif
/* Map the predefined allocators to the correct memory space.
The index to this table is the omp_allocator_handle_t enum value.
@@ -439,6 +443,10 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
if (data.pinned)
return omp_null_allocator;
+  /* Reject unsupported memory spaces or access traits.  */
+ if (!MEMSPACE_VALIDATE (data.memspace, data.access))
+ return omp_null_allocator;
+
ret = gomp_malloc (sizeof (struct omp_allocator_data));
*ret = data;
#ifndef HAVE_SYNC_BUILTINS
@@ -522,6 +530,10 @@ retry:
new_size += new_alignment - sizeof (void *);
if (__builtin_add_overflow (size, new_size, &new_size))
goto fail;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+ if (allocator == omp_low_lat_mem_alloc)
+ goto fail;
+#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@@ -820,6 +832,10 @@ retry:
goto fail;
if (__builtin_add_overflow (size_temp, new_size, &new_size))
goto fail;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+ if (allocator == omp_low_lat_mem_alloc)
+ goto fail;
+#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@@ -1054,6 +1070,10 @@ retry:
if (__builtin_add_overflow (size, new_size, &new_size))
goto fail;
old_size = data->size;
+#ifdef OMP_LOW_LAT_MEM_ALLOC_INVALID
+ if (allocator == omp_low_lat_mem_alloc)
+ goto fail;
+#endif
if (__builtin_expect (allocator_data
&& allocator_data->pool_size < ~(uintptr_t) 0, 0))
@@ -108,6 +108,21 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
return realloc (addr, size);
}
+static inline int
+nvptx_memspace_validate (omp_memspace_handle_t memspace, unsigned access)
+{
+#if __PTX_ISA_VERSION_MAJOR__ > 4 \
+    || (__PTX_ISA_VERSION_MAJOR__ == 4 && __PTX_ISA_VERSION_MINOR__ >= 1)
+  /* Disallow use of low-latency memory when it must be accessible by
+     all threads.  */
+  return (memspace != omp_low_lat_mem_space
+	  || access != omp_atv_all);
+#else
+  /* Low-latency memory is not available before PTX 4.1.  */
+  return (memspace != omp_low_lat_mem_space);
+#endif
+}
+
#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \
nvptx_memspace_alloc (MEMSPACE, SIZE)
#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \
@@ -116,5 +131,11 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
nvptx_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE)
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE) \
nvptx_memspace_free (MEMSPACE, ADDR, SIZE)
+#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS) \
+ nvptx_memspace_validate (MEMSPACE, ACCESS)
+
+/* The default low-latency memspace implies omp_atv_all, which is incompatible
+ with the .shared memory space. */
+#define OMP_LOW_LAT_MEM_ALLOC_INVALID 1
#include "../../allocator.c"
@@ -5767,6 +5767,9 @@ Additional notes regarding the traits:
@item The @code{sync_hint} trait has no effect.
@end itemize
+See also:
+@ref{Offload-Target Specifics}
+
@c ---------------------------------------------------------------------
@c Offload-Target Specifics
@c ---------------------------------------------------------------------
@@ -5900,6 +5903,21 @@ The implementation remark:
directive for non-contiguous list items will use the 2D and 3D
memory-copy functions of the CUDA library. Higher dimensions will
call those functions in a loop and are therefore supported.
+@item Low-latency memory (@code{omp_low_lat_mem_space}) is supported when
+  the @code{access} trait is set to @code{cgroup}, the ISA is at least
+ @code{sm_53}, and the PTX version is at least 4.1. The default pool size
+ is 8 kiB per team, but may be adjusted at runtime by setting environment
+ variable @code{GOMP_NVPTX_LOWLAT_POOL=@var{bytes}}. The maximum value is
+ limited by the available hardware, and care should be taken that the
+ selected pool size does not unduly limit the number of teams that can
+ run simultaneously.
+@item @code{omp_low_lat_mem_alloc} cannot be used with true low-latency memory
+ because the definition implies the @code{omp_atv_all} trait; main
+ graphics memory is used instead.
+@item @code{omp_cgroup_mem_alloc}, @code{omp_pteam_mem_alloc}, and
+  @code{omp_thread_mem_alloc} all use low-latency memory as first
+ preference, and fall back to main graphics memory when the low-latency
+ pool is exhausted.
@end itemize
@@ -32,12 +32,21 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
+ /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
+ omp_allocator_handle_t gpu_lowlat = 0;
+ #pragma omp target map(from:gpu_lowlat)
+ {
+ omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+ gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+ }
+
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
+ test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@@ -48,6 +57,7 @@ main ()
test (100000, omp_const_mem_alloc);
test (100000, omp_high_bw_mem_alloc);
test (100000, omp_low_lat_mem_alloc);
+ test (100000, gpu_lowlat);
test (100000, omp_cgroup_mem_alloc);
test (100000, omp_pteam_mem_alloc);
test (100000, omp_thread_mem_alloc);
@@ -40,12 +40,19 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
+ /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
+ omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+ omp_allocator_handle_t gpu_lowlat;
+ #pragma omp target map(from:gpu_lowlat)
+ gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
+ test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@@ -56,6 +63,7 @@ main ()
test (1000, omp_const_mem_alloc);
test (1000, omp_high_bw_mem_alloc);
test (1000, omp_low_lat_mem_alloc);
+ test (1000, gpu_lowlat);
test (1000, omp_cgroup_mem_alloc);
test (1000, omp_pteam_mem_alloc);
test (1000, omp_thread_mem_alloc);
@@ -28,12 +28,19 @@ test (omp_allocator_handle_t allocator)
int
main ()
{
+ /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
+ omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+ omp_allocator_handle_t gpu_lowlat;
+ #pragma omp target map(from:gpu_lowlat)
+ gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
// Smaller than low-latency memory limit
test (omp_default_mem_alloc);
test (omp_large_cap_mem_alloc);
test (omp_const_mem_alloc);
test (omp_high_bw_mem_alloc);
test (omp_low_lat_mem_alloc);
+ test (gpu_lowlat);
test (omp_cgroup_mem_alloc);
test (omp_pteam_mem_alloc);
test (omp_thread_mem_alloc);
@@ -26,10 +26,11 @@ main ()
#pragma omp target
{
/* Ensure that the memory we get *is* low-latency with a null-fallback. */
- omp_alloctrait_t traits[1]
- = { { omp_atk_fallback, omp_atv_null_fb } };
+ omp_alloctrait_t traits[2]
+ = { { omp_atk_fallback, omp_atv_null_fb },
+ { omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
- 1, traits);
+ 2, traits);
int size = 4;
@@ -39,12 +39,19 @@ test (int n, omp_allocator_handle_t allocator)
int
main ()
{
+ /* omp_low_lat_mem_alloc doesn't actually get low-latency memory on GPU. */
+ omp_alloctrait_t traits[1] = { { omp_atk_access, omp_atv_cgroup } };
+ omp_allocator_handle_t gpu_lowlat;
+ #pragma omp target map(from:gpu_lowlat)
+ gpu_lowlat = omp_init_allocator (omp_low_lat_mem_space, 1, traits);
+
// Smaller than low-latency memory limit
test (10, omp_default_mem_alloc);
test (10, omp_large_cap_mem_alloc);
test (10, omp_const_mem_alloc);
test (10, omp_high_bw_mem_alloc);
test (10, omp_low_lat_mem_alloc);
+ test (10, gpu_lowlat);
test (10, omp_cgroup_mem_alloc);
test (10, omp_pteam_mem_alloc);
test (10, omp_thread_mem_alloc);
@@ -55,6 +62,7 @@ main ()
test (100000, omp_const_mem_alloc);
test (100000, omp_high_bw_mem_alloc);
test (100000, omp_low_lat_mem_alloc);
+ test (100000, gpu_lowlat);
test (100000, omp_cgroup_mem_alloc);
test (100000, omp_pteam_mem_alloc);
test (100000, omp_thread_mem_alloc);
@@ -26,10 +26,11 @@ main ()
#pragma omp target
{
/* Ensure that the memory we get *is* low-latency with a null-fallback. */
- omp_alloctrait_t traits[1]
- = { { omp_atk_fallback, omp_atv_null_fb } };
+ omp_alloctrait_t traits[2]
+ = { { omp_atk_fallback, omp_atv_null_fb },
+ { omp_atk_access, omp_atv_cgroup } };
omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
- 1, traits);
+ 2, traits);
int size = 16;
new file mode 100644
@@ -0,0 +1,66 @@
+/* { dg-do run } */
+
+/* { dg-require-effective-target offload_device } */
+/* { dg-xfail-if "not implemented" { ! offload_target_nvptx } } */
+
+/* Test that GPU low-latency allocation is limited to team access. */
+
+#include <stddef.h>
+#include <omp.h>
+
+#pragma omp requires dynamic_allocators
+
+int
+main ()
+{
+ #pragma omp target
+ {
+ /* Ensure that the memory we get *is* low-latency with a null-fallback. */
+ omp_alloctrait_t traits[2]
+ = { { omp_atk_fallback, omp_atv_null_fb },
+ { omp_atk_access, omp_atv_cgroup } };
+ omp_allocator_handle_t lowlat = omp_init_allocator (omp_low_lat_mem_space,
+ 2, traits); // good
+
+ omp_alloctrait_t traits_all[2]
+ = { { omp_atk_fallback, omp_atv_null_fb },
+ { omp_atk_access, omp_atv_all } };
+ omp_allocator_handle_t lowlat_all
+ = omp_init_allocator (omp_low_lat_mem_space, 2, traits_all); // bad
+
+ omp_alloctrait_t traits_default[1]
+ = { { omp_atk_fallback, omp_atv_null_fb } };
+ omp_allocator_handle_t lowlat_default
+ = omp_init_allocator (omp_low_lat_mem_space, 1, traits_default); // bad
+
+ if (lowlat_all != omp_null_allocator
+ || lowlat_default != omp_null_allocator)
+ __builtin_abort ();
+
+ void *a = omp_alloc (1, lowlat); // good
+
+ if (!a)
+ __builtin_abort ();
+
+ omp_free (a, lowlat);
+
+
+ a = omp_calloc (1, 1, lowlat); // good
+
+ if (!a)
+ __builtin_abort ();
+
+ omp_free (a, lowlat);
+
+
+ a = omp_realloc (NULL, 1, lowlat, lowlat); // good
+
+ if (!a)
+ __builtin_abort ();
+
+ omp_free (a, lowlat);
+ }
+
+ return 0;
+}
+