libgomp: Use libnuma for OpenMP's partition=nearest allocation trait
libgomp/ChangeLog:
* allocator.c: Add ifdef for LIBGOMP_USE_LIBNUMA.
(enum gomp_numa_memkind_kind): Renamed from gomp_memkind_kind;
add GOMP_MEMKIND_LIBNUMA.
(struct gomp_libnuma_data, gomp_init_libnuma, gomp_get_libnuma): New.
(omp_init_allocator): Handle partition=nearest with libnuma if avail.
(omp_aligned_alloc, omp_free, omp_aligned_calloc, omp_realloc): Add
numa_alloc_local (+ memset), numa_free, and numa_realloc calls as
needed.
* config/linux/allocator.c (LIBGOMP_USE_LIBNUMA): Define.
* libgomp.texi (Memory allocation): Renamed from 'Memory allocation
with libmemkind'; updated for libnuma usage.
libgomp/allocator.c | 202 +++++++++++++++++++++++++++++++++------
libgomp/config/linux/allocator.c | 1 +
libgomp/libgomp.texi | 22 ++++-
3 files changed, 195 insertions(+), 30 deletions(-)
@@ -31,13 +31,13 @@
#include "libgomp.h"
#include <stdlib.h>
#include <string.h>
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
#include <dlfcn.h>
#endif
#define omp_max_predefined_alloc omp_thread_mem_alloc
-enum gomp_memkind_kind
+enum gomp_numa_memkind_kind
{
GOMP_MEMKIND_NONE = 0,
#define GOMP_MEMKIND_KINDS \
@@ -50,7 +50,8 @@ enum gomp_memkind_kind
#define GOMP_MEMKIND_KIND(kind) GOMP_MEMKIND_##kind
GOMP_MEMKIND_KINDS,
#undef GOMP_MEMKIND_KIND
- GOMP_MEMKIND_COUNT
+ GOMP_MEMKIND_COUNT,
+ GOMP_MEMKIND_LIBNUMA = GOMP_MEMKIND_COUNT
};
struct omp_allocator_data
@@ -65,7 +66,7 @@ struct omp_allocator_data
unsigned int fallback : 8;
unsigned int pinned : 1;
unsigned int partition : 7;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
unsigned int memkind : 8;
#endif
#ifndef HAVE_SYNC_BUILTINS
@@ -81,6 +82,14 @@ struct omp_mem_header
void *pad;
};
+struct gomp_libnuma_data
+{
+ void *numa_handle;
+ void *(*numa_alloc_local) (size_t);
+ void *(*numa_realloc) (void *, size_t, size_t);
+ void (*numa_free) (void *, size_t);
+};
+
struct gomp_memkind_data
{
void *memkind_handle;
@@ -92,6 +101,50 @@ struct gomp_memkind_data
void **kinds[GOMP_MEMKIND_COUNT];
};
+#ifdef LIBGOMP_USE_LIBNUMA
+static struct gomp_libnuma_data *libnuma_data;
+static pthread_once_t libnuma_data_once = PTHREAD_ONCE_INIT;
+
+static void
+gomp_init_libnuma (void)
+{
+ void *handle = dlopen ("libnuma.so.1", RTLD_LAZY);
+ struct gomp_libnuma_data *data;
+
+ data = calloc (1, sizeof (struct gomp_libnuma_data));
+ if (data == NULL)
+ {
+ if (handle)
+ dlclose (handle);
+ return;
+ }
+ if (!handle)
+ {
+ __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+ return;
+ }
+ data->numa_handle = handle;
+ data->numa_alloc_local
+ = (__typeof (data->numa_alloc_local)) dlsym (handle, "numa_alloc_local");
+ data->numa_realloc
+ = (__typeof (data->numa_realloc)) dlsym (handle, "numa_realloc");
+ data->numa_free
+ = (__typeof (data->numa_free)) dlsym (handle, "numa_free");
+ __atomic_store_n (&libnuma_data, data, MEMMODEL_RELEASE);
+}
+
+static struct gomp_libnuma_data *
+gomp_get_libnuma (void)
+{
+ struct gomp_libnuma_data *data
+ = __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+ if (data)
+ return data;
+ pthread_once (&libnuma_data_once, gomp_init_libnuma);
+ return __atomic_load_n (&libnuma_data, MEMMODEL_ACQUIRE);
+}
+#endif
+
#ifdef LIBGOMP_USE_MEMKIND
static struct gomp_memkind_data *memkind_data;
static pthread_once_t memkind_data_once = PTHREAD_ONCE_INIT;
@@ -166,7 +219,7 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
struct omp_allocator_data data
= { memspace, 1, ~(uintptr_t) 0, 0, 0, omp_atv_contended, omp_atv_all,
omp_atv_default_mem_fb, omp_atv_false, omp_atv_environment,
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
GOMP_MEMKIND_NONE
#endif
};
@@ -285,8 +338,8 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
switch (memspace)
{
- case omp_high_bw_mem_space:
#ifdef LIBGOMP_USE_MEMKIND
+ case omp_high_bw_mem_space:
struct gomp_memkind_data *memkind_data;
memkind_data = gomp_get_memkind ();
if (data.partition == omp_atv_interleaved
@@ -300,17 +353,15 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
data.memkind = GOMP_MEMKIND_HBW_PREFERRED;
break;
}
-#endif
break;
case omp_large_cap_mem_space:
-#ifdef LIBGOMP_USE_MEMKIND
memkind_data = gomp_get_memkind ();
if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM_ALL])
data.memkind = GOMP_MEMKIND_DAX_KMEM_ALL;
else if (memkind_data->kinds[GOMP_MEMKIND_DAX_KMEM])
data.memkind = GOMP_MEMKIND_DAX_KMEM;
-#endif
break;
+#endif
default:
#ifdef LIBGOMP_USE_MEMKIND
if (data.partition == omp_atv_interleaved)
@@ -323,6 +374,14 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits,
break;
}
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (data.memkind == GOMP_MEMKIND_NONE && data.partition == omp_atv_nearest)
+ {
+ data.memkind = GOMP_MEMKIND_LIBNUMA;
+ libnuma_data = gomp_get_libnuma ();
+ }
+#endif
+
/* No support for this so far. */
if (data.pinned)
return omp_null_allocator;
@@ -357,8 +416,8 @@ omp_aligned_alloc (size_t alignment, size_t size,
struct omp_allocator_data *allocator_data;
size_t new_size, new_alignment;
void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
- enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+ enum gomp_numa_memkind_kind memkind;
#endif
if (__builtin_expect (size == 0, 0))
@@ -379,7 +438,7 @@ retry:
allocator_data = (struct omp_allocator_data *) allocator;
if (new_alignment < allocator_data->alignment)
new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = allocator_data->memkind;
#endif
}
@@ -388,8 +447,10 @@ retry:
allocator_data = NULL;
if (new_alignment < sizeof (void *))
new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
if (allocator == omp_high_bw_mem_alloc)
memkind = GOMP_MEMKIND_HBW_PREFERRED;
else if (allocator == omp_large_cap_mem_alloc)
@@ -444,6 +505,13 @@ retry:
allocator_data->used_pool_size = used_pool_size;
gomp_mutex_unlock (&allocator_data->lock);
#endif
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -469,6 +537,13 @@ retry:
}
else
{
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -502,7 +577,7 @@ fail:
{
case omp_atv_default_mem_fb:
if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|| memkind
#endif
|| (allocator_data
@@ -577,6 +652,16 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
gomp_mutex_unlock (&allocator_data->lock);
#endif
}
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (allocator_data->memkind == GOMP_MEMKIND_LIBNUMA)
+ {
+ libnuma_data->numa_free (data->ptr, data->size);
+ return;
+ }
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (allocator_data->memkind)
{
@@ -590,7 +675,7 @@ omp_free (void *ptr, omp_allocator_handle_t allocator)
#ifdef LIBGOMP_USE_MEMKIND
else
{
- enum gomp_memkind_kind memkind = GOMP_MEMKIND_NONE;
+ enum gomp_numa_memkind_kind memkind = GOMP_MEMKIND_NONE;
if (data->allocator == omp_high_bw_mem_alloc)
memkind = GOMP_MEMKIND_HBW_PREFERRED;
else if (data->allocator == omp_large_cap_mem_alloc)
@@ -625,8 +710,8 @@ omp_aligned_calloc (size_t alignment, size_t nmemb, size_t size,
struct omp_allocator_data *allocator_data;
size_t new_size, size_temp, new_alignment;
void *ptr, *ret;
-#ifdef LIBGOMP_USE_MEMKIND
- enum gomp_memkind_kind memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+ enum gomp_numa_memkind_kind memkind;
#endif
if (__builtin_expect (size == 0 || nmemb == 0, 0))
@@ -647,7 +732,7 @@ retry:
allocator_data = (struct omp_allocator_data *) allocator;
if (new_alignment < allocator_data->alignment)
new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = allocator_data->memkind;
#endif
}
@@ -656,8 +741,10 @@ retry:
allocator_data = NULL;
if (new_alignment < sizeof (void *))
new_alignment = sizeof (void *);
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
if (allocator == omp_high_bw_mem_alloc)
memkind = GOMP_MEMKIND_HBW_PREFERRED;
else if (allocator == omp_large_cap_mem_alloc)
@@ -714,6 +801,16 @@ retry:
allocator_data->used_pool_size = used_pool_size;
gomp_mutex_unlock (&allocator_data->lock);
#endif
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ {
+ ptr = libnuma_data->numa_alloc_local (new_size);
+ memset (ptr, '\0', new_size);
+ }
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -739,6 +836,16 @@ retry:
}
else
{
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ {
+ ptr = libnuma_data->numa_alloc_local (new_size);
+ memset (ptr, '\0', new_size);
+ }
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -772,7 +879,7 @@ fail:
{
case omp_atv_default_mem_fb:
if ((new_alignment > sizeof (void *) && new_alignment > alignment)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|| memkind
#endif
|| (allocator_data
@@ -815,8 +922,8 @@ omp_realloc (void *ptr, size_t size, omp_allocator_handle_t allocator,
size_t new_size, old_size, new_alignment, old_alignment;
void *new_ptr, *ret;
struct omp_mem_header *data;
-#ifdef LIBGOMP_USE_MEMKIND
- enum gomp_memkind_kind memkind, free_memkind;
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
+ enum gomp_numa_memkind_kind memkind, free_memkind;
#endif
if (__builtin_expect (ptr == NULL, 0))
@@ -841,15 +948,17 @@ retry:
allocator_data = (struct omp_allocator_data *) allocator;
if (new_alignment < allocator_data->alignment)
new_alignment = allocator_data->alignment;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = allocator_data->memkind;
#endif
}
else
{
allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
if (allocator == omp_high_bw_mem_alloc)
memkind = GOMP_MEMKIND_HBW_PREFERRED;
else if (allocator == omp_large_cap_mem_alloc)
@@ -865,15 +974,17 @@ retry:
if (free_allocator > omp_max_predefined_alloc)
{
free_allocator_data = (struct omp_allocator_data *) free_allocator;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
free_memkind = free_allocator_data->memkind;
#endif
}
else
{
free_allocator_data = NULL;
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
free_memkind = GOMP_MEMKIND_NONE;
+#endif
+#ifdef LIBGOMP_USE_MEMKIND
if (free_allocator == omp_high_bw_mem_alloc)
free_memkind = GOMP_MEMKIND_HBW_PREFERRED;
else if (free_allocator == omp_large_cap_mem_alloc)
@@ -953,6 +1064,19 @@ retry:
allocator_data->used_pool_size = used_pool_size;
gomp_mutex_unlock (&allocator_data->lock);
#endif
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ {
+ if (prev_size)
+ new_ptr = libnuma_data->numa_realloc (data->ptr, data->size,
+ new_size);
+ else
+ new_ptr = libnuma_data->numa_alloc_local (new_size);
+ }
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -1000,6 +1124,13 @@ retry:
&& (free_allocator_data == NULL
|| free_allocator_data->pool_size == ~(uintptr_t) 0))
{
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ new_ptr = libnuma_data->numa_realloc (data->ptr, data->size, new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -1021,6 +1152,13 @@ retry:
}
else
{
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ new_ptr = libnuma_data->numa_alloc_local (new_size);
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (memkind)
{
@@ -1060,6 +1198,16 @@ retry:
gomp_mutex_unlock (&free_allocator_data->lock);
#endif
}
+#ifdef LIBGOMP_USE_LIBNUMA
+ if (memkind == GOMP_MEMKIND_LIBNUMA)
+ {
+ libnuma_data->numa_free (data->ptr, data->size);
+ return ret;
+ }
+# ifdef LIBGOMP_USE_MEMKIND
+ else
+# endif
+#endif
#ifdef LIBGOMP_USE_MEMKIND
if (free_memkind)
{
@@ -1079,7 +1227,7 @@ fail:
{
case omp_atv_default_mem_fb:
if (new_alignment > sizeof (void *)
-#ifdef LIBGOMP_USE_MEMKIND
+#if defined(LIBGOMP_USE_MEMKIND) || defined(LIBGOMP_USE_LIBNUMA)
|| memkind
#endif
|| (allocator_data
@@ -31,6 +31,7 @@
#include "libgomp.h"
#if defined(PLUGIN_SUPPORT) && defined(LIBGOMP_USE_PTHREADS)
#define LIBGOMP_USE_MEMKIND
+#define LIBGOMP_USE_LIBNUMA
#endif
#include "../../allocator.c"
@@ -4584,7 +4584,7 @@ offloading devices (it's not clear if they should be):
@menu
* Implementation-defined ICV Initialization::
* OpenMP Context Selectors::
-* Memory allocation with libmemkind::
+* Memory allocation::
@end menu
@node Implementation-defined ICV Initialization
@@ -4631,8 +4631,8 @@ smaller number. On non-host devices, the value of the
@tab See @code{-march=} in ``Nvidia PTX Options''
@end multitable
-@node Memory allocation with libmemkind
-@section Memory allocation with libmemkind
+@node Memory allocation
+@section Memory allocation
For the memory spaces, the following applies:
@itemize
@@ -4656,6 +4656,22 @@ creating memory allocators requesting
@code{omp_large_cap_mem_space} the allocation will not be interleaved
@end itemize
+On Linux systems, where the @uref{https://github.com/numactl/numactl, numa
+library} (@code{libnuma.so.1}) is available at runtime, it is used when
+creating memory allocators requesting
+
+@itemize
+@item the partition trait @code{omp_atv_nearest}, except when the libmemkind
+library is available and the memory space is either
+@code{omp_large_cap_mem_space} or @code{omp_high_bw_mem_space}
+@end itemize
+
+Note that the numa library will round up the allocation size to a multiple of
+the system page size; therefore, consider using it only with large data or
+by sharing allocations by using the @code{pool_size} trait. Additionally,
+the numa library does not guarantee that for reallocations the same node will
+be used.
+
Additional notes:
@itemize
@item The @code{pinned} trait is unsupported.