I changed it to use 128-byte alignment to match the GPU cache-lines.
Committed to OG12.
Andrew
On 11/01/2023 18:05, Andrew Stubbs wrote:
> This patch fixes a runtime issue I encountered with the AMD GCN Unified
> Shared Memory implementation.
>
> We were using regular malloc'd memory configured into USM mode, but
> there were random intermittent crashes. I can't be completely sure, but
> my best guess is that the HSA driver is using malloc internally from the
> same heap, and therefore using memory on the same page as the offload
> kernel. What I do know is that I could make the crashes go away by
> simply padding the USM allocations before and after.
>
> With this patch USM allocations are now completely separated from the
> system heap. The custom allocator is probably less optimal in some
> use-cases, but does have the advantage that all the metadata is stored
> in a side-table that won't ever cause any pages to migrate back to
> main-memory unnecessarily (see the sketch after the quoted message).
> It's still possible for the user program to use USM memory in a way
> that causes it to thrash, and this might have been the ultimate cause
> of the crashes, but there's not much we can do about that here.
>
> I've broken the allocator out into a new file because I anticipate it
> being needed in more than one place, but I didn't put full
> data-isolation on it yet.
>
> I'll rebase, merge, and repost all of the OpenMP memory patches sometime
> soonish.
>
> Andrew
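The "side-table" property mentioned above is the key design point: a
conventional allocator keeps its metadata in a header inside each block,
so merely walking the free chain touches the USM pages and can migrate
them back to the host. A minimal sketch of the contrast, using
hypothetical struct names rather than the patch's real types:

  /* Conventional: the header lives inside the managed block, so reading
     it faults the (possibly GPU-resident) USM page back to the host.  */
  struct inline_header
  {
    size_t size;
    struct inline_header *next;   /* Next free block, also in USM.  */
  };

  /* Side-table (the approach in this patch): the node is ordinary
     malloc'd host memory that merely points at the USM range, so the
     bookkeeping never touches the USM pages themselves.  */
  struct side_node
  {
    void *base;                   /* Start of the USM range.  */
    size_t size;                  /* Length of the range.  */
  };

The real implementation keeps these side nodes in splay trees
(usm_allocations and usm_free_space in the new usm-allocator.c below).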
libgomp, amdgcn: Switch USM to 128-byte alignment
This should improve cache-line utilization on the AMD GPUs.
libgomp/ChangeLog:
* usm-allocator.c (ALIGN): Use 128-byte alignment.
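For reference, the new macro rounds any size up to the next multiple of
128 by adding 127 and masking off the low seven bits. A self-contained
check of the arithmetic (not part of the patch):

  #include <assert.h>

  #define ALIGN(VAR) (((VAR) + 127) & ~127)

  int
  main (void)
  {
    assert (ALIGN (1) == 128);    /* Small requests grow to one cache-line.  */
    assert (ALIGN (128) == 128);  /* Exact multiples are unchanged.  */
    assert (ALIGN (129) == 256);  /* Otherwise round up to the next line.  */
    return 0;
  }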
diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
index c45109169ca..68c1ebafec2 100644
--- a/libgomp/usm-allocator.c
+++ b/libgomp/usm-allocator.c
@@ -57,7 +57,8 @@ static int usm_lock = 0;
static struct usm_splay_tree_s usm_allocations = { NULL };
static struct usm_splay_tree_s usm_free_space = { NULL };
-#define ALIGN(VAR) (((VAR) + 7) & ~7) /* 8-byte granularity. */
+/* 128-byte granularity means GPU cache-line aligned. */
+#define ALIGN(VAR) (((VAR) + 127) & ~127)
/* Coalesce contiguous free space into one entry. This considers the entries
either side of the root node only, so it should be called each time a new
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
@@ -48,6 +48,8 @@
#include "oacc-plugin.h"
#include "oacc-int.h"
#include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
/* These probably won't be in elf.h for a while. */
#ifndef R_AMDGPU_NONE
@@ -3071,6 +3073,102 @@ wait_queue (struct goacc_asyncqueue *aq)
}
/* }}} */
+/* {{{ Unified Shared Memory
+
+ Normal heap memory is already enabled for USM, but by default it is "fine-
+ grained" memory, meaning that the GPU must access it via the system bus,
+ slowly. Changing the page to "coarse-grained" mode means that the page
+ is migrated on-demand and can therefore be accessed quickly by both CPU and
+ GPU (although care should be taken to prevent thrashing the page back and
+ forth).
+
+ GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
+ the initial location is GPU memory; GOMP_OFFLOAD_usm_alloc returns system
+   memory configured coarse-grained.
+
+ The USM memory space is allocated as a largish block and then subdivided
+ via a custom allocator. (It would be possible to reconfigure regular
+ "malloc'd" memory, but if it ends up on the same page as memory used by
+ the HSA driver then bad things happen.) */
+
+#include "../usm-allocator.c"
+
+/* Record a list of the memory blocks configured for USM. */
+static struct usm_heap_pages {
+ void *start;
+ void *end;
+ struct usm_heap_pages *next;
+} *usm_heap_pages = NULL;
+
+/* Initialize or extend the USM memory space. This is called whenever
+ allocation fails. SIZE is the minimum size required for the failed
+ allocation to succeed; the function may choose a larger size.
+ Note that Linux lazy allocation means that the memory returned isn't
+   guaranteed to actually exist. */
+
+static bool
+usm_heap_create (size_t size)
+{
+ static int lock = 0;
+ while (__atomic_exchange_n (&lock, 1, MEMMODEL_ACQUIRE) != 0)
+ ;
+
+ size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */
+ if (size < default_size)
+ size = default_size;
+
+ /* Round up to a whole page. */
+ int pagesize = getpagesize ();
+ int misalignment = size % pagesize;
+ if (misalignment > 0)
+ size += pagesize - misalignment;
+
+ /* Try to get contiguous memory, but it might not be possible.
+ The most recent previous allocation is at the head of the list. */
+ void *addrhint = (usm_heap_pages ? usm_heap_pages->end : NULL);
+ void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (new_pages == MAP_FAILED)
+ {
+ GCN_DEBUG ("Could not allocate Unified Shared Memory heap.");
+ __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+ return false;
+ }
+
+ /* Register the heap allocation as coarse grained, which implies USM. */
+ struct hsa_amd_svm_attribute_pair_s attr = {
+ HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
+ HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
+ };
+ hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size,
+ &attr, 1);
+ if (status != HSA_STATUS_SUCCESS)
+ GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
+ " please update your drivers and/or kernel");
+
+ if (new_pages == addrhint)
+ {
+ /* We got contiguous pages, so we don't need extra list entries. */
+ usm_heap_pages->end += size;
+ }
+ else
+ {
+ /* We need a new list entry to record a discontiguous range. */
+ struct usm_heap_pages *page = malloc (sizeof (*page));
+ page->start = new_pages;
+ page->end = new_pages + size;
+ page->next = usm_heap_pages;
+ usm_heap_pages = page;
+ }
+
+ /* Add the new memory into the allocator's free table. */
+ usm_register_memory (new_pages, size);
+
+ __atomic_store_n (&lock, 0, MEMMODEL_RELEASE);
+ return true;
+}
+
+/* }}} */
/* {{{ OpenACC support */
/* Execute an OpenACC kernel, synchronously or asynchronously. */
@@ -3905,74 +4003,24 @@ GOMP_OFFLOAD_evaluate_device (int device_num, const char *kind,
return !isa || isa_code (isa) == agent->device_isa;
}
-/* Use a splay tree to track USM allocations. */
-typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
-typedef struct usm_splay_tree_s *usm_splay_tree;
-typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
-
-struct usm_splay_tree_key_s {
- void *addr;
- size_t size;
-};
-
-static inline int
-usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
-{
- if ((x->addr <= y->addr && x->addr + x->size > y->addr)
- || (y->addr <= x->addr && y->addr + y->size > x->addr))
- return 0;
-
- return (x->addr > y->addr ? 1 : -1);
-}
-
-#define splay_tree_prefix usm
-#include "../splay-tree.h"
-
-static struct usm_splay_tree_s usm_map = { NULL };
-
-/* Allocate memory suitable for Unified Shared Memory.
-
- Normal heap memory is already enabled for USM, but by default it is "fine-
- grained" memory, meaning that the GPU must access it via the system bus,
- slowly. Changing the page to "coarse-grained" mode means that the page
- is migrated on-demand and can therefore be accessed quickly by both CPU and
- GPU (although care should be taken to prevent thrashing the page back and
- forth).
-
- GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case
- the initial location is GPU memory; this function returns system memory.
-
- We record and track allocations so that GOMP_OFFLOAD_is_usm_ptr can look
- them up. */
+/* Allocate memory suitable for Unified Shared Memory. */
void *
GOMP_OFFLOAD_usm_alloc (int device, size_t size)
{
- void *ptr = malloc (size);
- if (!ptr || !hsa_fns.hsa_amd_svm_attributes_set_fn)
- return ptr;
-
- /* Register the heap allocation as coarse grained, which implies USM. */
- struct hsa_amd_svm_attribute_pair_s attr = {
- HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG,
- HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED
- };
- hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (ptr, size,
- &attr, 1);
- if (status != HSA_STATUS_SUCCESS)
- GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;"
- " please update your drivers and/or kernel");
-
- /* Record the allocation for GOMP_OFFLOAD_is_usm_ptr. */
- usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
- node->key.addr = ptr;
- node->key.size = size;
- node->left = NULL;
- node->right = NULL;
- usm_splay_tree_insert (&usm_map, node);
-
- return ptr;
+ while (1)
+ {
+ void *result = usm_alloc (size);
+ if (result)
+ return result;
+
+ /* Allocation failed. Try again if we can create a new heap block.
+ Note: it's possible another thread could get to the new memory
+ first, so the while loop is necessary. */
+ if (!usm_heap_create (size))
+ return NULL;
+ }
}
/* Free memory allocated via GOMP_OFFLOAD_usm_alloc. */
@@ -3980,15 +4028,7 @@ GOMP_OFFLOAD_usm_alloc (int device, size_t size)
bool
GOMP_OFFLOAD_usm_free (int device, void *ptr)
{
- struct usm_splay_tree_key_s key = { ptr, 1 };
- usm_splay_tree_key node = usm_splay_tree_lookup (&usm_map, &key);
- if (node)
- {
- usm_splay_tree_remove (&usm_map, &key);
- free (node);
- }
-
- free (ptr);
+ usm_free (ptr);
return true;
}
@@ -3997,8 +4037,10 @@ GOMP_OFFLOAD_usm_free (int device, void *ptr)
bool
GOMP_OFFLOAD_is_usm_ptr (void *ptr)
{
- struct usm_splay_tree_key_s key = { ptr, 1 };
- return usm_splay_tree_lookup (&usm_map, &key);
+ for (struct usm_heap_pages *heap = usm_heap_pages; heap; heap = heap->next)
+    if (ptr >= heap->start && ptr < heap->end)
+ return true;
+ return false;
}
/* }}} */
@@ -4292,18 +4334,3 @@ GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
}
/* }}} */
-/* {{{ USM splay tree */
-
-/* Include this now so that splay-tree.c doesn't include it later. This
- avoids a conflict with splay_tree_prefix. */
-#include "libgomp.h"
-
-/* This allows splay-tree.c to call gomp_fatal in this context. The splay
- tree code doesn't use the variadic arguments right now. */
-#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
-
-/* Include the splay tree code inline, with the prefixes added. */
-#define splay_tree_prefix usm
-#define splay_tree_c
-#include "../splay-tree.h"
-/* }}} */
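For context, a user-level program exercising this allocator might look
like the following. This assumes, as elsewhere in this patch series,
that host allocations are redirected to GOMP_OFFLOAD_usm_alloc when
unified shared memory is enabled; the exact trigger (e.g. a
-foffload-memory=unified style option) is outside this patch:

  /* Hypothetical test case; the pragma is standard OpenMP, but how
     malloc is routed to the USM allocator depends on the rest of the
     patch series.  */
  #pragma omp requires unified_shared_memory
  #include <stdlib.h>

  int
  main (void)
  {
    int *p = malloc (sizeof (int));
    *p = 42;
  #pragma omp target   /* The device reads and writes the same page.  */
    *p += 1;
    int ok = (*p == 43);
    free (p);
    return !ok;
  }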
diff --git a/libgomp/usm-allocator.c b/libgomp/usm-allocator.c
new file mode 100644
@@ -0,0 +1,231 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This is a simple "malloc" implementation intended for use with Unified
+ Shared Memory. It allocates memory from a pool allocated and configured
+ by the device plugin, and is intended to be #included from there. It
+ keeps the allocated/free chain in a side-table (splay tree) to ensure that
+ the allocation routine does not migrate all the USM pages back into host
+ memory. */
+
+
+#include "libgomp.h"
+
+/* Use a splay tree to track USM allocations. */
+
+typedef struct usm_splay_tree_node_s *usm_splay_tree_node;
+typedef struct usm_splay_tree_s *usm_splay_tree;
+typedef struct usm_splay_tree_key_s *usm_splay_tree_key;
+
+struct usm_splay_tree_key_s {
+ void *base;
+ size_t size;
+};
+
+static inline int
+usm_splay_compare (usm_splay_tree_key x, usm_splay_tree_key y)
+{
+ return (x->base == y->base ? 0
+ : x->base > y->base ? 1
+ : -1);
+}
+#define splay_tree_prefix usm
+#include "splay-tree.h"
+
+static int usm_lock = 0;
+static struct usm_splay_tree_s usm_allocations = { NULL };
+static struct usm_splay_tree_s usm_free_space = { NULL };
+
+#define ALIGN(VAR) (((VAR) + 7) & ~7) /* 8-byte granularity. */
+
+/* Coalesce contiguous free space into one entry. This considers the entries
+ either side of the root node only, so it should be called each time a new
+   entry is inserted into the root. */
+
+static void
+usm_coalesce_free_space ()
+{
+ usm_splay_tree_node prev, next, node = usm_free_space.root;
+
+ for (prev = node->left; prev && prev->right; prev = prev->right)
+ ;
+ for (next = node->right; next && next->left; next = next->left)
+ ;
+
+ /* Coalesce adjacent free chunks. */
+ if (next
+ && node->key.base + node->key.size == next->key.base)
+ {
+ /* Free chunk follows. */
+ node->key.size += next->key.size;
+ usm_splay_tree_remove (&usm_free_space, &next->key);
+ free (next);
+ }
+ if (prev
+ && prev->key.base + prev->key.size == node->key.base)
+ {
+ /* Free chunk precedes. */
+ prev->key.size += node->key.size;
+ usm_splay_tree_remove (&usm_free_space, &node->key);
+ free (node);
+ }
+}
+
+/* Add a new memory region into the free chain. This is how the USM heap is
+ initialized and extended. If the new region is contiguous with an existing
+ region then any free space will be coalesced. */
+
+static void
+usm_register_memory (char *base, size_t size)
+{
+ if (base == NULL)
+ return;
+
+ while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+ ;
+
+ usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s));
+ node->key.base = base;
+ node->key.size = size;
+ node->left = NULL;
+ node->right = NULL;
+ usm_splay_tree_insert (&usm_free_space, node);
+  usm_coalesce_free_space ();
+
+ __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+/* This splay_tree_foreach callback selects the first free space large enough
+ to hold the allocation needed. Since the splay_tree walk may start in the
+ middle the "first" isn't necessarily the "leftmost" entry. */
+
+struct usm_callback_data {
+ size_t size;
+ usm_splay_tree_node found;
+};
+
+static int
+usm_alloc_callback (usm_splay_tree_key key, void *data)
+{
+ struct usm_callback_data *cbd = (struct usm_callback_data *)data;
+
+ if (key->size >= cbd->size)
+ {
+ cbd->found = (usm_splay_tree_node)key;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* USM "malloc". Selects and moves an address range from usm_free_space to
+ usm_allocations, while leaving any excess in usm_free_space. */
+
+static void *
+usm_alloc (size_t size)
+{
+ /* Memory is allocated in N-byte granularity. */
+ size = ALIGN (size);
+
+ /* Acquire the lock. */
+ while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+ ;
+
+ if (!usm_free_space.root)
+ {
+ /* No memory registered, or no free space. */
+ __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+ return NULL;
+ }
+
+ /* Find a suitable free block. */
+ struct usm_callback_data cbd = {size, NULL};
+ usm_splay_tree_foreach_lazy (&usm_free_space, usm_alloc_callback, &cbd);
+ usm_splay_tree_node freenode = cbd.found;
+
+ void *result = NULL;
+ if (freenode)
+ {
+ /* Allocation successful. */
+ result = freenode->key.base;
+ usm_splay_tree_node allocnode = malloc (sizeof (*allocnode));
+ allocnode->key.base = result;
+ allocnode->key.size = size;
+ allocnode->left = NULL;
+ allocnode->right = NULL;
+ usm_splay_tree_insert (&usm_allocations, allocnode);
+
+ /* Update the free chain. */
+ size_t stillfree_size = freenode->key.size - size;
+ if (stillfree_size > 0)
+ {
+ freenode->key.base = freenode->key.base + size;
+ freenode->key.size = stillfree_size;
+ }
+ else
+ {
+ usm_splay_tree_remove (&usm_free_space, &freenode->key);
+ free (freenode);
+ }
+ }
+
+ /* Release the lock. */
+ __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+
+ return result;
+}
+
+/* USM "free". Moves an address range from usm_allocations to usm_free_space
+ and merges that record with any contiguous free memory. */
+
+static void
+usm_free (void *addr)
+{
+ /* Acquire the lock. */
+ while (__atomic_exchange_n (&usm_lock, 1, MEMMODEL_ACQUIRE) == 1)
+ ;
+
+ /* Convert the memory map to free. */
+ struct usm_splay_tree_key_s key = {addr};
+ usm_splay_tree_key found = usm_splay_tree_lookup (&usm_allocations, &key);
+ if (!found)
+ GOMP_PLUGIN_fatal ("invalid free");
+ usm_splay_tree_remove (&usm_allocations, &key);
+ usm_splay_tree_insert (&usm_free_space, (usm_splay_tree_node)found);
+ usm_coalesce_free_space ();
+
+ /* Release the lock. */
+ __atomic_store_n (&usm_lock, 0, MEMMODEL_RELEASE);
+}
+
+#undef ALIGN
+
+/* This allows splay-tree.c to call gomp_fatal in this context. The splay
+ tree code doesn't use the variadic arguments right now. */
+#define gomp_fatal(MSG, ...) GOMP_PLUGIN_fatal (MSG)
+
+/* Include the splay tree code inline, with the prefixes added. */
+#define splay_tree_prefix usm
+#define splay_tree_c
+#include "splay-tree.h"
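For completeness, the intended calling pattern from a plugin is roughly
as follows; get_usm_pages is a hypothetical stand-in for the
mmap-plus-SVM-attributes setup that plugin-gcn.c actually performs in
usm_heap_create:

  /* Sketch of plugin-side usage; get_usm_pages is hypothetical.  */
  char *pool = get_usm_pages (1 << 20);

  /* Seed the allocator's free chain with the new range.  */
  usm_register_memory (pool, 1 << 20);

  /* Carve allocations out of the pool, and return them when done.  */
  void *p = usm_alloc (4096);
  if (p)
    usm_free (p);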