libgomp: Add no-target-region rev offload test + fix plugin-nvptx

Message ID 64661eda-7f5f-da60-894f-00f90f1def04@codesourcery.com
State Accepted
Headers
Series libgomp: Add no-target-region rev offload test + fix plugin-nvptx |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Tobias Burnus Nov. 24, 2022, 5:48 p.m. UTC
  The nvptx reverse-offload code mishandled the case that there was a reverse
offload function that isn't called inside a target region. In that case,
the linker did not include GOMP_target_ext and the global variable it uses.
But the plugin-nvptx.c code expected that the latter is present.

Found via sollve_vv's tests/5.0/requires/test_requires_reverse_offload.c which is
similar to the new testcase. (Albeit the 'if' and comments imply that the sollve_vv
author did not intend this.)

Solution: Handle it gracefully that the global variable does not exist - and
do this check first - and only when successful allocate dev->rev_data. If not,
deallocate rev_fn_table to disable reverse offload handling.

OK for mainline?

Tobias

PS: Admittedly, the nvptx code is not yet exercised as I still have to submit the
libgomp/target.c code handling the reverse offload (+ enabling requires reverse_offload
in plugin-nvptx.c). As it is obvious from this patch, the target.c patch is nearly but
not yet completely ready. - That patch passes the three sollve_vv testcases and also
the existing libgomp testcases, but some corner cases and more testcases are missing.
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Comments

Jakub Jelinek Nov. 25, 2022, 11:25 a.m. UTC | #1
On Thu, Nov 24, 2022 at 06:48:01PM +0100, Tobias Burnus wrote:
> libgomp: Add no-target-region rev offload test + fix plugin-nvptx
> 
> OpenMP permits that a 'target device(ancestor:1)' is called without being
> enclosed in a target region - using the current device (i.e. the host) in
> that case.  This commit adds a testcase for this.
> 
> In case of nvptx, the missing on-device 'GOMP_target_ext' call causes that
> it and also the associated on-device GOMP_REV_OFFLOAD_VAR variable are not
> linked in from nvptx's libgomp.a. Thus, handle the failing cuModuleGetGlobal
> gracefully by disabling reverse offload and assuming that the failure is fine.
> 
> libgomp/ChangeLog:
> 
> 	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int
> 	for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR
> 	as valid and the code having no reverse-offload code.
> 	* testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test.

Ok, thanks.

	Jakub
  

Patch

libgomp: Add no-target-region rev offload test + fix plugin-nvptx

OpenMP permits that a 'target device(ancestor:1)' is called without being
enclosed in a target region - using the current device (i.e. the host) in
that case.  This commit adds a testcase for this.

In case of nvptx, the missing on-device 'GOMP_target_ext' call causes that
it and also the associated on-device GOMP_REV_OFFLOAD_VAR variable are not
linked in from nvptx's libgomp.a. Thus, handle the failing cuModuleGetGlobal
gracefully by disabling reverse offload and assuming that the failure is fine.

libgomp/ChangeLog:

	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_load_image): Use unsigned int
	for 'i' to match 'fn_entries'; regard absent GOMP_REV_OFFLOAD_VAR
	as valid and the code having no reverse-offload code.
	* testsuite/libgomp.c-c++-common/reverse-offload-2.c: New test.

 libgomp/plugin/plugin-nvptx.c                      | 36 ++++++++++------
 .../libgomp.c-c++-common/reverse-offload-2.c       | 49 ++++++++++++++++++++++
 2 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 0768fca350b..e803f083591 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1390,7 +1390,8 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
   else if (rev_fn_table)
     {
       CUdeviceptr var;
-      size_t bytes, i;
+      size_t bytes;
+      unsigned int i;
       r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
 			     "$offload_func_table");
       if (r != CUDA_SUCCESS)
@@ -1413,12 +1414,11 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
 
   if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
     {
-      /* cuMemHostAlloc memory is accessible on the device, if unified-shared
-	 address is supported; this is assumed - see comment in
-	 nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.   */
-      CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
-			sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
-      CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
+      /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
+	 available but it might be not.  One reason could be: if the user code
+	 has 'omp target device(ancestor:1)' in pure hostcode, GOMP_target_ext
+	 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
+	 are not linked in.  */
       CUdeviceptr device_rev_offload_var;
       size_t device_rev_offload_size;
       CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
@@ -1426,11 +1426,23 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
 				      &device_rev_offload_size, module,
 				      XSTRING (GOMP_REV_OFFLOAD_VAR));
       if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error - GOMP_REV_OFFLOAD_VAR: %s", cuda_error (r));
-      r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
-			     sizeof (dp));
-      if (r != CUDA_SUCCESS)
-	GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+	{
+	  free (*rev_fn_table);
+	  *rev_fn_table = NULL;
+	}
+      else
+	{
+	  /* cuMemHostAlloc memory is accessible on the device, if
+	     unified-shared address is supported; this is assumed - see comment
+	     in nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. */
+	  CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
+			    sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
+	  CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
+	  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
+				 sizeof (dp));
+	  if (r != CUDA_SUCCESS)
+	    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
+	}
     }
 
   nvptx_set_clocktick (module, dev);
diff --git a/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c
new file mode 100644
index 00000000000..33bd38481bb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/reverse-offload-2.c
@@ -0,0 +1,49 @@ 
+/* { dg-do run }  */
+/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
+
+#pragma omp requires reverse_offload
+
+int
+main ()
+{
+  int A[10];
+  int y;
+
+  for (int i = 0; i < 10; i++)
+    A[i] = 2*i;
+
+  y = 42;
+
+  /* Pointlessly copy to the default device.  */
+  #pragma omp target data map(to: A)
+  {
+    /* Not enclosed in a target region (= i.e. running on the host); the
+       following is valid - it runs on the current device (= host).  */
+    #pragma omp target device ( ancestor:1 ) firstprivate(y) map(to: A)
+    {
+      if (y != 42)
+	__builtin_abort ();
+      for (int i = 0; i < 10; i++)
+	if (A[i] != 2*i)
+	  __builtin_abort ();
+      for (int i = 0; i < 10; i++)
+	if (A[i] != 2*i)
+	  A[i] = 4*i;
+      y = 31;
+    }
+
+    if (y != 42)
+      __builtin_abort ();
+    for (int i = 0; i < 10; i++)
+      if (A[i] != 2*i)
+	__builtin_abort ();
+  }
+
+  if (y != 42)
+    __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (A[i] != 2*i)
+      __builtin_abort ();
+
+  return 0;
+}