Enable tpause Exponential backoff and thread delay

Message ID 20230802084314.965951-1-jun.zhang@intel.com
State Accepted
Headers
Series Enable tpause Exponential backoff and thread delay |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Li, Pan2 via Gcc-patches Aug. 2, 2023, 8:43 a.m. UTC
  There are two kinds of pause bottleneck: one in user space and one in
the kernel.  In user space, TPAUSE combined with exponential backoff
reduces the number of spin-loop iterations.  In the kernel, because tasks
start at the same time they usually reach the critical section at the same
time, which hurts performance; starting the threads one after another,
each after a short delay, avoids this.

include/ChangeLog:

	* localfn.h: New file; define RUNLOCALFN.

libgomp/ChangeLog:

	* config/linux/wait.h: Include spin.h.
	(do_spin): Move the polling loop into do_spin_for_count.
	* env.c (gomp_thread_delay_count): New variable.
	(initialize_env): Parse GOMP_DELAYCOUNT into it, defaulting to 300.
	* libgomp.h (gomp_thread_delay_count): Declare.
	* team.c: Include localfn.h.
	(gomp_thread_start): Call local_fn through RUNLOCALFN.
	* config/linux/spin.h: New header file.
	* config/linux/x86/localfn.h: New file; implement the thread delay.
	* config/linux/x86/mutex.c: New file; implement TPAUSE backoff.
	* config/linux/x86/spin.h: New header file.
---
 include/localfn.h                  |  6 +++
 libgomp/config/linux/spin.h        | 12 ++++++
 libgomp/config/linux/wait.h        | 11 ++---
 libgomp/config/linux/x86/localfn.h | 19 +++++++++
 libgomp/config/linux/x86/mutex.c   | 66 ++++++++++++++++++++++++++++++
 libgomp/config/linux/x86/spin.h    |  5 +++
 libgomp/env.c                      |  4 ++
 libgomp/libgomp.h                  |  1 +
 libgomp/team.c                     |  8 ++--
 9 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 include/localfn.h
 create mode 100644 libgomp/config/linux/spin.h
 create mode 100644 libgomp/config/linux/x86/localfn.h
 create mode 100644 libgomp/config/linux/x86/mutex.c
 create mode 100644 libgomp/config/linux/x86/spin.h
  

Patch

diff --git a/include/localfn.h b/include/localfn.h
new file mode 100644
index 00000000000..998e6554aec
--- /dev/null
+++ b/include/localfn.h
@@ -0,0 +1,6 @@ 
+#define RUNLOCALFN(a, b, c)  /* Generic: just call A (B); C is unused.  */ \
+  do \
+    { \
+      a (b); \
+    } \
+  while (0)
diff --git a/libgomp/config/linux/spin.h b/libgomp/config/linux/spin.h
new file mode 100644
index 00000000000..ad8eba275ed
--- /dev/null
+++ b/libgomp/config/linux/spin.h
@@ -0,0 +1,12 @@ 
+/* Poll *ADDR up to COUNT times: 0 if it stopped equalling VAL, else 1.  */
+static inline int
+do_spin_for_count (int *addr, int val, unsigned long long count)
+{
+  for (unsigned long long left = count; left != 0; left--)
+    if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) == val, 1))
+      cpu_relax ();
+    else
+      return 0;
+  return 1;
+}
+
diff --git a/libgomp/config/linux/wait.h b/libgomp/config/linux/wait.h
index 29d745f7141..17b7ef11c96 100644
--- a/libgomp/config/linux/wait.h
+++ b/libgomp/config/linux/wait.h
@@ -44,21 +44,16 @@ 
 extern int gomp_futex_wait, gomp_futex_wake;
 
 #include <futex.h>
-
+#include <spin.h>
 static inline int do_spin (int *addr, int val)
 {
-  unsigned long long i, count = gomp_spin_count_var;
+  unsigned long long count = gomp_spin_count_var;
 
   if (__builtin_expect (__atomic_load_n (&gomp_managed_threads,
                                          MEMMODEL_RELAXED)
                         > gomp_available_cpus, 0))
     count = gomp_throttled_spin_count_var;
-  for (i = 0; i < count; i++)
-    if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val, 0))
-      return 0;
-    else
-      cpu_relax ();
-  return 1;
+  return do_spin_for_count (addr, val, count);
 }
 
 static inline void do_wait (int *addr, int val)
diff --git a/libgomp/config/linux/x86/localfn.h b/libgomp/config/linux/x86/localfn.h
new file mode 100644
index 00000000000..379aced99ee
--- /dev/null
+++ b/libgomp/config/linux/x86/localfn.h
@@ -0,0 +1,19 @@ 
+#ifdef __x86_64__
+/* Execute COUNT * gomp_thread_delay_count PAUSEs; the bound is read once.  */
+static inline void
+gomp_thread_delay (unsigned int count)
+{
+  for (unsigned long long left = count * gomp_thread_delay_count; left; left--)
+    __builtin_ia32_pause ();
+}
+
+#define RUNLOCALFN(a, b, c)  /* Call gomp_thread_delay (C), then A (B).  */ \
+  do \
+    { \
+      gomp_thread_delay(c); \
+      a (b); \
+    } \
+  while (0)
+#else /* !__x86_64__ */
+# include "../../../../include/localfn.h" /* RUNLOCALFN without a delay.  */
+#endif /* __x86_64__ */
diff --git a/libgomp/config/linux/x86/mutex.c b/libgomp/config/linux/x86/mutex.c
new file mode 100644
index 00000000000..5a14efb522e
--- /dev/null
+++ b/libgomp/config/linux/x86/mutex.c
@@ -0,0 +1,66 @@ 
+#include "../mutex.c"
+
+#ifdef __x86_64__
+/* Portable fallback used when the CPU lacks WAITPKG: poll *ADDR at most
+   COUNT times.  Return 0 once it differs from VAL, 1 if it never did.  */
+static inline int
+do_spin_for_count_generic (int *addr, int val, unsigned long long count)
+{
+  for (unsigned long long left = count; left != 0; left--)
+    if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) == val, 1))
+      cpu_relax ();
+    else
+      return 0;
+  return 1;
+}
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+/* Scale factor applied to COUNT when sizing the TPAUSE backoff budget.  */
+#define PAUSE_TP 200
+
+/* Wait for *ADDR to differ from VAL using TPAUSE with timeouts that grow
+   as 1, 3, 7, ... ticks past the current TSC.  Return 0 once the value
+   changed, or 1 when the timeout reaches the top bit of COUNT * PAUSE_TP.
+   A zero COUNT returns 1 at once, like the generic loop: BSR of zero is
+   undefined.  An overflowing product saturates instead of wrapping.  */
+static inline int
+do_spin_for_backoff_tpause (int *addr, int val, unsigned long long count)
+{
+  unsigned long long budget, limit_bit, wait_time = 1;
+
+  if (count == 0)
+    return 1;
+  if (__builtin_mul_overflow (count, PAUSE_TP, &budget))
+    budget = ~0ULL;
+  limit_bit = 1ULL << __builtin_ia32_bsrdi (budget);
+  do
+    {
+      __builtin_ia32_tpause (1, wait_time + __builtin_ia32_rdtsc ());
+      wait_time = (wait_time << 1) | 1;
+      if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_RELAXED) != val,
+			    0))
+	return 0;
+    }
+  while ((wait_time & limit_bit) == 0);
+  return 1;
+}
+
+#ifdef __DISABLE_WAITPKG__
+#undef __DISABLE_WAITPKG__
+#pragma GCC pop_options
+#endif /* __DISABLE_WAITPKG__ */
+
+/* Spin on *ADDR: TPAUSE backoff if the CPU reports WAITPKG, else polling.  */
+int do_spin_for_count (int *addr, int val, unsigned long long count)
+{
+  return (__builtin_cpu_supports ("waitpkg")
+	  ? do_spin_for_backoff_tpause (addr, val, count)
+	  : do_spin_for_count_generic (addr, val, count));
+}
+
+#endif
diff --git a/libgomp/config/linux/x86/spin.h b/libgomp/config/linux/x86/spin.h
new file mode 100644
index 00000000000..fb8529af026
--- /dev/null
+++ b/libgomp/config/linux/x86/spin.h
@@ -0,0 +1,5 @@ 
+#ifdef __x86_64__ /* x86-64: out-of-line version defined in x86/mutex.c.  */
+extern int do_spin_for_count (int *, int, unsigned long long) ;
+#else /* !__x86_64__: fall back to the inline polling loop.  */
+# include "../spin.h"
+#endif /* __x86_64__ */
diff --git a/libgomp/env.c b/libgomp/env.c
index f24484d7f70..6a96a4b0df1 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -106,6 +106,7 @@  gomp_mutex_t gomp_managed_threads_lock;
 #endif
 unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
 unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+unsigned long long gomp_thread_delay_count;
 unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
 char *gomp_bind_var_list;
 unsigned long gomp_bind_var_list_len;
@@ -2419,6 +2420,9 @@  initialize_env (void)
   else if (all != NULL && gomp_get_icv_flag (all->flags, GOMP_ICV_WAIT_POLICY))
     wait_policy = all->icvs.wait_policy;
 
+  if (!parse_spincount ("GOMP_DELAYCOUNT", &gomp_thread_delay_count))
+    gomp_thread_delay_count = 300;
+
   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
     {
       /* Using a rough estimation of 100000 spins per msec,
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 4d2bfab4b71..c3ccf247f6c 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -596,6 +596,7 @@  extern bool gomp_cancel_var;
 extern enum gomp_target_offload_t gomp_target_offload_var;
 extern int gomp_max_task_priority_var;
 extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
+extern unsigned long long gomp_thread_delay_count;
 extern unsigned long gomp_available_cpus, gomp_managed_threads;
 extern unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
 extern char *gomp_bind_var_list;
diff --git a/libgomp/team.c b/libgomp/team.c
index 54dfca8080a..2a5aff72654 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -30,6 +30,7 @@ 
 #include "pool.h"
 #include <stdlib.h>
 #include <string.h>
+#include "localfn.h"
 
 #ifdef LIBGOMP_USE_PTHREADS
 pthread_attr_t gomp_thread_attr;
@@ -62,7 +63,6 @@  struct gomp_thread_start_data
   pthread_t handle;
 };
 
-
 /* This function is a pthread_create entry point.  This contains the idle
    loop in which a thread waits to be called up to become part of a team.  */
 
@@ -111,7 +111,8 @@  gomp_thread_start (void *xdata)
 
       gomp_barrier_wait (&team->barrier);
 
-      local_fn (local_data);
+      RUNLOCALFN(local_fn, local_data, thr->ts.team_id);
+
       gomp_team_barrier_wait_final (&team->barrier);
       gomp_finish_task (task);
       gomp_barrier_wait_last (&team->barrier);
@@ -126,7 +127,8 @@  gomp_thread_start (void *xdata)
 	  struct gomp_team *team = thr->ts.team;
 	  struct gomp_task *task = thr->task;
 
-	  local_fn (local_data);
+	  RUNLOCALFN(local_fn, local_data, thr->ts.team_id);
+
 	  gomp_team_barrier_wait_final (&team->barrier);
 	  gomp_finish_task (task);