Ping: [PATCH] enable ATOMIC_COMPARE_EXCHANGE opt for floating types or types containing padding

Message ID: CAJ=gGT2zjNN6Pf88rkKw5c0P0k4McCeYpkKO9-VCE1bvB4tmAg@mail.gmail.com
State: Unresolved
Checks

snail/gcc-patch-check: warning (git am fail log)

Commit Message

xndcn Jan. 3, 2024, 3:42 p.m. UTC
  Hi, I am new to this, and I really need your advice, thanks.

I noticed PR71716 and I want to enable the ATOMIC_COMPARE_EXCHANGE
internal-fn optimization for floating types or types containing
padding (e.g., long double). Please correct me if I have made any
mistakes, thanks!

First, regarding the concern about sNaN float/double values: it seems
to work well, and that case should already be covered by
testsuite/gcc.dg/atomic/c11-atomic-exec-5.c.
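
For reference, the pattern being optimized is the compare-exchange
loop the C front end emits for compound assignments on _Atomic
floating types. A minimal example in the spirit of that testcase
(illustrative only, not the actual testcase):

#include <stdatomic.h>

_Atomic double counter;

void
bump (double x)
{
  /* This expands into a load plus a __atomic_compare_exchange loop on
     a local "expected" var; the internal-fn optimization lets the loop
     use ATOMIC_COMPARE_EXCHANGE on the raw bits instead of spilling
     expected to the stack.  */
  counter += x;
}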

Second, since the optimization is only enabled when the expected var
is addressable solely because of the call, the padding bits cannot be
modified by any other stmts. So we can save all the bits after the
ATOMIC_COMPARE_EXCHANGE call and extract the padding bits from them;
from the second iteration on, the extracted padding bits can be mixed
back into the expected var.
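
In C terms, the emitted GIMPLE computes roughly the following. This is
a sketch assuming x86-64 long double, i.e. 80 value bits in a 128-bit
container (unsigned __int128 is a GNU C extension):

typedef unsigned __int128 u128;

/* Mirrors the two tmp vars the patch initializes in the entry block.  */
static u128 allbits;               /* all bits of the last CAS result */
static u128 maskbits = ~(u128) 0;  /* all ones before the first CAS */

static u128
mix (u128 expected_bits)
{
  u128 g = expected_bits & maskbits;   /* no-op on the first iteration */
  maskbits = (((u128) 1) << 80) - 1;   /* value-bit mask of the payload */
  u128 padding = allbits & ~maskbits;  /* padding of the last CAS result */
  return g | padding;                  /* 0 is OR-ed in the first time */
}

/* After each ATOMIC_COMPARE_EXCHANGE, the full old value is saved:
   allbits = <all bits of the returned old value>;  */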

Bootstrapped/regtested on x86_64-linux.

I ran some benchmarks: there is a significant speedup for float/double
types, and no regression for the long double type.

Thanks,

xndcn


gcc/ChangeLog:

	* gimple-fold.cc (optimize_atomic_compare_exchange_p): Enable also
	for SCALAR_FLOAT_TYPE_P expected vars, or when TYPE_PRECISION
	differs from the mode's bitsize.
	(fold_builtin_atomic_compare_exchange): If TYPE_PRECISION differs
	from the mode's bitsize, keep track of all the bits and mix them
	with VIEW_CONVERT_EXPR<itype>(expected).

Signed-off-by: xndcn <xndchn@gmail.com>
---
 gcc/gimple-fold.cc | 77 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 71 insertions(+), 6 deletions(-)

   gsi_insert_before (gsi, g, GSI_SAME_STMT);
@@ -5362,6 +5359,67 @@ fold_builtin_atomic_compare_exchange (gimple_stmt_iterator *gsi)
 			       build1 (VIEW_CONVERT_EXPR, itype,
 				       gimple_assign_lhs (g)));
       gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+      // VIEW_CONVERT_EXPRs might not preserve all the bits.  See PR71716.
+      // So we have to keep track of all the bits here.
+      if (maybe_ne (TYPE_PRECISION (etype),
+		    GET_MODE_BITSIZE (TYPE_MODE (etype))))
+	{
+	  gimple_stmt_iterator cgsi
+	    = gsi_after_labels (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
+	  allbits = create_tmp_var (itype);
+	  // allbits is initialized to 0, which can be ignored the first time
+	  gimple *init_stmt
+	    = gimple_build_assign (allbits, build_int_cst (itype, 0));
+	  gsi_insert_before (&cgsi, init_stmt, GSI_SAME_STMT);
+	  tree maskbits = create_tmp_var (itype);
+	  // maskbits is initialized to all ones (0xFF...)
+	  init_stmt = gimple_build_assign (maskbits, build1 (BIT_NOT_EXPR,
+							     itype, allbits));
+	  gsi_insert_before (&cgsi, init_stmt, GSI_SAME_STMT);
+
+	  // g = g & maskbits
+	  g = gimple_build_assign (make_ssa_name (itype),
+				   build2 (BIT_AND_EXPR, itype,
+					   gimple_assign_lhs (g), maskbits));
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+
+	  gimple *def_mask = gimple_build_assign (
+	    make_ssa_name (itype),
+	    build2 (LSHIFT_EXPR, itype, build_int_cst (itype, 1),
+		    build_int_cst (itype, TYPE_PRECISION (etype))));
+	  gsi_insert_before (gsi, def_mask, GSI_SAME_STMT);
+	  def_mask = gimple_build_assign (make_ssa_name (itype),
+					  build2 (MINUS_EXPR, itype,
+						  gimple_assign_lhs (def_mask),
+						  build_int_cst (itype, 1)));
+	  gsi_insert_before (gsi, def_mask, GSI_SAME_STMT);
+	  // maskbits = (1 << TYPE_PRECISION (etype)) - 1
+	  def_mask = gimple_build_assign (maskbits, SSA_NAME,
+					  gimple_assign_lhs (def_mask));
+	  gsi_insert_before (gsi, def_mask, GSI_SAME_STMT);
+
+	  // paddingbits = (~maskbits) & allbits
+	  def_mask
+	    = gimple_build_assign (make_ssa_name (itype),
+				   build1 (BIT_NOT_EXPR, itype,
+					   gimple_assign_lhs (def_mask)));
+	  gsi_insert_before (gsi, def_mask, GSI_SAME_STMT);
+	  def_mask
+	    = gimple_build_assign (make_ssa_name (itype),
+				   build2 (BIT_AND_EXPR, itype, allbits,
+					   gimple_assign_lhs (def_mask)));
+	  gsi_insert_before (gsi, def_mask, GSI_SAME_STMT);
+
+	  // g = g | paddingbits, i.e.,
+	  // g = (VIEW_CONVERT_EXPR<itype>(expected) & maskbits)
+	  //       | (allbits &(~maskbits))
+	  g = gimple_build_assign (make_ssa_name (itype),
+				   build2 (BIT_IOR_EXPR, itype,
+					   gimple_assign_lhs (g),
+					   gimple_assign_lhs (def_mask)));
+	  gsi_insert_before (gsi, g, GSI_SAME_STMT);
+	}
     }
   int flag = (integer_onep (gimple_call_arg (stmt, 3)) ? 256 : 0)
 	     + int_size_in_bytes (itype);
@@ -5410,6 +5468,13 @@ fold_builtin_atomic_compare_exchange (gimple_stmt_iterator *gsi)
     gsi_insert_after (gsi, g, GSI_NEW_STMT);
   if (!useless_type_conversion_p (TREE_TYPE (expected), itype))
     {
+      // Save all the bits here.
+      if (maybe_ne (TYPE_PRECISION (etype),
+		    GET_MODE_BITSIZE (TYPE_MODE (etype))))
+	{
+	  g = gimple_build_assign (allbits, SSA_NAME, gimple_assign_lhs (g));
+	  gsi_insert_after (gsi, g, GSI_NEW_STMT);
+	}
       g = gimple_build_assign (make_ssa_name (TREE_TYPE (expected)),
 			       VIEW_CONVERT_EXPR,
 			       build1 (VIEW_CONVERT_EXPR, TREE_TYPE (expected),
  

Comments

Jakub Jelinek Jan. 3, 2024, 3:52 p.m. UTC | #1
On Wed, Jan 03, 2024 at 11:42:58PM +0800, xndcn wrote:
> Hi, I am new to this, and I really need your advice, thanks.
> 
> I noticed PR71716 and I want to enable the ATOMIC_COMPARE_EXCHANGE
> internal-fn optimization for floating types or types containing
> padding (e.g., long double). Please correct me if I have made any
> mistakes, thanks!
> 
> First, regarding the concern about sNaN float/double values: it seems
> to work well, and that case should already be covered by
> testsuite/gcc.dg/atomic/c11-atomic-exec-5.c.
> 
> Second, since the optimization is only enabled when the expected var
> is addressable solely because of the call, the padding bits cannot be
> modified by any other stmts. So we can save all the bits after the
> ATOMIC_COMPARE_EXCHANGE call and extract the padding bits from them;
> from the second iteration on, the extracted padding bits can be mixed
> back into the expected var.
> 
> Bootstrapped/regtested on x86_64-linux.
> 
> I ran some benchmarks: there is a significant speedup for float/double
> types, and no regression for the long double type.

If anything, this should be using clear_padding_type_may_have_padding_p
and a call to __builtin_clear_padding.
Code in that file uses /* ... */ style comments, so please use them instead
of // for consistency, and furthermore comments should be terminated with
a dot and two spaces before */

Also, I don't think this is late stage3 material, so needs to wait for GCC
15.

	Jakub
  
xndcn Jan. 4, 2024, 4:19 p.m. UTC | #2
Thanks! I am trying to rewrite it by calling __builtin_clear_padding. But
I found that gimple_fold_builtin_clear_padding only seems to work before
the SSA passes. Should I remove the assertion?

On the other hand, since ATOMIC_COMPARE_EXCHANGE only works for simple
register types, excluding vector and complex types, is it enough to
generate the mask with simple bit operations?
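
E.g., something along these lines (just a sketch, with unsigned int
standing in for itype and PREC for TYPE_PRECISION (etype)):

#define PREC 5  /* placeholder; must be less than the container width */

static unsigned int
extract_padding (unsigned int oldbits)
{
  unsigned int mask = (1u << PREC) - 1;  /* value-bit mask */
  return oldbits & ~mask;                /* padding bits only */
}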

Thanks!
xndcn

---
static bool
gimple_fold_builtin_clear_padding (gimple_stmt_iterator *gsi)
{ ...
  /* This should be folded during the lower pass.  */
  gcc_assert (!gimple_in_ssa_p (cfun) && cfun->cfg == NULL);


Jakub Jelinek <jakub@redhat.com> wrote on Wed, Jan 3, 2024 at 23:52:

> On Wed, Jan 03, 2024 at 11:42:58PM +0800, xndcn wrote:
> > Hi, I am new to this, and I really need your advice, thanks.
> >
> > I noticed PR71716 and I want to enable the ATOMIC_COMPARE_EXCHANGE
> > internal-fn optimization for floating types or types containing
> > padding (e.g., long double). Please correct me if I have made any
> > mistakes, thanks!
> >
> > First, regarding the concern about sNaN float/double values: it seems
> > to work well, and that case should already be covered by
> > testsuite/gcc.dg/atomic/c11-atomic-exec-5.c.
> >
> > Second, since the optimization is only enabled when the expected var
> > is addressable solely because of the call, the padding bits cannot be
> > modified by any other stmts. So we can save all the bits after the
> > ATOMIC_COMPARE_EXCHANGE call and extract the padding bits from them;
> > from the second iteration on, the extracted padding bits can be mixed
> > back into the expected var.
> >
> > Bootstrapped/regtested on x86_64-linux.
> >
> > I ran some benchmarks: there is a significant speedup for float/double
> > types, and no regression for the long double type.
>
> If anything, this should be using clear_padding_type_may_have_padding_p
> and a call to __builtin_clear_padding.
> Code in that file uses /* ... */ style comments, so please use them instead
> of // for consistency, and furthermore comments should be terminated with
> a dot and two spaces before */
>
> Also, I don't think this is late stage3 material, so needs to wait for GCC
> 15.
>
>         Jakub
>
>
  

Patch

diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index cb4b57250..321ff4f41 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -5306,12 +5306,7 @@  optimize_atomic_compare_exchange_p (gimple *stmt)
       || !auto_var_in_fn_p (TREE_OPERAND (expected, 0), current_function_decl)
       || TREE_THIS_VOLATILE (etype)
       || VECTOR_TYPE_P (etype)
-      || TREE_CODE (etype) == COMPLEX_TYPE
-      /* Don't optimize floating point expected vars, VIEW_CONVERT_EXPRs
-	 might not preserve all the bits.  See PR71716.  */
-      || SCALAR_FLOAT_TYPE_P (etype)
-      || maybe_ne (TYPE_PRECISION (etype),
-		   GET_MODE_BITSIZE (TYPE_MODE (etype))))
+      || TREE_CODE (etype) == COMPLEX_TYPE)
     return false;

   tree weak = gimple_call_arg (stmt, 3);
@@ -5350,8 +5345,10 @@ fold_builtin_atomic_compare_exchange (gimple_stmt_iterator *gsi)
   tree itype = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (parmt)));
   tree ctype = build_complex_type (itype);
   tree expected = TREE_OPERAND (gimple_call_arg (stmt, 1), 0);
+  tree etype = TREE_TYPE (expected);
   bool throws = false;
   edge e = NULL;
+  tree allbits = NULL_TREE;
   gimple *g = gimple_build_assign (make_ssa_name (TREE_TYPE (expected)),
 				   expected);