tree-optimization/111228 - combine two VEC_PERM_EXPRs

Message ID 20230830115447.3E5611353E@imap2.suse-dmz.suse.de
State Accepted
Headers
Series tree-optimization/111228 - combine two VEC_PERM_EXPRs |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

Richard Biener Aug. 30, 2023, 11:54 a.m. UTC
  The following adds simplification of two VEC_PERM_EXPRs where
the later one replaces all elements from either the first or the
second input of the earlier permute.  This allows a three input
permute to be simplified to a two input one.

I'm following the existing two input simplification case and only
allow non-VLA permutes.  The now existing three cases and the
single case in tree-ssa-forwprop.cc somehow ask for merging,
I'm not doing this as part of this change though.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

	PR tree-optimization/111228
	* match.pd ((vec_perm (vec_perm ..) @5 ..) -> (vec_perm @x @5 ..)):
	New simplifications.

	* gcc.dg/tree-ssa/forwprop-42.c: New testcase.
---
 gcc/match.pd                                | 141 +++++++++++++++++++-
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  17 +++
 2 files changed, 155 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
  

Comments

Jakub Jelinek Aug. 30, 2023, 12:09 p.m. UTC | #1
On Wed, Aug 30, 2023 at 01:54:46PM +0200, Richard Biener via Gcc-patches wrote:
> 	* gcc.dg/tree-ssa/forwprop-42.c: New testcase.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -fdump-tree-cddce1" } */
> +
> +typedef unsigned long v2di __attribute__((vector_size(16)));

Shouldn't this be unsigned long long ?  Otherwise it is actually V4SImode
rather than V2DImode.

> +
> +v2di g;
> +void test (v2di *v)
> +{
> +  v2di lo = v[0];
> +  v2di hi = v[1];
> +  v2di res;
> +  res[1] = hi[1];
> +  res[0] = lo[0];
> +  g = res;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR <\[^>\]*, { 0, 3 }>" 1 "cddce1" } } */
> -- 
> 2.35.3

	Jakub
  
Richard Biener Aug. 30, 2023, 12:25 p.m. UTC | #2
On Wed, 30 Aug 2023, Jakub Jelinek wrote:

> On Wed, Aug 30, 2023 at 01:54:46PM +0200, Richard Biener via Gcc-patches wrote:
> > 	* gcc.dg/tree-ssa/forwprop-42.c: New testcase.
> 
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O -fdump-tree-cddce1" } */
> > +
> > +typedef unsigned long v2di __attribute__((vector_size(16)));
> 
> Shouldn't this be unsigned long long ?  Otherwise it is actually V4SImode
> rather than V2DImode.

Fixed like this.

Richard.

From 695caedeb1b89ec05c727b2e2aacc2a27aa16c42 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Wed, 30 Aug 2023 14:24:57 +0200
Subject: [PATCH] tree-optimization/111228 - fix testcase
To: gcc-patches@gcc.gnu.org

	* gcc.dg/tree-ssa/forwprop-42.c: Use __UINT64_TYPE__ instead
	of unsigned long.
---
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
index f3dbc3e9394..257a05d3ec8 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O -fdump-tree-cddce1" } */
 
-typedef unsigned long v2di __attribute__((vector_size(16)));
+typedef __UINT64_TYPE__ v2di __attribute__((vector_size(16)));
 
 v2di g;
 void test (v2di *v)
  

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 47d2733211a..6a7edde5736 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8993,10 +8993,10 @@  and,
 
 
 /* Merge
-   c = VEC_PERM_EXPR <a, b, VCST0>;
-   d = VEC_PERM_EXPR <c, c, VCST1>;
+     c = VEC_PERM_EXPR <a, b, VCST0>;
+     d = VEC_PERM_EXPR <c, c, VCST1>;
    to
-   d = VEC_PERM_EXPR <a, b, NEW_VCST>;  */
+     d = VEC_PERM_EXPR <a, b, NEW_VCST>;  */
 
 (simplify
  (vec_perm (vec_perm@0 @1 @2 VECTOR_CST@3) @0 VECTOR_CST@4)
@@ -9038,6 +9038,141 @@  and,
      (if (op0)
       (vec_perm @1 @2 { op0; })))))))
 
+/* Merge
+     c = VEC_PERM_EXPR <a, b, VCST0>;
+     d = VEC_PERM_EXPR <x, c, VCST1>;
+   to
+     d = VEC_PERM_EXPR <x, {a,b}, NEW_VCST>;
+   when all elements from a or b are replaced by the later
+   permutation.  */
+
+(simplify
+ (vec_perm @5 (vec_perm@0 @1 @2 VECTOR_CST@3) VECTOR_CST@4)
+ (if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
+  (with
+   {
+     machine_mode result_mode = TYPE_MODE (type);
+     machine_mode op_mode = TYPE_MODE (TREE_TYPE (@1));
+     int nelts = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+     vec_perm_builder builder0;
+     vec_perm_builder builder1;
+     vec_perm_builder builder2 (nelts, nelts, 2);
+   }
+   (if (tree_to_vec_perm_builder (&builder0, @3)
+	&& tree_to_vec_perm_builder (&builder1, @4))
+    (with
+     {
+       vec_perm_indices sel0 (builder0, 2, nelts);
+       vec_perm_indices sel1 (builder1, 2, nelts);
+       bool use_1 = false, use_2 = false;
+
+       for (int i = 0; i < nelts; i++)
+         {
+	   if (known_lt ((poly_uint64)sel1[i], sel1.nelts_per_input ()))
+	     builder2.quick_push (sel1[i]);
+	   else
+	     {
+	       poly_uint64 j = sel0[(sel1[i] - sel1.nelts_per_input ())
+				    .to_constant ()];
+	       if (known_lt (j, sel0.nelts_per_input ()))
+	         use_1 = true;
+	       else
+	         {
+		   use_2 = true;
+		   j -= sel0.nelts_per_input ();
+		 }
+	       builder2.quick_push (j + sel1.nelts_per_input ());
+	     }
+	 }
+     }
+     (if (use_1 ^ use_2)
+      (with
+       {
+	 vec_perm_indices sel2 (builder2, 2, nelts);
+	 tree op0 = NULL_TREE;
+	 /* If the new VEC_PERM_EXPR can't be handled but both
+	    original VEC_PERM_EXPRs can, punt.
+	    If one or both of the original VEC_PERM_EXPRs can't be
+	    handled and the new one can't be either, don't increase
+	    number of VEC_PERM_EXPRs that can't be handled.  */
+	 if (can_vec_perm_const_p (result_mode, op_mode, sel2, false)
+	     || (single_use (@0)
+		 ? (!can_vec_perm_const_p (result_mode, op_mode, sel0, false)
+		    || !can_vec_perm_const_p (result_mode, op_mode, sel1, false))
+		 : !can_vec_perm_const_p (result_mode, op_mode, sel1, false)))
+	   op0 = vec_perm_indices_to_tree (TREE_TYPE (@4), sel2);
+       }
+       (if (op0)
+	(switch
+	 (if (use_1)
+	  (vec_perm @5 @1 { op0; }))
+	 (if (use_2)
+	  (vec_perm @5 @2 { op0; })))))))))))
+
+/* And the case with swapped outer permute sources.  */
+
+(simplify
+ (vec_perm (vec_perm@0 @1 @2 VECTOR_CST@3) @5 VECTOR_CST@4)
+ (if (TYPE_VECTOR_SUBPARTS (type).is_constant ())
+  (with
+   {
+     machine_mode result_mode = TYPE_MODE (type);
+     machine_mode op_mode = TYPE_MODE (TREE_TYPE (@1));
+     int nelts = TYPE_VECTOR_SUBPARTS (type).to_constant ();
+     vec_perm_builder builder0;
+     vec_perm_builder builder1;
+     vec_perm_builder builder2 (nelts, nelts, 2);
+   }
+   (if (tree_to_vec_perm_builder (&builder0, @3)
+	&& tree_to_vec_perm_builder (&builder1, @4))
+    (with
+     {
+       vec_perm_indices sel0 (builder0, 2, nelts);
+       vec_perm_indices sel1 (builder1, 2, nelts);
+       bool use_1 = false, use_2 = false;
+
+       for (int i = 0; i < nelts; i++)
+         {
+	   if (known_ge ((poly_uint64)sel1[i], sel1.nelts_per_input ()))
+	     builder2.quick_push (sel1[i]);
+	   else
+	     {
+	       poly_uint64 j = sel0[sel1[i].to_constant ()];
+	       if (known_lt (j, sel0.nelts_per_input ()))
+	         use_1 = true;
+	       else
+	         {
+		   use_2 = true;
+		   j -= sel0.nelts_per_input ();
+		 }
+	       builder2.quick_push (j);
+	     }
+	 }
+     }
+     (if (use_1 ^ use_2)
+      (with
+       {
+	 vec_perm_indices sel2 (builder2, 2, nelts);
+	 tree op0 = NULL_TREE;
+	 /* If the new VEC_PERM_EXPR can't be handled but both
+	    original VEC_PERM_EXPRs can, punt.
+	    If one or both of the original VEC_PERM_EXPRs can't be
+	    handled and the new one can't be either, don't increase
+	    number of VEC_PERM_EXPRs that can't be handled.  */
+	 if (can_vec_perm_const_p (result_mode, op_mode, sel2, false)
+	     || (single_use (@0)
+		 ? (!can_vec_perm_const_p (result_mode, op_mode, sel0, false)
+		    || !can_vec_perm_const_p (result_mode, op_mode, sel1, false))
+		 : !can_vec_perm_const_p (result_mode, op_mode, sel1, false)))
+	   op0 = vec_perm_indices_to_tree (TREE_TYPE (@4), sel2);
+       }
+       (if (op0)
+	(switch
+	 (if (use_1)
+	  (vec_perm @1 @5 { op0; }))
+	 (if (use_2)
+	  (vec_perm @2 @5 { op0; })))))))))))
+
 
 /* Match count trailing zeroes for simplify_count_trailing_zeroes in fwprop.
    The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
new file mode 100644
index 00000000000..f3dbc3e9394
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-cddce1" } */
+
+typedef unsigned long v2di __attribute__((vector_size(16)));
+
+v2di g;
+void test (v2di *v)
+{
+  v2di lo = v[0];
+  v2di hi = v[1];
+  v2di res;
+  res[1] = hi[1];
+  res[0] = lo[0];
+  g = res;
+}
+
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR <\[^>\]*, { 0, 3 }>" 1 "cddce1" } } */