[4/4,RFC] VLA Constructor

Message ID 3f90f079-8c12-2547-c925-a28779fdb267@arm.com
State New, archived
Series aarch64: Improve codegen for dups and constructors

Commit Message

Andre Vieira (lists) Aug. 5, 2022, 12:58 p.m. UTC
  This isn't really a 'PATCH' yet; it's something I was working on but had 
to put on hold. Feel free to re-use any bits or trash all of it if you'd 
like.
  

Comments

Richard Biener Aug. 8, 2022, 12:12 p.m. UTC | #1
On Fri, Aug 5, 2022 at 2:59 PM Andre Vieira (lists) via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This isn't really a 'PATCH' yet; it's something I was working on but had
> to put on hold. Feel free to re-use any bits or trash all of it if you'd
> like.

@@ -10264,6 +10264,44 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,

     case VEC_PERM_EXPR:
       {
+       if (TREE_CODE (treeop2) == VECTOR_CST
+           && targetm.vectorize.vla_constructor)
+         {
+           tree ctor0, ctor1;
+           if (TREE_CODE (treeop0) == SSA_NAME
+               && is_gimple_assign (SSA_NAME_DEF_STMT (treeop0)))
+             ctor0 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop0));
+           else
+             ctor0 = treeop0;
+           if (TREE_CODE (treeop1) == SSA_NAME
+               && is_gimple_assign (SSA_NAME_DEF_STMT (treeop1)))
+             ctor1 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop1));

just to say - you can't look up things like this; you have to go through the TER
machinery, otherwise the expansions for the CTOR elements might already have
been clobbered.  That means doing this during RTL expansion is likely to be of
limited effectiveness.
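
A minimal sketch of what a TER-aware variant of that lookup could look like,
assuming the get_gimple_for_ssa_name helper from tree-outof-ssa.h (the wrapper
below is illustrative only and not part of the posted patch):

/* Illustrative sketch only: return the CONSTRUCTOR feeding OP, if any,
   going through the TER replacement map rather than SSA_NAME_DEF_STMT so
   that the element expansions have not already been clobbered.  */
static tree
vla_ctor_for_operand (tree op)
{
  if (TREE_CODE (op) == SSA_NAME)
    if (gimple *def = get_gimple_for_ssa_name (op))
      if (is_gimple_assign (def)
	  && gimple_assign_rhs_code (def) == CONSTRUCTOR)
	return gimple_assign_rhs1 (def);
  return op;
}

If the definition is not a CONSTRUCTOR, or has already been consumed by TER,
the sketch returns the operand unchanged, so expansion would simply fall back
to the existing expand_operands path for VEC_PERM_EXPR.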
  

Patch

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 82f9eba5c397af04924bdebdc684a1d77682d3fd..08625aad7b1a8dc9c9f8c491cb13d8af0b46a946 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -842,13 +842,45 @@  public:
     for (unsigned int i = 0; i < nargs; ++i)
       {
 	tree elt = gimple_call_arg (f.call, i);
-	if (!CONSTANT_CLASS_P (elt))
-	  return NULL;
 	builder.quick_push (elt);
 	for (unsigned int j = 1; j < factor; ++j)
 	  builder.quick_push (build_zero_cst (TREE_TYPE (vec_type)));
       }
-    return gimple_build_assign (f.lhs, builder.build ());
+    builder.finalize ();
+    unsigned int n_elts
+      = builder.nelts_per_pattern () == 1 ? builder.npatterns ()
+					  : builder.full_nelts ().coeffs[0];
+
+    if (n_elts == 1)
+      return gimple_build_assign (f.lhs, build1 (VEC_DUPLICATE_EXPR, vec_type,
+						 builder.elt (0)));
+    tree list = NULL_TREE;
+    tree *pp = &list;
+    for (unsigned int i = 0; i < n_elts; ++i)
+      {
+	*pp = build_tree_list (NULL, builder.elt (i) PASS_MEM_STAT);
+	pp = &TREE_CHAIN (*pp);
+      }
+
+    poly_uint64 vec_len = TYPE_VECTOR_SUBPARTS (vec_type);
+    vec_perm_builder sel (vec_len, n_elts, 1);
+    for (unsigned int i = 0; i < n_elts; i++)
+      sel.quick_push (i);
+    vec_perm_indices indices (sel, 1, n_elts);
+
+    tree elt_type = TREE_TYPE (vec_type);
+
+    tree ctor_type = build_vector_type (elt_type, n_elts);
+    tree ctor = make_ssa_name_fn (cfun, ctor_type, 0);
+    gimple *ctor_stmt
+      = gimple_build_assign (ctor,
+			     build_constructor_from_list (ctor_type, list));
+    gsi_insert_before (f.gsi, ctor_stmt, GSI_SAME_STMT);
+
+    tree mask_type = build_vector_type (ssizetype, vec_len);
+    tree mask = vec_perm_indices_to_tree (mask_type, indices);
+    return gimple_build_assign (f.lhs, fold_build3 (VEC_PERM_EXPR, vec_type,
+						    ctor, ctor, mask));
   }
 
   rtx
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bd60e65b0c3f05f1c931f03807170f3b9d699de5..dec935211e5a064239c858880a696e6ca3fe1ae2 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2544,6 +2544,17 @@ 
   }
 )
 
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version).
+(define_insn "*aarch64_vec_duplicate_reg<mode>_le"
+  [(set (match_operand:SVE_FULL 0 "register_operand" "=w,w")
+	(vec_duplicate:SVE_FULL
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  "@
+   mov\t%0.<Vetype>, %<vwcore>1
+   mov\t%0.<Vetype>, %<Vetype>1"
+)
+
 ;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
 ;; The SVE register layout puts memory lane N into (architectural)
 ;; register lane N, whereas the Advanced SIMD layout puts the memory
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a08043e18d609e258ebfe033875201163d129aba..9b118e4101d0a5995a833769433be49321ab2151 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6033,7 +6033,6 @@  rtx
 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
 {
   machine_mode src_mode = GET_MODE (src);
-  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
   insn_code icode = (BYTES_BIG_ENDIAN
 		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
 		     : code_for_aarch64_vec_duplicate_vq_le (mode));
@@ -21806,20 +21805,29 @@  aarch64_simd_make_constant (rtx vals)
 }
 
 static void
-aarch64_vec_duplicate (rtx target, machine_mode mode, machine_mode element_mode,
+aarch64_vec_duplicate (rtx target, rtx op, machine_mode mode, machine_mode element_mode,
 		       int narrow_n_elts)
 {
   poly_uint64 size = narrow_n_elts * GET_MODE_BITSIZE (element_mode);
-  scalar_mode i_mode = int_mode_for_size (size, 0).require ();
   machine_mode o_mode;
-  if (aarch64_sve_mode_p (mode))
-    o_mode = aarch64_full_sve_mode (i_mode).require ();
+  rtx input, output;
+  bool sve = aarch64_sve_mode_p (mode);
+  if (sve && known_eq (size, 128U))
+    {
+      o_mode = mode;
+      output = target;
+      input = op;
+    }
   else
-    o_mode
-      = aarch64_simd_container_mode (i_mode,
-				     GET_MODE_BITSIZE (mode));
-  rtx input = simplify_gen_subreg (i_mode, target, mode, 0);
-  rtx output = simplify_gen_subreg (o_mode, target, mode, 0);
+    {
+      scalar_mode i_mode = int_mode_for_size (size, 0).require ();
+      o_mode
+	= sve ? aarch64_full_sve_mode (i_mode).require ()
+	      : aarch64_simd_container_mode (i_mode,
+					     GET_MODE_BITSIZE (mode));
+      input = simplify_gen_subreg (i_mode, op, GET_MODE (op), 0);
+      output = simplify_gen_subreg (o_mode, target, mode, 0);
+    }
   aarch64_emit_move (output, gen_vec_duplicate (o_mode, input));
 }
 
@@ -21910,6 +21918,16 @@  aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       return;
     }
 
+  /* We are constructing a VLS vector that we may later duplicate into a VLA
+     one.  Actually maybe split this into one for ASIMD and one for SVE? */
+  machine_mode real_mode = mode;
+  rtx real_target = target;
+  if (aarch64_sve_mode_p (real_mode))
+    {
+      mode = aarch64_vq_mode (GET_MODE_INNER (real_mode)).require ();
+      target = simplify_gen_subreg (mode, target, real_mode, 0);
+    }
+
   enum insn_code icode = optab_handler (vec_set_optab, mode);
   gcc_assert (icode != CODE_FOR_nothing);
 
@@ -22000,8 +22018,8 @@  aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
 	  x = copy_to_mode_reg (inner_mode, x);
 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
 	}
-	if (!known_eq (v.full_nelts (), n_elts))
-	  aarch64_vec_duplicate (target, mode, GET_MODE (v0), n_elts);
+      if (!known_eq (v.full_nelts (), n_elts))
+	aarch64_vec_duplicate (real_target, target, real_mode, GET_MODE (v0), n_elts);
       return;
     }
 
@@ -22048,7 +22066,7 @@  aarch64_expand_vector_init (rtx target, rtx_vector_builder &v)
       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
     }
   if (!known_eq (v.full_nelts (), n_elts))
-    aarch64_vec_duplicate (target, mode, inner_mode, n_elts);
+    aarch64_vec_duplicate (real_target, target, real_mode, inner_mode, n_elts);
 }
 
 /* Emit RTL corresponding to:
@@ -23947,11 +23965,7 @@  aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (BYTES_BIG_ENDIAN
       || !d->one_vector_p
       || d->vec_flags != VEC_SVE_DATA
-      || d->op_vec_flags != VEC_ADVSIMD
-      || d->perm.encoding ().nelts_per_pattern () != 1
-      || !known_eq (d->perm.encoding ().npatterns (),
-		    GET_MODE_NUNITS (d->op_mode))
-      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
+      || d->perm.encoding ().nelts_per_pattern () != 1)
     return false;
 
   int npatterns = d->perm.encoding ().npatterns ();
@@ -23962,7 +23976,10 @@  aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
   if (d->testing_p)
     return true;
 
-  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  machine_mode mode = GET_MODE (d->target);
+  machine_mode element_mode = GET_MODE_INNER (mode);
+  aarch64_vec_duplicate (d->target, d->op0, mode, element_mode,
+			 d->perm.encoding ().npatterns ());
   return true;
 }
 
@@ -24194,6 +24211,15 @@  aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_VLA_CONSTRUCTOR.  */
+
+static bool
+aarch64_vectorize_vla_constructor (rtx target, rtx_vector_builder &builder)
+{
+  aarch64_expand_vector_init (target, builder);
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27667,6 +27693,10 @@  aarch64_libgcc_floating_mode_supported_p
 #define TARGET_VECTORIZE_VEC_PERM_CONST \
   aarch64_vectorize_vec_perm_const
 
+#undef TARGET_VECTORIZE_VLA_CONSTRUCTOR
+#define TARGET_VECTORIZE_VLA_CONSTRUCTOR \
+  aarch64_vectorize_vla_constructor
+
 #undef TARGET_VECTORIZE_RELATED_MODE
 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
 #undef TARGET_VECTORIZE_GET_MASK_MODE
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index b0ea39884aa3ced5c0ccc1e792088aa66997ec3b..eda3f014984f62d96d7fe0b3c0c439905375f25a 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,11 @@  instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_VLA_CONSTRUCTOR (rtx @var{target}, rtx_vector_builder @var{&builder})
+This hook is used to expand a vla constructor into @var{target}
+using the rtx_vector_builder @var{builder}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f869ddd5e5b8b7acbd8e9765fb103af24a1085b6..07f4f77877b18a23f6fd205a8dd8daf1a03c2923 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@  address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_VLA_CONSTRUCTOR
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/expr.cc b/gcc/expr.cc
index f9753d48245d56039206647be8576246a3b25ed3..b9eb550cac4c68464c95cffa8da19b3984b80782 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -10264,6 +10264,44 @@  expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode,
 
     case VEC_PERM_EXPR:
       {
+	if (TREE_CODE (treeop2) == VECTOR_CST
+	    && targetm.vectorize.vla_constructor)
+	  {
+	    tree ctor0, ctor1;
+	    if (TREE_CODE (treeop0) == SSA_NAME
+		&& is_gimple_assign (SSA_NAME_DEF_STMT (treeop0)))
+	      ctor0 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop0));
+	    else
+	      ctor0 = treeop0;
+	    if (TREE_CODE (treeop1) == SSA_NAME
+		&& is_gimple_assign (SSA_NAME_DEF_STMT (treeop1)))
+	      ctor1 = gimple_assign_rhs1 (SSA_NAME_DEF_STMT (treeop1));
+	    else
+	      ctor1 = treeop1;
+
+	    if (TREE_CODE (ctor0) == CONSTRUCTOR
+		&& TREE_CODE (ctor1) == CONSTRUCTOR)
+	      {
+
+		unsigned int nelts = vector_cst_encoded_nelts (treeop2);
+		unsigned int ctor_nelts = CONSTRUCTOR_NELTS (ctor0);
+		machine_mode mode = GET_MODE (target);
+		rtx_vector_builder builder (mode, nelts, 1);
+		for (unsigned int i = 0; i < nelts; ++i)
+		  {
+		    unsigned HOST_WIDE_INT index
+		      = tree_to_uhwi (VECTOR_CST_ENCODED_ELT (treeop2, i));
+		    tree op
+		      = index >= ctor_nelts
+			? CONSTRUCTOR_ELT (ctor1, index - ctor_nelts)->value
+			: CONSTRUCTOR_ELT (ctor0, index)->value;
+		    builder.quick_push (expand_normal (op));
+		  }
+		builder.finalize ();
+		if (targetm.vectorize.vla_constructor (target, builder))
+		  return target;
+	      }
+	  }
 	expand_operands (treeop0, treeop1, target, &op0, &op1, EXPAND_NORMAL);
 	vec_perm_builder sel;
 	if (TREE_CODE (treeop2) == VECTOR_CST
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..3c219b6a90d9cc1a6393a3ebc24e54fcf14c6377 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,13 @@  implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(vla_constructor,
+ "This hook is used to expand a vla constructor into @var{target}\n\
+using the rtx_vector_builder @var{builder}.",
+ bool, (rtx target, rtx_vector_builder &builder),
+ NULL)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..b46b8f0d7a9c52f6efe6acf10f589703cec3bd08 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -262,6 +262,8 @@  enum poly_value_estimate_kind
 extern bool verify_type_context (location_t, type_context_kind, const_tree,
 				 bool = false);
 
+class rtx_vector_builder;
+
 /* The target structure.  This holds all the backend hooks.  */
 #define DEFHOOKPOD(NAME, DOC, TYPE, INIT) TYPE NAME;
 #define DEFHOOK(NAME, DOC, TYPE, PARAMS, INIT) TYPE (* NAME) PARAMS;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..01f652931555534f43e0487766c568c72a5df686
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_opt_1.c
@@ -0,0 +1,134 @@ 
+/* { dg-options { "-O2" } } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_sve.h>
+
+/*
+** test0:
+**	ins	v0.s\[1\], v1.s\[0\]
+**	mov	z0.d, d0
+**	ret
+*/
+svfloat32_t test0(float x, float y) {
+    return svdupq_n_f32(x, y, x, y);
+}
+/*
+** test1:
+**	mov	z0.s, s0
+**	ret
+*/
+
+svfloat32_t test1(float x) {
+    return svdupq_n_f32(x, x, x, x);
+}
+
+/*
+** test2:
+**	mov	z0.s, w0
+**	ret
+*/
+
+svint32_t test2(int x) {
+    return svdupq_n_s32(x, x, x, x);
+}
+
+/*
+** test3:
+**	sxth	w0, w0
+**	fmov	d0, x0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[2\], w2
+**	ins	v0.h\[3\], w3
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint16_t test3(short a, short b, short c, short d)
+{
+    return svdupq_n_s16(a, b, c, d, a, b, c, d);
+}
+
+/*
+** test4:
+**	dup	v0.4h, w0
+**	ins	v0.h\[1\], w1
+**	ins	v0.h\[3\], w1
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint16_t test4(short a, short b)
+{
+    return svdupq_n_s16(a, b, a, b, a, b, a, b);
+}
+
+/*
+** test5:
+**	mov	z0.h, w0
+**	ret
+*/
+
+svint16_t test5(short a)
+{
+    return svdupq_n_s16(a, a, a, a, a, a, a, a);
+}
+/*
+** test6:
+**	sxtb	w0, w0
+**	fmov	d0, x0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	ins	v0.b\[4\], w4
+**	ins	v0.b\[5\], w5
+**	ins	v0.b\[6\], w6
+**	ins	v0.b\[7\], w7
+**	mov	z0.d, d0
+**	ret
+*/
+
+svint8_t test6(char a, char b, char c, char d, char e, char f, char g, char h)
+{
+    return svdupq_n_s8(a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h);
+}
+
+/*
+** test7:
+**	dup	v0.8b, w0
+**	ins	v0.b\[1\], w1
+**	ins	v0.b\[2\], w2
+**	ins	v0.b\[3\], w3
+**	mov	z0.s, s0
+**	ret
+*/
+
+svint8_t test7(char a, char b, char c, char d)
+{
+    return svdupq_n_s8(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d);
+}
+
+
+// We can do better than this
+/*
+**	sxtb	w0, w0
+**	fmov	d0, x0
+**	ins	v0.d\[1\], x1
+**	ins	v0.b\[1\], w1
+**	mov	z0.h, h0
+**	ret
+*/
+
+svint8_t test8(char a, char b)
+{
+    return svdupq_n_s8(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
+/*
+** test9:
+**	mov	z0.b, w0
+**	ret
+*/
+
+svint8_t test9(char a)
+{
+    return svdupq_n_s8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a);
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..eaae1eefe02af3f51073310e7d17c33286b2bead 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1513,6 +1513,11 @@  lower_vec_perm (gimple_stmt_iterator *gsi)
   if (!TYPE_VECTOR_SUBPARTS (vect_type).is_constant (&elements))
     return;
 
+  /* It is possible to have a VEC_PERM_EXPR with a VLA mask and a VLS
+     CONSTRUCTOR, this should return a VLA type, so we can't lower it.  */
+  if (!TYPE_VECTOR_SUBPARTS (mask_type).is_constant ())
+    return;
+
   if (TREE_CODE (mask) == SSA_NAME)
     {
       gimple *def_stmt = SSA_NAME_DEF_STMT (mask);