VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer

Message ID 20230630104104.4193661-2-juzhe.zhong@rivai.ai
State Accepted
Headers
Series VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer |

Checks

Context Check Description
snail/gcc-patch-check success Github commit url

Commit Message

juzhe.zhong@rivai.ai June 30, 2023, 10:41 a.m. UTC
  From: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

Hi, Richard and Richi.
It seems that the implementation of LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE is simple,
and the code change is not big.

Here is an example:

#include <stdint.h>
void
f (uint8_t *restrict a,
   uint8_t *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
    {
      if (cond[i])
        a[i * step + base] = b[i * step + base];
    }
}

With this patch:

  <bb 6> [local count: 84095460]:
  _58 = (unsigned int) base_19(D);
  _61 = (unsigned long) b_20(D);
  _63 = (unsigned long) a_21(D);
  vect_cst__105 = [vec_duplicate_expr] _58;
  _110 = (unsigned long) n_16(D);

  <bb 7> [local count: 504572759]:
  # vect_vec_iv_.8_95 = PHI <_96(7), { 0, 1, 2, ... }(6)>
  # vectp_cond.9_99 = PHI <vectp_cond.9_100(7), cond_17(D)(6)>
  # ivtmp_111 = PHI <ivtmp_112(7), _110(6)>
  _113 = .SELECT_VL (ivtmp_111, POLY_INT_CST [4, 4]);
  _96 = vect_vec_iv_.8_95 + { POLY_INT_CST [4, 4], ... };
  ivtmp_98 = _113 * 4;
  vect__24.11_101 = .LEN_MASK_LOAD (vectp_cond.9_99, 32B, _113, { -1, ... }, 0);
  mask__14.12_103 = vect__24.11_101 != { 0, ... };
  vect__59.13_104 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect_vec_iv_.8_95);
  vect__60.14_106 = vect__59.13_104 + vect_cst__105;
  vect__12.15_107 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect__60.14_106);
  vect_patt_5.16_108 = .LEN_MASK_GATHER_LOAD (_61, vect__12.15_107, 4, { 0, ... }, _113, mask__14.12_103, 0);
  .LEN_MASK_SCATTER_STORE (_63, vect__12.15_107, 4, vect_patt_5.16_108, _113, mask__14.12_103, 0);
  vectp_cond.9_100 = vectp_cond.9_99 + ivtmp_98;
  ivtmp_112 = ivtmp_111 - _113;
  if (ivtmp_112 != 0)
    goto <bb 7>; [83.33%]
  else
    goto <bb 8>; [16.67%]

gcc/ChangeLog:

        * optabs-query.cc (supports_vec_gather_load_p): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE.
        (supports_vec_scatter_store_p): Ditto.
        * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.

---
 gcc/optabs-query.cc        |  2 +
 gcc/tree-vect-data-refs.cc | 18 ++++++++-
 gcc/tree-vect-stmts.cc     | 81 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 98 insertions(+), 3 deletions(-)
  

Patch

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 2fdd0d34354..bf1f484e874 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -676,6 +676,7 @@  supports_vec_gather_load_p (machine_mode mode)
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
 	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
@@ -692,6 +693,7 @@  supports_vec_scatter_store_p (machine_mode mode)
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
 	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	 || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
 	 ? 1 : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..01016284c48 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3873,16 +3873,24 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn;
+  internal_fn ifn, alt_ifn, len_mask_ifn;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
       alt_ifn = IFN_MASK_GATHER_LOAD;
+      /* When the target supports LEN_MASK_GATHER_LOAD, we always
+	 use LEN_MASK_GATHER_LOAD regardless of whether len and
+	 mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD;
     }
   else
     {
       ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
       alt_ifn = IFN_MASK_SCATTER_STORE;
+      /* When the target supports LEN_MASK_SCATTER_STORE, we always
+	 use LEN_MASK_SCATTER_STORE regardless of whether len and
+	 mask are valid or not.  */
+      len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE;
     }
 
   for (;;)
@@ -3909,6 +3917,14 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	  *offset_vectype_out = offset_vectype;
 	  return true;
 	}
+      else if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
+						       memory_type,
+						       offset_vectype, scale))
+	{
+	  *ifn_out = ifn;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
 
       if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
 	  && TYPE_PRECISION (offset_type) >= element_bits)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 68faa8ead39..fa0387353cf 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1771,6 +1771,17 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 						   gs_info->offset_vectype,
 						   gs_info->scale))
 	{
+	  internal_fn len_mask_ifn
+	    = (is_load ? IFN_LEN_MASK_GATHER_LOAD : IFN_LEN_MASK_SCATTER_STORE);
+	  if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
+						      gs_info->memory_type,
+						      gs_info->offset_vectype,
+						      gs_info->scale))
+	    {
+	      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+	      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+	      return;
+	    }
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			     "can't operate on partial vectors because"
@@ -8930,7 +8941,40 @@  vectorizable_store (vec_info *vinfo,
 		    vec_offset = vec_offsets[vec_num * j + i];
 		  tree scale = size_int (gs_info.scale);
 		  gcall *call;
-		  if (final_mask)
+		  if (internal_gather_scatter_fn_supported_p (
+			IFN_LEN_MASK_SCATTER_STORE, vectype,
+			gs_info.memory_type, TREE_TYPE (vec_offset),
+			gs_info.scale))
+		    {
+		      tree final_len = NULL_TREE;
+		      tree bias = NULL_TREE;
+		      if (loop_lens)
+			{
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			}
+		      else
+			{
+			  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+			  final_len
+			    = build_int_cst (iv_type,
+					     TYPE_VECTOR_SUBPARTS (vectype));
+			}
+		      signed char biasval
+			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		      bias = build_int_cst (intQI_type_node, biasval);
+		      if (!final_mask)
+			{
+			  mask_vectype = truth_type_for (vectype);
+			  final_mask = build_minus_one_cst (mask_vectype);
+			}
+		      call = gimple_build_call_internal (
+			IFN_LEN_MASK_SCATTER_STORE, 7, dataref_ptr, vec_offset,
+			scale, vec_oprnd, final_len, final_mask, bias);
+		    }
+		  else if (final_mask)
 		    call = gimple_build_call_internal
 		      (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
 		       scale, vec_oprnd, final_mask);
@@ -10368,7 +10412,40 @@  vectorizable_load (vec_info *vinfo,
 			tree zero = build_zero_cst (vectype);
 			tree scale = size_int (gs_info.scale);
 			gcall *call;
-			if (final_mask)
+			if (internal_gather_scatter_fn_supported_p (
+			      IFN_LEN_MASK_GATHER_LOAD, vectype,
+			      gs_info.memory_type, TREE_TYPE (vec_offset),
+			      gs_info.scale))
+			  {
+			    tree final_len = NULL_TREE;
+			    tree bias = NULL_TREE;
+			    if (loop_lens)
+			      {
+				final_len = vect_get_loop_len (
+				  loop_vinfo, gsi, loop_lens, vec_num * ncopies,
+				  vectype, vec_num * j + i, 1);
+			      }
+			    else
+			      {
+				tree iv_type
+				  = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+				final_len = build_int_cst (
+				  iv_type, TYPE_VECTOR_SUBPARTS (vectype));
+			      }
+			    signed char biasval
+			      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+			    bias = build_int_cst (intQI_type_node, biasval);
+			    if (!final_mask)
+			      {
+				mask_vectype = truth_type_for (vectype);
+				final_mask = build_minus_one_cst (mask_vectype);
+			      }
+			    call = gimple_build_call_internal (
+			      IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
+			      vec_offset, scale, zero, final_len, final_mask,
+			      bias);
+			  }
+			else if (final_mask)
 			  call = gimple_build_call_internal
 			    (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
 			     vec_offset, scale, zero, final_mask);