VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize

Message ID 20231031100745.2356816-1-juzhe.zhong@rivai.ai
State Unresolved
Headers
Series VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize |

Checks

Context Check Description
snail/gcc-patch-check warning Git am fail log

Commit Message

juzhe.zhong@rivai.ai Oct. 31, 2023, 10:07 a.m. UTC
  This patch enables the loop vectorizer to generate direct strided load/store
IFNs if the target supports them.

Note that this patch also allows a target that enables strided load/store but not
gather/scatter to vectorize strided memory accesses.

gcc/ChangeLog:

	* optabs-query.cc (supports_vec_gather_load_p): Support strided load/store.
	(supports_vec_scatter_store_p): Ditto.
	* optabs-query.h (supports_vec_gather_load_p): Ditto.
	(supports_vec_scatter_store_p): Ditto.
	* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
	(vect_check_gather_scatter): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(vect_truncate_gather_scatter_offset): Ditto.
	(vect_use_strided_gather_scatters_p): Ditto.
	(vect_get_strided_load_store_ops): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.

---
 gcc/optabs-query.cc        | 27 ++++++++++-----
 gcc/optabs-query.h         |  4 +--
 gcc/tree-vect-data-refs.cc | 71 ++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc     | 46 +++++++++++++++++-------
 gcc/tree-vectorizer.h      |  3 +-
 5 files changed, 109 insertions(+), 42 deletions(-)
  

Comments

Li, Pan2 Oct. 31, 2023, 2:09 p.m. UTC | #1
Passed the x86 bootstrap and regression tests.

Pan

-----Original Message-----
From: Juzhe-Zhong <juzhe.zhong@rivai.ai> 
Sent: Tuesday, October 31, 2023 6:08 PM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandiford@arm.com; rguenther@suse.de; jeffreyalaw@gmail.com; Juzhe-Zhong <juzhe.zhong@rivai.ai>
Subject: [PATCH] VECT: Support mask_len_strided_load/mask_len_strided_store in loop vectorize

This patch enables the loop vectorizer to generate direct strided load/store
IFNs if the target supports them.

Note that this patch also allows a target that enables strided load/store but not
gather/scatter to vectorize strided memory accesses.

gcc/ChangeLog:

	* optabs-query.cc (supports_vec_gather_load_p): Support strided load/store.
	(supports_vec_scatter_store_p): Ditto.
	* optabs-query.h (supports_vec_gather_load_p): Ditto.
	(supports_vec_scatter_store_p): Ditto.
	* tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
	(vect_check_gather_scatter): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(vect_truncate_gather_scatter_offset): Ditto.
	(vect_use_strided_gather_scatters_p): Ditto.
	(vect_get_strided_load_store_ops): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.

---
 gcc/optabs-query.cc        | 27 ++++++++++-----
 gcc/optabs-query.h         |  4 +--
 gcc/tree-vect-data-refs.cc | 71 ++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc     | 46 +++++++++++++++++-------
 gcc/tree-vectorizer.h      |  3 +-
 5 files changed, 109 insertions(+), 42 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..ea594baf15d 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,19 @@ supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+	     || (strided_p
+		 && convert_optab_handler (mask_len_strided_load_optab, mode,
+					   Pmode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
 }
@@ -687,14 +692,20 @@ supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+					      mode)
+	     || (strided_p
+		 && convert_optab_handler (mask_len_strided_store_optab, mode,
+					   Pmode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
 }
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@ bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
 bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
 bool can_vec_extract (machine_mode, machine_mode);
 
 /* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..d374849b0a7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3913,9 +3913,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
 
 bool
-vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-			  tree vectype, tree memory_type, tree offset_type,
-			  int scale, internal_fn *ifn_out,
+vect_gather_scatter_fn_p (vec_info *vinfo, bool strided_p, bool read_p,
+			  bool masked_p, tree vectype, tree memory_type,
+			  tree offset_type, int scale, internal_fn *ifn_out,
 			  tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3926,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3935,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_GATHER_LOAD regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When the target supports MASK_LEN_STRIDED_LOAD, we can relax the
+	 restrictions on the relationship between the vector offset type
+	 and the loaded type for a gather load with strided access.
+	 E.g. a "gather" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,6 +3950,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_SCATTER_STORE regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When the target supports MASK_LEN_STRIDED_STORE, we can relax the
+	 restrictions on the relationship between the vector offset type
+	 and the stored type for a scatter store with strided access.
+	 E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
 
   for (;;)
@@ -3953,8 +3965,20 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	return false;
 
       /* Test whether the target supports this combination.  */
-      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-						  offset_vectype, scale))
+      /* We don't need to check whether the target supports the gather/scatter
+	 IFN with the expected vector offset type for a strided access when
+	 the target itself supports the strided load/store IFN.  */
+      if (strided_p
+	  && internal_strided_fn_supported_p (alt_ifn3, vectype, offset_type,
+					      scale))
+	{
+	  *ifn_out = alt_ifn3;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
+      else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+						       memory_type,
+						       offset_vectype, scale))
 	{
 	  *ifn_out = ifn;
 	  *offset_vectype_out = offset_vectype;
@@ -4047,9 +4071,12 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   /* True if we should aim to use internal functions rather than
      built-in functions.  */
-  bool use_ifn_p = (DR_IS_READ (dr)
-		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
-		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
+  bool use_ifn_p
+    = (DR_IS_READ (dr)
+	 ? supports_vec_gather_load_p (TYPE_MODE (vectype),
+				       STMT_VINFO_STRIDED_P (stmt_info))
+	 : supports_vec_scatter_store_p (TYPE_MODE (vectype),
+					 STMT_VINFO_STRIDED_P (stmt_info)));
 
   base = DR_REF (dr);
   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -4196,13 +4223,17 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	      /* Only treat this as a scaling operation if the target
 		 supports it for at least some offset type.  */
 	      if (use_ifn_p
-		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+		  && !vect_gather_scatter_fn_p (loop_vinfo,
+						STMT_VINFO_STRIDED_P (stmt_info),
+						DR_IS_READ (dr), masked_p,
+						vectype, memory_type,
 						signed_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype)
-		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+		  && !vect_gather_scatter_fn_p (loop_vinfo,
+						STMT_VINFO_STRIDED_P (stmt_info),
+						DR_IS_READ (dr), masked_p,
+						vectype, memory_type,
 						unsigned_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype))
@@ -4225,8 +4256,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	  if (use_ifn_p
 	      && TREE_CODE (off) == SSA_NAME
 	      && !POINTER_TYPE_P (TREE_TYPE (off))
-	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-					   masked_p, vectype, memory_type,
+	      && vect_gather_scatter_fn_p (loop_vinfo,
+					   STMT_VINFO_STRIDED_P (stmt_info),
+					   DR_IS_READ (dr), masked_p,
+					   vectype, memory_type,
 					   TREE_TYPE (off), scale, &ifn,
 					   &offset_vectype))
 	    break;
@@ -4280,9 +4313,11 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   if (use_ifn_p)
     {
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offtype, scale,
-				     &ifn, &offset_vectype))
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offtype, scale, &ifn,
+				     &offset_vectype))
 	ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a9200767f67..8ff06bd3acb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1506,10 +1506,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
 			     ? IFN_MASK_LEN_GATHER_LOAD
 			     : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-						  gs_info->memory_type,
-						  gs_info->offset_vectype,
-						  gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+	  && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+					      TREE_TYPE (gs_info->offset),
+					      gs_info->scale))
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+						       gs_info->memory_type,
+						       gs_info->offset_vectype,
+						       gs_info->scale))
 	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
 						       gs_info->memory_type,
@@ -1693,8 +1698,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
       /* See whether the target supports the operation with an offset
 	 no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offset_type, scale,
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offset_type, scale,
 				     &gs_info->ifn, &gs_info->offset_vectype)
 	  || gs_info->ifn == IFN_LAST)
 	continue;
@@ -1734,6 +1741,15 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
 						masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using strided IFN for strided/grouped access,"
+			 " scale = %d\n",
+			 gs_info->scale);
+      return true;
+    }
 
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,9 +3028,13 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
 			  ssize_int (gs_info->scale));
   step = fold_convert (offset_type, step);
 
-  /* Create {0, X, X*2, X*3, ...}.  */
-  tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
-			     build_zero_cst (offset_type), step);
+  tree offset;
+  if (internal_strided_fn_p (gs_info->ifn))
+    offset = step;
+  else
+    /* Create {0, X, X*2, X*3, ...}.  */
+    offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
+			  build_zero_cst (offset_type), step);
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
 }
 
@@ -9125,7 +9145,7 @@ vectorizable_store (vec_info *vinfo,
 		vec_offset = vec_offsets[j];
 	      tree scale = size_int (gs_info.scale);
 
-	      if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+	      if (internal_fn_len_index (gs_info.ifn) >= 0)
 		{
 		  if (loop_lens)
 		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
@@ -9145,7 +9165,7 @@ vectorizable_store (vec_info *vinfo,
 
 	      gcall *call;
 	      if (final_len && final_mask)
-		call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
+		call = gimple_build_call_internal (gs_info.ifn,
 						   7, dataref_ptr, vec_offset,
 						   scale, vec_oprnd, final_mask,
 						   final_len, bias);
@@ -10949,7 +10969,7 @@ vectorizable_load (vec_info *vinfo,
 		  tree zero = build_zero_cst (vectype);
 		  tree scale = size_int (gs_info.scale);
 
-		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		  if (internal_fn_len_index (gs_info.ifn) >= 0)
 		    {
 		      if (loop_lens)
 			final_len
@@ -10973,7 +10993,7 @@ vectorizable_load (vec_info *vinfo,
 		  gcall *call;
 		  if (final_len && final_mask)
 		    call
-		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+		      = gimple_build_call_internal (gs_info.ifn, 7,
 						    dataref_ptr, vec_offset,
 						    scale, zero, final_mask,
 						    final_len, bias);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..76bf3aa14b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,8 @@ extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *,
+				      bool, bool, bool, tree, tree,
 				      tree, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
 				       gather_scatter_info *);
  

Patch

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..ea594baf15d 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,19 @@  supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+	     || (strided_p
+		 && convert_optab_handler (mask_len_strided_load_optab, mode,
+					   Pmode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
 }
@@ -687,14 +692,20 @@  supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
-	 || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
-	 ? 1 : -1);
+	     || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+	     || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+					      mode)
+	     || (strided_p
+		 && convert_optab_handler (mask_len_strided_store_optab, mode,
+					   Pmode)
+		      != CODE_FOR_nothing)
+	   ? 1
+	   : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
 }
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@  bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
 bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
 bool can_vec_extract (machine_mode, machine_mode);
 
 /* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..d374849b0a7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3913,9 +3913,9 @@  vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
 
 bool
-vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-			  tree vectype, tree memory_type, tree offset_type,
-			  int scale, internal_fn *ifn_out,
+vect_gather_scatter_fn_p (vec_info *vinfo, bool strided_p, bool read_p,
+			  bool masked_p, tree vectype, tree memory_type,
+			  tree offset_type, int scale, internal_fn *ifn_out,
 			  tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3926,7 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3935,12 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_GATHER_LOAD regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When the target supports MASK_LEN_STRIDED_LOAD, we can relax the
+	 restrictions on the relationship between the vector offset type
+	 and the loaded type for a gather load with strided access.
+	 E.g. a "gather" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,6 +3950,12 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	 use MASK_LEN_SCATTER_STORE regardless whether len and
 	 mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When the target supports MASK_LEN_STRIDED_STORE, we can relax the
+	 restrictions on the relationship between the vector offset type
+	 and the stored type for a scatter store with strided access.
+	 E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+	 be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
 
   for (;;)
@@ -3953,8 +3965,20 @@  vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
 	return false;
 
       /* Test whether the target supports this combination.  */
-      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-						  offset_vectype, scale))
+      /* We don't need to check whether the target supports the gather/scatter
+	 IFN with the expected vector offset type for a strided access when
+	 the target itself supports the strided load/store IFN.  */
+      if (strided_p
+	  && internal_strided_fn_supported_p (alt_ifn3, vectype, offset_type,
+					      scale))
+	{
+	  *ifn_out = alt_ifn3;
+	  *offset_vectype_out = offset_vectype;
+	  return true;
+	}
+      else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+						       memory_type,
+						       offset_vectype, scale))
 	{
 	  *ifn_out = ifn;
 	  *offset_vectype_out = offset_vectype;
@@ -4047,9 +4071,12 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   /* True if we should aim to use internal functions rather than
      built-in functions.  */
-  bool use_ifn_p = (DR_IS_READ (dr)
-		    ? supports_vec_gather_load_p (TYPE_MODE (vectype))
-		    : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
+  bool use_ifn_p
+    = (DR_IS_READ (dr)
+	 ? supports_vec_gather_load_p (TYPE_MODE (vectype),
+				       STMT_VINFO_STRIDED_P (stmt_info))
+	 : supports_vec_scatter_store_p (TYPE_MODE (vectype),
+					 STMT_VINFO_STRIDED_P (stmt_info)));
 
   base = DR_REF (dr);
   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -4196,13 +4223,17 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	      /* Only treat this as a scaling operation if the target
 		 supports it for at least some offset type.  */
 	      if (use_ifn_p
-		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+		  && !vect_gather_scatter_fn_p (loop_vinfo,
+						STMT_VINFO_STRIDED_P (stmt_info),
+						DR_IS_READ (dr), masked_p,
+						vectype, memory_type,
 						signed_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype)
-		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-						masked_p, vectype, memory_type,
+		  && !vect_gather_scatter_fn_p (loop_vinfo,
+						STMT_VINFO_STRIDED_P (stmt_info),
+						DR_IS_READ (dr), masked_p,
+						vectype, memory_type,
 						unsigned_char_type_node,
 						new_scale, &ifn,
 						&offset_vectype))
@@ -4225,8 +4256,10 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 	  if (use_ifn_p
 	      && TREE_CODE (off) == SSA_NAME
 	      && !POINTER_TYPE_P (TREE_TYPE (off))
-	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-					   masked_p, vectype, memory_type,
+	      && vect_gather_scatter_fn_p (loop_vinfo,
+					   STMT_VINFO_STRIDED_P (stmt_info),
+					   DR_IS_READ (dr), masked_p,
+					   vectype, memory_type,
 					   TREE_TYPE (off), scale, &ifn,
 					   &offset_vectype))
 	    break;
@@ -4280,9 +4313,11 @@  vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   if (use_ifn_p)
     {
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offtype, scale,
-				     &ifn, &offset_vectype))
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offtype, scale, &ifn,
+				     &offset_vectype))
 	ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a9200767f67..8ff06bd3acb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1506,10 +1506,15 @@  check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
 			     ? IFN_MASK_LEN_GATHER_LOAD
 			     : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-						  gs_info->memory_type,
-						  gs_info->offset_vectype,
-						  gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+	  && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+					      TREE_TYPE (gs_info->offset),
+					      gs_info->scale))
+	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+						       gs_info->memory_type,
+						       gs_info->offset_vectype,
+						       gs_info->scale))
 	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
 						       gs_info->memory_type,
@@ -1693,8 +1698,10 @@  vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
       /* See whether the target supports the operation with an offset
 	 no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-				     vectype, memory_type, offset_type, scale,
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+				     STMT_VINFO_STRIDED_P (stmt_info),
+				     DR_IS_READ (dr), masked_p, vectype,
+				     memory_type, offset_type, scale,
 				     &gs_info->ifn, &gs_info->offset_vectype)
 	  || gs_info->ifn == IFN_LAST)
 	continue;
@@ -1734,6 +1741,15 @@  vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
 						masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "using strided IFN for strided/grouped access,"
+			 " scale = %d\n",
+			 gs_info->scale);
+      return true;
+    }
 
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,9 +3028,13 @@  vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
 			  ssize_int (gs_info->scale));
   step = fold_convert (offset_type, step);
 
-  /* Create {0, X, X*2, X*3, ...}.  */
-  tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
-			     build_zero_cst (offset_type), step);
+  tree offset;
+  if (internal_strided_fn_p (gs_info->ifn))
+    offset = step;
+  else
+    /* Create {0, X, X*2, X*3, ...}.  */
+    offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
+			  build_zero_cst (offset_type), step);
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
 }
 
@@ -9125,7 +9145,7 @@  vectorizable_store (vec_info *vinfo,
 		vec_offset = vec_offsets[j];
 	      tree scale = size_int (gs_info.scale);
 
-	      if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+	      if (internal_fn_len_index (gs_info.ifn) >= 0)
 		{
 		  if (loop_lens)
 		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
@@ -9145,7 +9165,7 @@  vectorizable_store (vec_info *vinfo,
 
 	      gcall *call;
 	      if (final_len && final_mask)
-		call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
+		call = gimple_build_call_internal (gs_info.ifn,
 						   7, dataref_ptr, vec_offset,
 						   scale, vec_oprnd, final_mask,
 						   final_len, bias);
@@ -10949,7 +10969,7 @@  vectorizable_load (vec_info *vinfo,
 		  tree zero = build_zero_cst (vectype);
 		  tree scale = size_int (gs_info.scale);
 
-		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		  if (internal_fn_len_index (gs_info.ifn) >= 0)
 		    {
 		      if (loop_lens)
 			final_len
@@ -10973,7 +10993,7 @@  vectorizable_load (vec_info *vinfo,
 		  gcall *call;
 		  if (final_len && final_mask)
 		    call
-		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+		      = gimple_build_call_internal (gs_info.ifn, 7,
 						    dataref_ptr, vec_offset,
 						    scale, zero, final_mask,
 						    final_len, bias);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..76bf3aa14b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,8 @@  extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *,
+				      bool, bool, bool, tree, tree,
 				      tree, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
 				       gather_scatter_info *);