vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

Message ID 7314a4eb-26d0-e33e-94c2-31daca9f490e@linux.ibm.com
State Unresolved
Series vect: Move VMAT_GATHER_SCATTER handlings from final loop nest

Checks

Context: snail/gcc-patch-check
Check: warning
Description: Git am fail log

Commit Message

Kewen.Lin Aug. 14, 2023, 8:59 a.m. UTC
  Hi,

Following Richi's suggestion [1], this patch is to move the
handlings on VMAT_GATHER_SCATTER in the final loop nest
of function vectorizable_load to its own loop.  Basically
it duplicates the final loop nest, cleans up some useless
set up code for the case of VMAT_GATHER_SCATTER, and removes
some unreachable code.  It also removes the corresponding
handlings in the final loop nest.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html

Is it ok for trunk?

BR,
Kewen
-----

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
	and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
 1 file changed, 219 insertions(+), 142 deletions(-)

--
2.39.1
  

Comments

Richard Sandiford Aug. 14, 2023, 12:20 p.m. UTC | #1
Thanks for the clean-ups.  But...

"Kewen.Lin" <linkw@linux.ibm.com> writes:
> Hi,
>
> Following Richi's suggestion [1], this patch is to move the
> handlings on VMAT_GATHER_SCATTER in the final loop nest
> of function vectorizable_load to its own loop.  Basically
> it duplicates the final loop nest, clean up some useless
> set up code for the case of VMAT_GATHER_SCATTER, remove some
> unreachable code.  Also remove the corresponding handlings
> in the final loop nest.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>
> Is it ok for trunk?
>
> BR,
> Kewen
> -----
>
> gcc/ChangeLog:
>
> 	* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> 	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
> 	and update the final nest accordingly.
> ---
>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>  1 file changed, 219 insertions(+), 142 deletions(-)

...that seems like quite a lot of +s.  Is there nothing we can do to
avoid the cut-&-paste?

Richard

>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c361e16cb7b..5e514eca19b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>        return true;
>      }
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +    {
> +      gcc_assert (alignment_support_scheme == dr_aligned
> +		  || alignment_support_scheme == dr_unaligned_supported);
> +      gcc_assert (!grouped_load && !slp_perm);
> +
> +      unsigned int inside_cost = 0, prologue_cost = 0;
> +      for (j = 0; j < ncopies; j++)
> +	{
> +	  /* 1. Create the vector or array pointer update chain.  */
> +	  if (j == 0 && !costing_p)
> +	    {
> +	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +		vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> +					     slp_node, &gs_info, &dataref_ptr,
> +					     &vec_offsets);
> +	      else
> +		dataref_ptr
> +		  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> +					      at_loop, offset, &dummy, gsi,
> +					      &ptr_incr, false, bump);
> +	    }
> +	  else if (!costing_p)
> +	    {
> +	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> +	      if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +					       gsi, stmt_info, bump);
> +	    }
> +
> +	  if (mask && !costing_p)
> +	    vec_mask = vec_masks[j];
> +
> +	  gimple *new_stmt = NULL;
> +	  for (i = 0; i < vec_num; i++)
> +	    {
> +	      tree final_mask = NULL_TREE;
> +	      tree final_len = NULL_TREE;
> +	      tree bias = NULL_TREE;
> +	      if (!costing_p)
> +		{
> +		  if (loop_masks)
> +		    final_mask
> +		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +					    vec_num * ncopies, vectype,
> +					    vec_num * j + i);
> +		  if (vec_mask)
> +		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +						   final_mask, vec_mask, gsi);
> +
> +		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +						   gsi, stmt_info, bump);
> +		}
> +
> +	      /* 2. Create the vector-load in the loop.  */
> +	      unsigned HOST_WIDE_INT align;
> +	      if (gs_info.ifn != IFN_LAST)
> +		{
> +		  if (costing_p)
> +		    {
> +		      unsigned int cnunits = vect_nunits_for_cost (vectype);
> +		      inside_cost
> +			= record_stmt_cost (cost_vec, cnunits, scalar_load,
> +					    stmt_info, 0, vect_body);
> +		      continue;
> +		    }
> +		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +		    vec_offset = vec_offsets[vec_num * j + i];
> +		  tree zero = build_zero_cst (vectype);
> +		  tree scale = size_int (gs_info.scale);
> +
> +		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> +		    {
> +		      if (loop_lens)
> +			final_len
> +			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +					       vec_num * ncopies, vectype,
> +					       vec_num * j + i, 1);
> +		      else
> +			final_len
> +			  = build_int_cst (sizetype,
> +					   TYPE_VECTOR_SUBPARTS (vectype));
> +		      signed char biasval
> +			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +		      bias = build_int_cst (intQI_type_node, biasval);
> +		      if (!final_mask)
> +			{
> +			  mask_vectype = truth_type_for (vectype);
> +			  final_mask = build_minus_one_cst (mask_vectype);
> +			}
> +		    }
> +
> +		  gcall *call;
> +		  if (final_len && final_mask)
> +		    call
> +		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
> +						    dataref_ptr, vec_offset,
> +						    scale, zero, final_mask,
> +						    final_len, bias);
> +		  else if (final_mask)
> +		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> +						       dataref_ptr, vec_offset,
> +						       scale, zero, final_mask);
> +		  else
> +		    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> +						       dataref_ptr, vec_offset,
> +						       scale, zero);
> +		  gimple_call_set_nothrow (call, true);
> +		  new_stmt = call;
> +		  data_ref = NULL_TREE;
> +		}
> +	      else
> +		{
> +		  /* Emulated gather-scatter.  */
> +		  gcc_assert (!final_mask);
> +		  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> +		  if (costing_p)
> +		    {
> +		      /* For emulated gathers N offset vector element
> +			 offset add is consumed by the load).  */
> +		      inside_cost = record_stmt_cost (cost_vec, const_nunits,
> +						      vec_to_scalar, stmt_info,
> +						      0, vect_body);
> +		      /* N scalar loads plus gathering them into a
> +			 vector.  */
> +		      inside_cost
> +			= record_stmt_cost (cost_vec, const_nunits, scalar_load,
> +					    stmt_info, 0, vect_body);
> +		      inside_cost
> +			= record_stmt_cost (cost_vec, 1, vec_construct,
> +					    stmt_info, 0, vect_body);
> +		      continue;
> +		    }
> +		  unsigned HOST_WIDE_INT const_offset_nunits
> +		    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> +			.to_constant ();
> +		  vec<constructor_elt, va_gc> *ctor_elts;
> +		  vec_alloc (ctor_elts, const_nunits);
> +		  gimple_seq stmts = NULL;
> +		  /* We support offset vectors with more elements
> +		     than the data vector for now.  */
> +		  unsigned HOST_WIDE_INT factor
> +		    = const_offset_nunits / const_nunits;
> +		  vec_offset = vec_offsets[j / factor];
> +		  unsigned elt_offset = (j % factor) * const_nunits;
> +		  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +		  tree scale = size_int (gs_info.scale);
> +		  align = get_object_alignment (DR_REF (first_dr_info->dr));
> +		  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> +		  for (unsigned k = 0; k < const_nunits; ++k)
> +		    {
> +		      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> +					      bitsize_int (k + elt_offset));
> +		      tree idx
> +			= gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> +					vec_offset, TYPE_SIZE (idx_type), boff);
> +		      idx = gimple_convert (&stmts, sizetype, idx);
> +		      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> +					  scale);
> +		      tree ptr = gimple_build (&stmts, PLUS_EXPR,
> +					       TREE_TYPE (dataref_ptr),
> +					       dataref_ptr, idx);
> +		      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +		      tree elt = make_ssa_name (TREE_TYPE (vectype));
> +		      tree ref = build2 (MEM_REF, ltype, ptr,
> +					 build_int_cst (ref_type, 0));
> +		      new_stmt = gimple_build_assign (elt, ref);
> +		      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
> +		      gimple_seq_add_stmt (&stmts, new_stmt);
> +		      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> +		    }
> +		  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +		  new_stmt = gimple_build_assign (
> +		    NULL_TREE, build_constructor (vectype, ctor_elts));
> +		  data_ref = NULL_TREE;
> +		}
> +
> +	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +	      /* DATA_REF is null if we've already built the statement.  */
> +	      if (data_ref)
> +		{
> +		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +		  new_stmt = gimple_build_assign (vec_dest, data_ref);
> +		}
> +	      new_temp = make_ssa_name (vec_dest, new_stmt);
> +	      gimple_set_lhs (new_stmt, new_temp);
> +	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
> +	      /* Store vector loads in the corresponding SLP_NODE.  */
> +	      if (slp)
> +		slp_node->push_vec_def (new_stmt);
> +	    }
> +
> +	  if (!slp && !costing_p)
> +	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +	}
> +
> +      if (!slp && !costing_p)
> +	*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +
> +      if (costing_p)
> +	{
> +	  if (dump_enabled_p ())
> +	    dump_printf_loc (MSG_NOTE, vect_location,
> +			     "vect_model_load_cost: inside_cost = %u, "
> +			     "prologue_cost = %u .\n",
> +			     inside_cost, prologue_cost);
> +	}
> +      return true;
> +    }
> +
>    poly_uint64 group_elt = 0;
>    unsigned int inside_cost = 0, prologue_cost = 0;
>    for (j = 0; j < ncopies; j++)
> @@ -10504,12 +10716,6 @@ vectorizable_load (vec_info *vinfo,
>  		  gcc_assert (!compute_in_loop);
>  		}
>  	    }
> -	  else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -	    {
> -	      vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> -					   slp_node, &gs_info, &dataref_ptr,
> -					   &vec_offsets);
> -	    }
>  	  else
>  	    dataref_ptr
>  	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> @@ -10525,7 +10731,7 @@ vectorizable_load (vec_info *vinfo,
>  	  if (dataref_offset)
>  	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>  					      bump);
> -	  else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +	  else
>  	    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>  					   stmt_info, bump);
>  	  if (mask)
> @@ -10551,7 +10757,7 @@ vectorizable_load (vec_info *vinfo,
>  		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>  					       final_mask, vec_mask, gsi);
>
> -	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +	      if (i > 0)
>  		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>  					       gsi, stmt_info, bump);
>  	    }
> @@ -10562,139 +10768,11 @@ vectorizable_load (vec_info *vinfo,
>  	    case dr_aligned:
>  	    case dr_unaligned_supported:
>  	      {
> -		unsigned int misalign;
> -		unsigned HOST_WIDE_INT align;
> -
> -		if (memory_access_type == VMAT_GATHER_SCATTER
> -		    && gs_info.ifn != IFN_LAST)
> -		  {
> -		    if (costing_p)
> -		      {
> -			unsigned int cnunits = vect_nunits_for_cost (vectype);
> -			inside_cost
> -			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
> -					      stmt_info, 0, vect_body);
> -			break;
> -		      }
> -		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -		      vec_offset = vec_offsets[vec_num * j + i];
> -		    tree zero = build_zero_cst (vectype);
> -		    tree scale = size_int (gs_info.scale);
> -
> -		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> -		      {
> -			if (loop_lens)
> -			  final_len
> -			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> -						 vec_num * ncopies, vectype,
> -						 vec_num * j + i, 1);
> -			else
> -			  final_len
> -			    = build_int_cst (sizetype,
> -					     TYPE_VECTOR_SUBPARTS (vectype));
> -			signed char biasval
> -			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -			bias = build_int_cst (intQI_type_node, biasval);
> -			if (!final_mask)
> -			  {
> -			    mask_vectype = truth_type_for (vectype);
> -			    final_mask = build_minus_one_cst (mask_vectype);
> -			  }
> -		      }
> -
> -		    gcall *call;
> -		    if (final_len && final_mask)
> -		      call = gimple_build_call_internal (
> -			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
> -			scale, zero, final_mask, final_len, bias);
> -		    else if (final_mask)
> -		      call
> -			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> -						      dataref_ptr, vec_offset,
> -						      scale, zero, final_mask);
> -		    else
> -		      call
> -			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> -						      dataref_ptr, vec_offset,
> -						      scale, zero);
> -		    gimple_call_set_nothrow (call, true);
> -		    new_stmt = call;
> -		    data_ref = NULL_TREE;
> -		    break;
> -		  }
> -		else if (memory_access_type == VMAT_GATHER_SCATTER)
> -		  {
> -		    /* Emulated gather-scatter.  */
> -		    gcc_assert (!final_mask);
> -		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> -		    if (costing_p)
> -		      {
> -			/* For emulated gathers N offset vector element
> -			   offset add is consumed by the load).  */
> -			inside_cost
> -			  = record_stmt_cost (cost_vec, const_nunits,
> -					      vec_to_scalar, stmt_info, 0,
> -					      vect_body);
> -			/* N scalar loads plus gathering them into a
> -			   vector.  */
> -			inside_cost = record_stmt_cost (cost_vec, const_nunits,
> -							scalar_load, stmt_info,
> -							0, vect_body);
> -			inside_cost
> -			  = record_stmt_cost (cost_vec, 1, vec_construct,
> -					      stmt_info, 0, vect_body);
> -			break;
> -		      }
> -		    unsigned HOST_WIDE_INT const_offset_nunits
> -		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> -			  .to_constant ();
> -		    vec<constructor_elt, va_gc> *ctor_elts;
> -		    vec_alloc (ctor_elts, const_nunits);
> -		    gimple_seq stmts = NULL;
> -		    /* We support offset vectors with more elements
> -		       than the data vector for now.  */
> -		    unsigned HOST_WIDE_INT factor
> -		      = const_offset_nunits / const_nunits;
> -		    vec_offset = vec_offsets[j / factor];
> -		    unsigned elt_offset = (j % factor) * const_nunits;
> -		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> -		    tree scale = size_int (gs_info.scale);
> -		    align = get_object_alignment (DR_REF (first_dr_info->dr));
> -		    tree ltype
> -		      = build_aligned_type (TREE_TYPE (vectype), align);
> -		    for (unsigned k = 0; k < const_nunits; ++k)
> -		      {
> -			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> -						bitsize_int (k + elt_offset));
> -			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> -						 idx_type, vec_offset,
> -						 TYPE_SIZE (idx_type), boff);
> -			idx = gimple_convert (&stmts, sizetype, idx);
> -			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> -					    scale);
> -			tree ptr = gimple_build (&stmts, PLUS_EXPR,
> -						 TREE_TYPE (dataref_ptr),
> -						 dataref_ptr, idx);
> -			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> -			tree elt = make_ssa_name (TREE_TYPE (vectype));
> -			tree ref = build2 (MEM_REF, ltype, ptr,
> -					   build_int_cst (ref_type, 0));
> -			new_stmt = gimple_build_assign (elt, ref);
> -			gimple_set_vuse (new_stmt,
> -					 gimple_vuse (gsi_stmt (*gsi)));
> -			gimple_seq_add_stmt (&stmts, new_stmt);
> -			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> -		      }
> -		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> -		    new_stmt = gimple_build_assign (
> -		      NULL_TREE, build_constructor (vectype, ctor_elts));
> -		    data_ref = NULL_TREE;
> -		    break;
> -		  }
> -
>  		if (costing_p)
>  		  break;
>
> +		unsigned int misalign;
> +		unsigned HOST_WIDE_INT align;
>  		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>  		if (alignment_support_scheme == dr_aligned)
>  		  misalign = 0;
> @@ -11156,10 +11234,9 @@ vectorizable_load (vec_info *vinfo,
>
>    if (costing_p)
>      {
> -      gcc_assert (memory_access_type != VMAT_INVARIANT
> -		  && memory_access_type != VMAT_ELEMENTWISE
> -		  && memory_access_type != VMAT_STRIDED_SLP
> -		  && memory_access_type != VMAT_LOAD_STORE_LANES);
> +      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
> +		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> +		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>        if (dump_enabled_p ())
>  	dump_printf_loc (MSG_NOTE, vect_location,
>  			 "vect_model_load_cost: inside_cost = %u, "
> --
> 2.39.1
  
Kewen.Lin Aug. 14, 2023, 1:01 p.m. UTC | #2
Hi Richard,

on 2023/8/14 20:20, Richard Sandiford wrote:
> Thanks for the clean-ups.  But...
> 
> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>> Hi,
>>
>> Following Richi's suggestion [1], this patch is to move the
>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>> of function vectorizable_load to its own loop.  Basically
>> it duplicates the final loop nest, clean up some useless
>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>> unreachable code.  Also remove the corresponding handlings
>> in the final loop nest.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> -----
>>
>> gcc/ChangeLog:
>>
>> 	* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>> 	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>> 	and update the final nest accordingly.
>> ---
>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>  1 file changed, 219 insertions(+), 142 deletions(-)
> 
> ...that seems like quite a lot of +s.  Is there nothing we can do to
> avoid the cut-&-paste?

Thanks for the comments!  I'm not sure I get your question: if we
want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
inevitable?  Is your concern mainly about git blame history?

BR,
Kewen

> 
> Richard
> 
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index c361e16cb7b..5e514eca19b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>>        return true;
>>      }
>>
>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>> +    {
>> +      gcc_assert (alignment_support_scheme == dr_aligned
>> +		  || alignment_support_scheme == dr_unaligned_supported);
>> +      gcc_assert (!grouped_load && !slp_perm);
>> +
>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>> +      for (j = 0; j < ncopies; j++)
>> +	{
>> +	  /* 1. Create the vector or array pointer update chain.  */
>> +	  if (j == 0 && !costing_p)
>> +	    {
>> +	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +		vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> +					     slp_node, &gs_info, &dataref_ptr,
>> +					     &vec_offsets);
>> +	      else
>> +		dataref_ptr
>> +		  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +					      at_loop, offset, &dummy, gsi,
>> +					      &ptr_incr, false, bump);
>> +	    }
>> +	  else if (!costing_p)
>> +	    {
>> +	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +	      if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +					       gsi, stmt_info, bump);
>> +	    }
>> +
>> +	  if (mask && !costing_p)
>> +	    vec_mask = vec_masks[j];
>> +
>> +	  gimple *new_stmt = NULL;
>> +	  for (i = 0; i < vec_num; i++)
>> +	    {
>> +	      tree final_mask = NULL_TREE;
>> +	      tree final_len = NULL_TREE;
>> +	      tree bias = NULL_TREE;
>> +	      if (!costing_p)
>> +		{
>> +		  if (loop_masks)
>> +		    final_mask
>> +		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +					    vec_num * ncopies, vectype,
>> +					    vec_num * j + i);
>> +		  if (vec_mask)
>> +		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +						   final_mask, vec_mask, gsi);
>> +
>> +		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +						   gsi, stmt_info, bump);
>> +		}
>> +
>> +	      /* 2. Create the vector-load in the loop.  */
>> +	      unsigned HOST_WIDE_INT align;
>> +	      if (gs_info.ifn != IFN_LAST)
>> +		{
>> +		  if (costing_p)
>> +		    {
>> +		      unsigned int cnunits = vect_nunits_for_cost (vectype);
>> +		      inside_cost
>> +			= record_stmt_cost (cost_vec, cnunits, scalar_load,
>> +					    stmt_info, 0, vect_body);
>> +		      continue;
>> +		    }
>> +		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +		    vec_offset = vec_offsets[vec_num * j + i];
>> +		  tree zero = build_zero_cst (vectype);
>> +		  tree scale = size_int (gs_info.scale);
>> +
>> +		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> +		    {
>> +		      if (loop_lens)
>> +			final_len
>> +			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> +					       vec_num * ncopies, vectype,
>> +					       vec_num * j + i, 1);
>> +		      else
>> +			final_len
>> +			  = build_int_cst (sizetype,
>> +					   TYPE_VECTOR_SUBPARTS (vectype));
>> +		      signed char biasval
>> +			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> +		      bias = build_int_cst (intQI_type_node, biasval);
>> +		      if (!final_mask)
>> +			{
>> +			  mask_vectype = truth_type_for (vectype);
>> +			  final_mask = build_minus_one_cst (mask_vectype);
>> +			}
>> +		    }
>> +
>> +		  gcall *call;
>> +		  if (final_len && final_mask)
>> +		    call
>> +		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
>> +						    dataref_ptr, vec_offset,
>> +						    scale, zero, final_mask,
>> +						    final_len, bias);
>> +		  else if (final_mask)
>> +		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
>> +						       dataref_ptr, vec_offset,
>> +						       scale, zero, final_mask);
>> +		  else
>> +		    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
>> +						       dataref_ptr, vec_offset,
>> +						       scale, zero);
>> +		  gimple_call_set_nothrow (call, true);
>> +		  new_stmt = call;
>> +		  data_ref = NULL_TREE;
>> +		}
>> +	      else
>> +		{
>> +		  /* Emulated gather-scatter.  */
>> +		  gcc_assert (!final_mask);
>> +		  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
>> +		  if (costing_p)
>> +		    {
>> +		      /* For emulated gathers N offset vector element
>> +			 offset add is consumed by the load).  */
>> +		      inside_cost = record_stmt_cost (cost_vec, const_nunits,
>> +						      vec_to_scalar, stmt_info,
>> +						      0, vect_body);
>> +		      /* N scalar loads plus gathering them into a
>> +			 vector.  */
>> +		      inside_cost
>> +			= record_stmt_cost (cost_vec, const_nunits, scalar_load,
>> +					    stmt_info, 0, vect_body);
>> +		      inside_cost
>> +			= record_stmt_cost (cost_vec, 1, vec_construct,
>> +					    stmt_info, 0, vect_body);
>> +		      continue;
>> +		    }
>> +		  unsigned HOST_WIDE_INT const_offset_nunits
>> +		    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> +			.to_constant ();
>> +		  vec<constructor_elt, va_gc> *ctor_elts;
>> +		  vec_alloc (ctor_elts, const_nunits);
>> +		  gimple_seq stmts = NULL;
>> +		  /* We support offset vectors with more elements
>> +		     than the data vector for now.  */
>> +		  unsigned HOST_WIDE_INT factor
>> +		    = const_offset_nunits / const_nunits;
>> +		  vec_offset = vec_offsets[j / factor];
>> +		  unsigned elt_offset = (j % factor) * const_nunits;
>> +		  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> +		  tree scale = size_int (gs_info.scale);
>> +		  align = get_object_alignment (DR_REF (first_dr_info->dr));
>> +		  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
>> +		  for (unsigned k = 0; k < const_nunits; ++k)
>> +		    {
>> +		      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
>> +					      bitsize_int (k + elt_offset));
>> +		      tree idx
>> +			= gimple_build (&stmts, BIT_FIELD_REF, idx_type,
>> +					vec_offset, TYPE_SIZE (idx_type), boff);
>> +		      idx = gimple_convert (&stmts, sizetype, idx);
>> +		      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
>> +					  scale);
>> +		      tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> +					       TREE_TYPE (dataref_ptr),
>> +					       dataref_ptr, idx);
>> +		      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> +		      tree elt = make_ssa_name (TREE_TYPE (vectype));
>> +		      tree ref = build2 (MEM_REF, ltype, ptr,
>> +					 build_int_cst (ref_type, 0));
>> +		      new_stmt = gimple_build_assign (elt, ref);
>> +		      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
>> +		      gimple_seq_add_stmt (&stmts, new_stmt);
>> +		      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>> +		    }
>> +		  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> +		  new_stmt = gimple_build_assign (
>> +		    NULL_TREE, build_constructor (vectype, ctor_elts));
>> +		  data_ref = NULL_TREE;
>> +		}
>> +
>> +	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +	      /* DATA_REF is null if we've already built the statement.  */
>> +	      if (data_ref)
>> +		{
>> +		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +		  new_stmt = gimple_build_assign (vec_dest, data_ref);
>> +		}
>> +	      new_temp = make_ssa_name (vec_dest, new_stmt);
>> +	      gimple_set_lhs (new_stmt, new_temp);
>> +	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +
>> +	      /* Store vector loads in the corresponding SLP_NODE.  */
>> +	      if (slp)
>> +		slp_node->push_vec_def (new_stmt);
>> +	    }
>> +
>> +	  if (!slp && !costing_p)
>> +	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +	}
>> +
>> +      if (!slp && !costing_p)
>> +	*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> +
>> +      if (costing_p)
>> +	{
>> +	  if (dump_enabled_p ())
>> +	    dump_printf_loc (MSG_NOTE, vect_location,
>> +			     "vect_model_load_cost: inside_cost = %u, "
>> +			     "prologue_cost = %u .\n",
>> +			     inside_cost, prologue_cost);
>> +	}
>> +      return true;
>> +    }
>> +
>>    poly_uint64 group_elt = 0;
>>    unsigned int inside_cost = 0, prologue_cost = 0;
>>    for (j = 0; j < ncopies; j++)
>> @@ -10504,12 +10716,6 @@ vectorizable_load (vec_info *vinfo,
>>  		  gcc_assert (!compute_in_loop);
>>  		}
>>  	    }
>> -	  else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -	    {
>> -	      vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> -					   slp_node, &gs_info, &dataref_ptr,
>> -					   &vec_offsets);
>> -	    }
>>  	  else
>>  	    dataref_ptr
>>  	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> @@ -10525,7 +10731,7 @@ vectorizable_load (vec_info *vinfo,
>>  	  if (dataref_offset)
>>  	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>>  					      bump);
>> -	  else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +	  else
>>  	    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>>  					   stmt_info, bump);
>>  	  if (mask)
>> @@ -10551,7 +10757,7 @@ vectorizable_load (vec_info *vinfo,
>>  		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>>  					       final_mask, vec_mask, gsi);
>>
>> -	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +	      if (i > 0)
>>  		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>>  					       gsi, stmt_info, bump);
>>  	    }
>> @@ -10562,139 +10768,11 @@ vectorizable_load (vec_info *vinfo,
>>  	    case dr_aligned:
>>  	    case dr_unaligned_supported:
>>  	      {
>> -		unsigned int misalign;
>> -		unsigned HOST_WIDE_INT align;
>> -
>> -		if (memory_access_type == VMAT_GATHER_SCATTER
>> -		    && gs_info.ifn != IFN_LAST)
>> -		  {
>> -		    if (costing_p)
>> -		      {
>> -			unsigned int cnunits = vect_nunits_for_cost (vectype);
>> -			inside_cost
>> -			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
>> -					      stmt_info, 0, vect_body);
>> -			break;
>> -		      }
>> -		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -		      vec_offset = vec_offsets[vec_num * j + i];
>> -		    tree zero = build_zero_cst (vectype);
>> -		    tree scale = size_int (gs_info.scale);
>> -
>> -		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> -		      {
>> -			if (loop_lens)
>> -			  final_len
>> -			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> -						 vec_num * ncopies, vectype,
>> -						 vec_num * j + i, 1);
>> -			else
>> -			  final_len
>> -			    = build_int_cst (sizetype,
>> -					     TYPE_VECTOR_SUBPARTS (vectype));
>> -			signed char biasval
>> -			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> -			bias = build_int_cst (intQI_type_node, biasval);
>> -			if (!final_mask)
>> -			  {
>> -			    mask_vectype = truth_type_for (vectype);
>> -			    final_mask = build_minus_one_cst (mask_vectype);
>> -			  }
>> -		      }
>> -
>> -		    gcall *call;
>> -		    if (final_len && final_mask)
>> -		      call = gimple_build_call_internal (
>> -			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
>> -			scale, zero, final_mask, final_len, bias);
>> -		    else if (final_mask)
>> -		      call
>> -			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
>> -						      dataref_ptr, vec_offset,
>> -						      scale, zero, final_mask);
>> -		    else
>> -		      call
>> -			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
>> -						      dataref_ptr, vec_offset,
>> -						      scale, zero);
>> -		    gimple_call_set_nothrow (call, true);
>> -		    new_stmt = call;
>> -		    data_ref = NULL_TREE;
>> -		    break;
>> -		  }
>> -		else if (memory_access_type == VMAT_GATHER_SCATTER)
>> -		  {
>> -		    /* Emulated gather-scatter.  */
>> -		    gcc_assert (!final_mask);
>> -		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
>> -		    if (costing_p)
>> -		      {
>> -			/* For emulated gathers N offset vector element
>> -			   offset add is consumed by the load).  */
>> -			inside_cost
>> -			  = record_stmt_cost (cost_vec, const_nunits,
>> -					      vec_to_scalar, stmt_info, 0,
>> -					      vect_body);
>> -			/* N scalar loads plus gathering them into a
>> -			   vector.  */
>> -			inside_cost = record_stmt_cost (cost_vec, const_nunits,
>> -							scalar_load, stmt_info,
>> -							0, vect_body);
>> -			inside_cost
>> -			  = record_stmt_cost (cost_vec, 1, vec_construct,
>> -					      stmt_info, 0, vect_body);
>> -			break;
>> -		      }
>> -		    unsigned HOST_WIDE_INT const_offset_nunits
>> -		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> -			  .to_constant ();
>> -		    vec<constructor_elt, va_gc> *ctor_elts;
>> -		    vec_alloc (ctor_elts, const_nunits);
>> -		    gimple_seq stmts = NULL;
>> -		    /* We support offset vectors with more elements
>> -		       than the data vector for now.  */
>> -		    unsigned HOST_WIDE_INT factor
>> -		      = const_offset_nunits / const_nunits;
>> -		    vec_offset = vec_offsets[j / factor];
>> -		    unsigned elt_offset = (j % factor) * const_nunits;
>> -		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> -		    tree scale = size_int (gs_info.scale);
>> -		    align = get_object_alignment (DR_REF (first_dr_info->dr));
>> -		    tree ltype
>> -		      = build_aligned_type (TREE_TYPE (vectype), align);
>> -		    for (unsigned k = 0; k < const_nunits; ++k)
>> -		      {
>> -			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
>> -						bitsize_int (k + elt_offset));
>> -			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
>> -						 idx_type, vec_offset,
>> -						 TYPE_SIZE (idx_type), boff);
>> -			idx = gimple_convert (&stmts, sizetype, idx);
>> -			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
>> -					    scale);
>> -			tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> -						 TREE_TYPE (dataref_ptr),
>> -						 dataref_ptr, idx);
>> -			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> -			tree elt = make_ssa_name (TREE_TYPE (vectype));
>> -			tree ref = build2 (MEM_REF, ltype, ptr,
>> -					   build_int_cst (ref_type, 0));
>> -			new_stmt = gimple_build_assign (elt, ref);
>> -			gimple_set_vuse (new_stmt,
>> -					 gimple_vuse (gsi_stmt (*gsi)));
>> -			gimple_seq_add_stmt (&stmts, new_stmt);
>> -			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>> -		      }
>> -		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> -		    new_stmt = gimple_build_assign (
>> -		      NULL_TREE, build_constructor (vectype, ctor_elts));
>> -		    data_ref = NULL_TREE;
>> -		    break;
>> -		  }
>> -
>>  		if (costing_p)
>>  		  break;
>>
>> +		unsigned int misalign;
>> +		unsigned HOST_WIDE_INT align;
>>  		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>>  		if (alignment_support_scheme == dr_aligned)
>>  		  misalign = 0;
>> @@ -11156,10 +11234,9 @@ vectorizable_load (vec_info *vinfo,
>>
>>    if (costing_p)
>>      {
>> -      gcc_assert (memory_access_type != VMAT_INVARIANT
>> -		  && memory_access_type != VMAT_ELEMENTWISE
>> -		  && memory_access_type != VMAT_STRIDED_SLP
>> -		  && memory_access_type != VMAT_LOAD_STORE_LANES);
>> +      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
>> +		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> +		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>>        if (dump_enabled_p ())
>>  	dump_printf_loc (MSG_NOTE, vect_location,
>>  			 "vect_model_load_cost: inside_cost = %u, "
>> --
>> 2.39.1
  
Richard Sandiford Aug. 14, 2023, 2:16 p.m. UTC | #3
"Kewen.Lin" <linkw@linux.ibm.com> writes:
> Hi Richard,
>
> on 2023/8/14 20:20, Richard Sandiford wrote:
>> Thanks for the clean-ups.  But...
>> 
>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>> Hi,
>>>
>>> Following Richi's suggestion [1], this patch is to move the
>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>>> of function vectorizable_load to its own loop.  Basically
>>> it duplicates the final loop nest, clean up some useless
>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>>> unreachable code.  Also remove the corresponding handlings
>>> in the final loop nest.
>>>
>>> Bootstrapped and regtested on x86_64-redhat-linux,
>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>>
>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>>
>>> Is it ok for trunk?
>>>
>>> BR,
>>> Kewen
>>> -----
>>>
>>> gcc/ChangeLog:
>>>
>>> 	* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>> 	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>> 	and update the final nest accordingly.
>>> ---
>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>>  1 file changed, 219 insertions(+), 142 deletions(-)
>> 
>> ...that seems like quite a lot of +s.  Is there nothing we can do to
>> avoid the cut-&-paste?
>
> Thanks for the comments!  I'm not sure if I get your question, if we
> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
> inevitable?  Your concern is mainly about git blame history?

No, it was more that 219-142=77, so it seems like a lot of lines
are being duplicated rather than simply being moved.  (Unlike for
VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
was a clear improvement.)

So I was just wondering if there was any obvious factoring-out that
could be done to reduce the duplication.

Thanks,
Richard
  
Kewen.Lin Aug. 15, 2023, 2:33 a.m. UTC | #4
on 2023/8/14 22:16, Richard Sandiford wrote:
> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>> Hi Richard,
>>
>> on 2023/8/14 20:20, Richard Sandiford wrote:
>>> Thanks for the clean-ups.  But...
>>>
>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>>> Hi,
>>>>
>>>> Following Richi's suggestion [1], this patch is to move the
>>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>>>> of function vectorizable_load to its own loop.  Basically
>>>> it duplicates the final loop nest, clean up some useless
>>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>>>> unreachable code.  Also remove the corresponding handlings
>>>> in the final loop nest.
>>>>
>>>> Bootstrapped and regtested on x86_64-redhat-linux,
>>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>>>
>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>>>
>>>> Is it ok for trunk?
>>>>
>>>> BR,
>>>> Kewen
>>>> -----
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>> 	* tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>>> 	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>>> 	and update the final nest accordingly.
>>>> ---
>>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>>>  1 file changed, 219 insertions(+), 142 deletions(-)
>>>
>>> ...that seems like quite a lot of +s.  Is there nothing we can do to
>>> avoid the cut-&-paste?
>>
>> Thanks for the comments!  I'm not sure if I get your question, if we
>> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
>> inevitable?  Your concern is mainly about git blame history?
> 
> No, it was more that 219-142=77, so it seems like a lot of lines
> are being duplicated rather than simply being moved.  (Unlike for
> VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
> was a clear improvement.)
> 
> So I was just wondering if there was any obvious factoring-out that
> could be done to reduce the duplication.

ah, thanks for the clarification!

I think the main duplication is at the beginning and end of the loop
body; let's take a look at them in detail:

+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      gcc_assert (alignment_support_scheme == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported);
+      gcc_assert (!grouped_load && !slp_perm);
+
+      unsigned int inside_cost = 0, prologue_cost = 0;

// These above are newly added.

+      for (j = 0; j < ncopies; j++)
+	{
+	  /* 1. Create the vector or array pointer update chain.  */
+	  if (j == 0 && !costing_p)
+	    {
+	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+					     slp_node, &gs_info, &dataref_ptr,
+					     &vec_offsets);
+	      else
+		dataref_ptr
+		  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+					      at_loop, offset, &dummy, gsi,
+					      &ptr_incr, false, bump);
+	    }
+	  else if (!costing_p)
+	    {
+	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+	      if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+					       gsi, stmt_info, bump);
+	    }

// These are for dataref_ptr; in the final loop nest we deal with more cases
on simd_lane_access_p and diff_first_stmt_info, but don't handle
STMT_VINFO_GATHER_SCATTER_P any more.  Very little (one case) can be shared
between them, so IMHO factoring it out seems like overkill.

+
+	  if (mask && !costing_p)
+	    vec_mask = vec_masks[j];

// This is merged out of the j == 0 and j != 0 branches.

+
+	  gimple *new_stmt = NULL;
+	  for (i = 0; i < vec_num; i++)
+	    {
+	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
+	      if (!costing_p)
+		{
+		  if (loop_masks)
+		    final_mask
+		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					    vec_num * ncopies, vectype,
+					    vec_num * j + i);
+		  if (vec_mask)
+		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+						   final_mask, vec_mask, gsi);
+
+		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+						   gsi, stmt_info, bump);
+		}

// This part is directly copied from the original; the original gets updated
by removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't
consider this before; would you prefer me to factor this part out?

+	      if (gs_info.ifn != IFN_LAST)
+		{
...
+		}
+	      else
+		{
+		  /* Emulated gather-scatter.  */
...

// This part is just moved from the original.

+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      /* DATA_REF is null if we've already built the statement.  */
+	      if (data_ref)
+		{
+		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		  new_stmt = gimple_build_assign (vec_dest, data_ref);
+		}
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	      /* Store vector loads in the corresponding SLP_NODE.  */
+	      if (slp)
+		slp_node->push_vec_def (new_stmt);
+
+	  if (!slp && !costing_p)
+	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	}
+
+      if (!slp && !costing_p)
+	*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

// This part is some subsequent handling; it's duplicated from the original
but removes some more useless code.  I guess this part is not worth
factoring out?

+      if (costing_p)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_load_cost: inside_cost = %u, "
+			     "prologue_cost = %u .\n",
+			     inside_cost, prologue_cost);
+	}
+      return true;
+    }

// This duplicates the dumping; I guess it's unnecessary to factor it out.

Oh, I just noticed that this should be shortened to
"if (costing_p && dump_enabled_p ())" instead, just the same as what's
adopted for the VMAT_LOAD_STORE_LANES dumping.
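
That is, the costing dump at the end of this new loop would simply become:

      if (costing_p && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_load_cost: inside_cost = %u, "
			 "prologue_cost = %u .\n",
			 inside_cost, prologue_cost);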

BR,
Kewen
  
Richard Biener Aug. 15, 2023, 7:53 a.m. UTC | #5
On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> on 2023/8/14 22:16, Richard Sandiford wrote:
> > "Kewen.Lin" <linkw@linux.ibm.com> writes:
> >> Hi Richard,
> >>
> >> on 2023/8/14 20:20, Richard Sandiford wrote:
> >>> Thanks for the clean-ups.  But...
> >>>
> >>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
> >>>> Hi,
> >>>>
> >>>> Following Richi's suggestion [1], this patch is to move the
> >>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
> >>>> of function vectorizable_load to its own loop.  Basically
> >>>> it duplicates the final loop nest, clean up some useless
> >>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
> >>>> unreachable code.  Also remove the corresponding handlings
> >>>> in the final loop nest.
> >>>>
> >>>> Bootstrapped and regtested on x86_64-redhat-linux,
> >>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> >>>>
> >>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
> >>>>
> >>>> Is it ok for trunk?
> >>>>
> >>>> BR,
> >>>> Kewen
> >>>> -----
> >>>>
> >>>> gcc/ChangeLog:
> >>>>
> >>>>    * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> >>>>    VMAT_GATHER_SCATTER in the final loop nest to its own loop,
> >>>>    and update the final nest accordingly.
> >>>> ---
> >>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
> >>>>  1 file changed, 219 insertions(+), 142 deletions(-)
> >>>
> >>> ...that seems like quite a lot of +s.  Is there nothing we can do to
> >>> avoid the cut-&-paste?
> >>
> >> Thanks for the comments!  I'm not sure if I get your question, if we
> >> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
> >> inevitable?  Your concern is mainly about git blame history?
> >
> > No, it was more that 219-142=77, so it seems like a lot of lines
> > are being duplicated rather than simply being moved.  (Unlike for
> > VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
> > was a clear improvement.)
> >
> > So I was just wondering if there was any obvious factoring-out that
> > could be done to reduce the duplication.
>
> ah, thanks for the clarification!
>
> I think the main duplication are on the loop body beginning and end,
> let's take a look at them in details:
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +    {
> +      gcc_assert (alignment_support_scheme == dr_aligned
> +                 || alignment_support_scheme == dr_unaligned_supported);
> +      gcc_assert (!grouped_load && !slp_perm);
> +
> +      unsigned int inside_cost = 0, prologue_cost = 0;
>
> // These above are newly added.
>
> +      for (j = 0; j < ncopies; j++)
> +       {
> +         /* 1. Create the vector or array pointer update chain.  */
> +         if (j == 0 && !costing_p)
> +           {
> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> +                                            slp_node, &gs_info, &dataref_ptr,
> +                                            &vec_offsets);
> +             else
> +               dataref_ptr
> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> +                                             at_loop, offset, &dummy, gsi,
> +                                             &ptr_incr, false, bump);
> +           }
> +         else if (!costing_p)
> +           {
> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +                                              gsi, stmt_info, bump);
> +           }
>
> // These are for dataref_ptr, in the final looop nest we deal with more cases
> on simd_lane_access_p and diff_first_stmt_info, but don't handle
> STMT_VINFO_GATHER_SCATTER_P any more, very few (one case) can be shared between,
> IMHO factoring out it seems like a overkill.
>
> +
> +         if (mask && !costing_p)
> +           vec_mask = vec_masks[j];
>
> // It's merged out from j == 0 and j != 0
>
> +
> +         gimple *new_stmt = NULL;
> +         for (i = 0; i < vec_num; i++)
> +           {
> +             tree final_mask = NULL_TREE;
> +             tree final_len = NULL_TREE;
> +             tree bias = NULL_TREE;
> +             if (!costing_p)
> +               {
> +                 if (loop_masks)
> +                   final_mask
> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +                                           vec_num * ncopies, vectype,
> +                                           vec_num * j + i);
> +                 if (vec_mask)
> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +                                                  final_mask, vec_mask, gsi);
> +
> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +                                                  gsi, stmt_info, bump);
> +               }
>
> // This part is directly copied from the original, the original gets updated by
> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
> this before, do you prefer me to factor this part out?
>
> +             if (gs_info.ifn != IFN_LAST)
> +               {
> ...
> +               }
> +             else
> +               {
> +                 /* Emulated gather-scatter.  */
> ...
>
> // This part is just moved from the original.
>
> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +             /* DATA_REF is null if we've already built the statement.  */
> +             if (data_ref)
> +               {
> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
> +               }
> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> +             gimple_set_lhs (new_stmt, new_temp);
> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
> +             /* Store vector loads in the corresponding SLP_NODE.  */
> +             if (slp)
> +               slp_node->push_vec_def (new_stmt);
> +
> +         if (!slp && !costing_p)
> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +       }
> +
> +      if (!slp && !costing_p)
> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>
> // This part is some subsequent handlings, it's duplicated from the original
> but removing some more useless code.  I guess this part is not worthy
> being factored out?
>
> +      if (costing_p)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "vect_model_load_cost: inside_cost = %u, "
> +                            "prologue_cost = %u .\n",
> +                            inside_cost, prologue_cost);
> +       }
> +      return true;
> +    }
>
> // Duplicating the dumping, I guess it's unnecessary to be factored out.
>
> oh, I just noticed that this should be shorten as
> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
> adopted for VMAT_LOAD_STORE_LANES dumping.

Just to mention, the original motivating idea was that even though we
duplicate some code, we make it overall more readable and thus more
maintainable.  In the end we might have vectorizable_load () for
analysis but have not only load_vec_info_type but one for each VMAT_*,
which means multiple separate vect_transform_load () functions.
Currently vectorizable_load is structured very inconsistently; having
the transforms all hang off a single switch (vmat-kind) {} would be an
improvement IMHO.
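
For example, something along these lines (the vect_transform_load_*
helpers are purely hypothetical here and would first have to be carved
out of the existing vectorizable_load code):

  /* Sketch only: dispatch the transform phase per memory access kind.  */
  switch (memory_access_type)
    {
    case VMAT_LOAD_STORE_LANES:
      return vect_transform_load_lanes (vinfo, stmt_info, gsi, slp_node);
    case VMAT_GATHER_SCATTER:
      return vect_transform_load_gather_scatter (vinfo, stmt_info, gsi,
						  slp_node);
    case VMAT_ELEMENTWISE:
    case VMAT_STRIDED_SLP:
      return vect_transform_load_strided (vinfo, stmt_info, gsi, slp_node);
    default:
      return vect_transform_load_contiguous (vinfo, stmt_info, gsi,
					     slp_node);
    }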

But sure some of our internal APIs are verbose and maybe badly factored,
any improvement there is welcome.  Inventing new random APIs just to
save a few lines of code without actually making the code more readable
is IMHO bad.

But, if we can for example enhance prepare_vec_mask to handle both loop
and conditional mask and handle querying the mask that would be fine
(of course you need to check all uses to see if that makes sense).
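
For concreteness, a minimal sketch of such a helper, assuming the two
existing calls keep their current signatures (the helper name and its
exact parameter list are just an illustration):

  /* Sketch: fetch the loop mask (if any) for vector INDEX out of
     NVECTORS and combine it with the conditional mask VEC_MASK (if
     any), so the callers do not repeat this pattern.  */
  static tree
  vect_get_final_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		       vec_loop_masks *loop_masks, unsigned int nvectors,
		       tree vectype, tree mask_vectype, tree vec_mask,
		       unsigned int index)
  {
    tree final_mask = NULL_TREE;
    if (loop_masks)
      final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
				       nvectors, vectype, index);
    if (vec_mask)
      final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
				     final_mask, vec_mask, gsi);
    return final_mask;
  }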

Richard.

> BR,
> Kewen
  
Richard Sandiford Aug. 15, 2023, 8:44 a.m. UTC | #6
Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> on 2023/8/14 22:16, Richard Sandiford wrote:
>> > No, it was more that 219-142=77, so it seems like a lot of lines
>> > are being duplicated rather than simply being moved.  (Unlike for
>> > VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
>> > was a clear improvement.)
>> >
>> > So I was just wondering if there was any obvious factoring-out that
>> > could be done to reduce the duplication.
>>
>> ah, thanks for the clarification!
>>
>> I think the main duplication are on the loop body beginning and end,
>> let's take a look at them in details:
>>
>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>> +    {
>> +      gcc_assert (alignment_support_scheme == dr_aligned
>> +                 || alignment_support_scheme == dr_unaligned_supported);
>> +      gcc_assert (!grouped_load && !slp_perm);
>> +
>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>>
>> // These above are newly added.
>>
>> +      for (j = 0; j < ncopies; j++)
>> +       {
>> +         /* 1. Create the vector or array pointer update chain.  */
>> +         if (j == 0 && !costing_p)
>> +           {
>> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> +                                            slp_node, &gs_info, &dataref_ptr,
>> +                                            &vec_offsets);
>> +             else
>> +               dataref_ptr
>> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +                                             at_loop, offset, &dummy, gsi,
>> +                                             &ptr_incr, false, bump);
>> +           }
>> +         else if (!costing_p)
>> +           {
>> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                              gsi, stmt_info, bump);
>> +           }
>>
>> // These are for dataref_ptr, in the final looop nest we deal with more cases
>> on simd_lane_access_p and diff_first_stmt_info, but don't handle
>> STMT_VINFO_GATHER_SCATTER_P any more, very few (one case) can be shared between,
>> IMHO factoring out it seems like a overkill.
>>
>> +
>> +         if (mask && !costing_p)
>> +           vec_mask = vec_masks[j];
>>
>> // It's merged out from j == 0 and j != 0
>>
>> +
>> +         gimple *new_stmt = NULL;
>> +         for (i = 0; i < vec_num; i++)
>> +           {
>> +             tree final_mask = NULL_TREE;
>> +             tree final_len = NULL_TREE;
>> +             tree bias = NULL_TREE;
>> +             if (!costing_p)
>> +               {
>> +                 if (loop_masks)
>> +                   final_mask
>> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +                                           vec_num * ncopies, vectype,
>> +                                           vec_num * j + i);
>> +                 if (vec_mask)
>> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +                                                  final_mask, vec_mask, gsi);
>> +
>> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                                  gsi, stmt_info, bump);
>> +               }
>>
>> // This part is directly copied from the original, the original gets updated by
>> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
>> this before, do you prefer me to factor this part out?
>>
>> +             if (gs_info.ifn != IFN_LAST)
>> +               {
>> ...
>> +               }
>> +             else
>> +               {
>> +                 /* Emulated gather-scatter.  */
>> ...
>>
>> // This part is just moved from the original.
>>
>> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +             /* DATA_REF is null if we've already built the statement.  */
>> +             if (data_ref)
>> +               {
>> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
>> +               }
>> +             new_temp = make_ssa_name (vec_dest, new_stmt);
>> +             gimple_set_lhs (new_stmt, new_temp);
>> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +
>> +             /* Store vector loads in the corresponding SLP_NODE.  */
>> +             if (slp)
>> +               slp_node->push_vec_def (new_stmt);
>> +
>> +         if (!slp && !costing_p)
>> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +       }
>> +
>> +      if (!slp && !costing_p)
>> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>
>> // This part is some subsequent handlings, it's duplicated from the original
>> but removing some more useless code.  I guess this part is not worthy
>> being factored out?
>>
>> +      if (costing_p)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "vect_model_load_cost: inside_cost = %u, "
>> +                            "prologue_cost = %u .\n",
>> +                            inside_cost, prologue_cost);
>> +       }
>> +      return true;
>> +    }
>>
>> // Duplicating the dumping, I guess it's unnecessary to be factored out.
>>
>> oh, I just noticed that this should be shorten as
>> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
>> adopted for VMAT_LOAD_STORE_LANES dumping.
>
> Just to mention, the original motivational idea was even though we
> duplicate some
> code we make it overall more readable and thus maintainable.

Not sure I necessarily agree with the "thus".  Maybe it tends to be
true with a good, well-factored API.  But the internal vector APIs
make it extremely easy to get things wrong.  If we have multiple copies
of the same operation, the tendency is to fix bugs in the copy that the
bugs were seen in.  It's easy to forget that other copies exist
elsewhere that probably need updating in the same way.

> In the end we
> might have vectorizable_load () for analysis but have not only
> load_vec_info_type but one for each VMAT_* which means multiple separate
> vect_transform_load () functions.  Currently vectorizable_load is structured
> very inconsistently, having the transforms all hang off a single
> switch (vmat-kind) {} would be an improvement IMHO.

Yeah, agree vectorizable_load ought to be refactored.

> But sure some of our internal APIs are verbose and maybe badly factored,
> any improvement there is welcome.  Inventing new random APIs just to
> save a few lines of code without actually making the code more readable
> is IMHO bad.

OK, fair enough.  So the idea is: see where we end up and then try to
improve/factor the APIs in a less peephole way?

Thanks,
Richard

> But, if we can for example enhance prepare_vec_mask to handle both loop
> and conditional mask and handle querying the mask that would be fine
> (of course you need to check all uses to see if that makes sense).
>
> Richard.
>
>> BR,
>> Kewen
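
For concreteness, the "switch (vmat-kind)" structure Richi describes above
might look roughly like the sketch below.  The per-VMAT helper names and
signatures here are purely illustrative (none of them exist in
tree-vect-stmts.cc); the point is only that vectorizable_load would keep the
analysis while the transforms hang off a single dispatch:

static bool
vect_transform_load_1 (vec_info *vinfo, stmt_vec_info stmt_info,
                       gimple_stmt_iterator *gsi, gimple **vec_stmt,
                       slp_tree slp_node,
                       vect_memory_access_type memory_access_type)
{
  switch (memory_access_type)
    {
    case VMAT_INVARIANT:
      return vect_transform_load_invariant (vinfo, stmt_info, gsi, vec_stmt,
                                            slp_node);
    case VMAT_ELEMENTWISE:
    case VMAT_STRIDED_SLP:
      return vect_transform_load_strided (vinfo, stmt_info, gsi, vec_stmt,
                                          slp_node);
    case VMAT_LOAD_STORE_LANES:
      return vect_transform_load_lanes (vinfo, stmt_info, gsi, vec_stmt,
                                        slp_node);
    case VMAT_GATHER_SCATTER:
      return vect_transform_load_gather_scatter (vinfo, stmt_info, gsi,
                                                 vec_stmt, slp_node);
    case VMAT_CONTIGUOUS:
    case VMAT_CONTIGUOUS_PERMUTE:
    case VMAT_CONTIGUOUS_REVERSE:
      /* These three share too much code to be worth splitting further.  */
      return vect_transform_load_contiguous (vinfo, stmt_info, gsi, vec_stmt,
                                             slp_node);
    default:
      /* Remaining access types omitted from this sketch.  */
      gcc_unreachable ();
    }
}

Whether each case takes stmt_info, gsi and slp_node in exactly this form, or
something closer to the per-VMAT vec-info types Richi mentions alongside
load_vec_info_type, is left open in this sketch.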
  
Richard Biener Aug. 15, 2023, 9:04 a.m. UTC | #7
On Tue, Aug 15, 2023 at 10:44 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> on 2023/8/14 22:16, Richard Sandiford wrote:
> >> > No, it was more that 219-142=77, so it seems like a lot of lines
> >> > are being duplicated rather than simply being moved.  (Unlike for
> >> > VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
> >> > was a clear improvement.)
> >> >
> >> > So I was just wondering if there was any obvious factoring-out that
> >> > could be done to reduce the duplication.
> >>
> >> ah, thanks for the clarification!
> >>
>> I think the main duplication is at the loop body beginning and end,
>> let's take a look at them in detail:
> >>
> >> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +    {
> >> +      gcc_assert (alignment_support_scheme == dr_aligned
> >> +                 || alignment_support_scheme == dr_unaligned_supported);
> >> +      gcc_assert (!grouped_load && !slp_perm);
> >> +
> >> +      unsigned int inside_cost = 0, prologue_cost = 0;
> >>
> >> // These above are newly added.
> >>
> >> +      for (j = 0; j < ncopies; j++)
> >> +       {
> >> +         /* 1. Create the vector or array pointer update chain.  */
> >> +         if (j == 0 && !costing_p)
> >> +           {
> >> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> >> +                                            slp_node, &gs_info, &dataref_ptr,
> >> +                                            &vec_offsets);
> >> +             else
> >> +               dataref_ptr
> >> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> >> +                                             at_loop, offset, &dummy, gsi,
> >> +                                             &ptr_incr, false, bump);
> >> +           }
> >> +         else if (!costing_p)
> >> +           {
> >> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> >> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> +                                              gsi, stmt_info, bump);
> >> +           }
> >>
>> // These are for dataref_ptr; in the final loop nest we deal with more cases
>> on simd_lane_access_p and diff_first_stmt_info, but don't handle
>> STMT_VINFO_GATHER_SCATTER_P any more, so very few (one case) can be shared between them;
>> IMHO factoring it out seems like overkill.
> >>
> >> +
> >> +         if (mask && !costing_p)
> >> +           vec_mask = vec_masks[j];
> >>
> >> // It's merged out from j == 0 and j != 0
> >>
> >> +
> >> +         gimple *new_stmt = NULL;
> >> +         for (i = 0; i < vec_num; i++)
> >> +           {
> >> +             tree final_mask = NULL_TREE;
> >> +             tree final_len = NULL_TREE;
> >> +             tree bias = NULL_TREE;
> >> +             if (!costing_p)
> >> +               {
> >> +                 if (loop_masks)
> >> +                   final_mask
> >> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> +                                           vec_num * ncopies, vectype,
> >> +                                           vec_num * j + i);
> >> +                 if (vec_mask)
> >> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> +                                                  final_mask, vec_mask, gsi);
> >> +
> >> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> +                                                  gsi, stmt_info, bump);
> >> +               }
> >>
> >> // This part is directly copied from the original, the original gets updated by
> >> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
> >> this before, do you prefer me to factor this part out?
> >>
> >> +             if (gs_info.ifn != IFN_LAST)
> >> +               {
> >> ...
> >> +               }
> >> +             else
> >> +               {
> >> +                 /* Emulated gather-scatter.  */
> >> ...
> >>
> >> // This part is just moved from the original.
> >>
> >> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> +             /* DATA_REF is null if we've already built the statement.  */
> >> +             if (data_ref)
> >> +               {
> >> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> +               }
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +
> >> +             /* Store vector loads in the corresponding SLP_NODE.  */
> >> +             if (slp)
> >> +               slp_node->push_vec_def (new_stmt);
> >> +
> >> +         if (!slp && !costing_p)
> >> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> +       }
> >> +
> >> +      if (!slp && !costing_p)
> >> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>
> >> // This part is some subsequent handlings, it's duplicated from the original
>> but removing some more useless code.  I guess this part is not worth
> >> being factored out?
> >>
> >> +      if (costing_p)
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_NOTE, vect_location,
> >> +                            "vect_model_load_cost: inside_cost = %u, "
> >> +                            "prologue_cost = %u .\n",
> >> +                            inside_cost, prologue_cost);
> >> +       }
> >> +      return true;
> >> +    }
> >>
> >> // Duplicating the dumping, I guess it's unnecessary to be factored out.
> >>
>> oh, I just noticed that this should be shortened to
> >> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
> >> adopted for VMAT_LOAD_STORE_LANES dumping.
> >
> > Just to mention, the original motivational idea was that even though we
> > duplicate some
> > code we make it overall more readable and thus maintainable.
>
> Not sure I necessarily agree with the "thus".  Maybe it tends to be
> true with a good, well-factored API.  But the internal vector APIs
> make it extremely easy to get things wrong.  If we have multiple copies
> of the same operation, the tendency is to fix bugs in the copy that the
> bugs were seen in.  It's easy to forget that other copies exist
> elsewhere that probably need updating in the same way.
>
> > In the end we
> > might have vectorizable_load () for analysis but have not only
> > load_vec_info_type but one for each VMAT_* which means multiple separate
> > vect_transform_load () functions.  Currently vectorizable_load is structured
> > very inconsistently, having the transforms all hang off a single
> > switch (vmat-kind) {} would be an improvement IMHO.
>
> Yeah, agree vectorizable_load ought to be refactored.
>
> > But sure some of our internal APIs are verbose and maybe badly factored,
> > any improvement there is welcome.  Inventing new random APIs just to
> > save a few lines of code without actually making the code more readable
> > is IMHO bad.
>
> OK, fair enough.  So the idea is: see where we end up and then try to
> improve/factor the APIs in a less peephole way?

Yeah, I think that's the only good way forward.

> Thanks,
> Richard
>
> > But, if we can for example enhance prepare_vec_mask to handle both loop
> > and conditional mask and handle querying the mask that would be fine
> > (of course you need to check all uses to see if that makes sense).
> >
> > Richard.
> >
> >> BR,
> >> Kewen
  
Richard Sandiford Aug. 15, 2023, 9:13 a.m. UTC | #8
Richard Biener <richard.guenther@gmail.com> writes:
>> OK, fair enough.  So the idea is: see where we end up and then try to
>> improve/factor the APIs in a less peephole way?
>
> Yeah, I think that's the only good way forward.

OK, no objection from me.  Sorry for holding the patch up.

Richard
  
Kewen.Lin Aug. 15, 2023, 11:47 a.m. UTC | #9
on 2023/8/15 15:53, Richard Biener wrote:
> On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> on 2023/8/14 22:16, Richard Sandiford wrote:
>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>>> Hi Richard,
>>>>
>>>> on 2023/8/14 20:20, Richard Sandiford wrote:
>>>>> Thanks for the clean-ups.  But...
>>>>>
>>>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>>>>> Hi,
>>>>>>
>>>>>> Following Richi's suggestion [1], this patch is to move the
>>>>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>>>>>> of function vectorizable_load to its own loop.  Basically
>>>>>> it duplicates the final loop nest, clean up some useless
>>>>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>>>>>> unreachable code.  Also remove the corresponding handlings
>>>>>> in the final loop nest.
>>>>>>
>>>>>> Bootstrapped and regtested on x86_64-redhat-linux,
>>>>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>>>>>
>>>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>>>>>
>>>>>> Is it ok for trunk?
>>>>>>
>>>>>> BR,
>>>>>> Kewen
>>>>>> -----
>>>>>>
>>>>>> gcc/ChangeLog:
>>>>>>
>>>>>>    * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>>>>>    VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>>>>>    and update the final nest accordingly.
>>>>>> ---
>>>>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>>>>>  1 file changed, 219 insertions(+), 142 deletions(-)
>>>>>
>>>>> ...that seems like quite a lot of +s.  Is there nothing we can do to
>>>>> avoid the cut-&-paste?
>>>>
>>>> Thanks for the comments!  I'm not sure if I get your question; if we
>>>> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
>>>> inevitable?  Your concern is mainly about git blame history?
>>>
>>> No, it was more that 219-142=77, so it seems like a lot of lines
>>> are being duplicated rather than simply being moved.  (Unlike for
>>> VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
>>> was a clear improvement.)
>>>
>>> So I was just wondering if there was any obvious factoring-out that
>>> could be done to reduce the duplication.
>>
>> ah, thanks for the clarification!
>>
>> I think the main duplication is at the loop body beginning and end,
>> let's take a look at them in detail:
>>
>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>> +    {
>> +      gcc_assert (alignment_support_scheme == dr_aligned
>> +                 || alignment_support_scheme == dr_unaligned_supported);
>> +      gcc_assert (!grouped_load && !slp_perm);
>> +
>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>>
>> // These above are newly added.
>>
>> +      for (j = 0; j < ncopies; j++)
>> +       {
>> +         /* 1. Create the vector or array pointer update chain.  */
>> +         if (j == 0 && !costing_p)
>> +           {
>> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>> +                                            slp_node, &gs_info, &dataref_ptr,
>> +                                            &vec_offsets);
>> +             else
>> +               dataref_ptr
>> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +                                             at_loop, offset, &dummy, gsi,
>> +                                             &ptr_incr, false, bump);
>> +           }
>> +         else if (!costing_p)
>> +           {
>> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                              gsi, stmt_info, bump);
>> +           }
>>
>> // These are for dataref_ptr; in the final loop nest we deal with more cases
>> on simd_lane_access_p and diff_first_stmt_info, but don't handle
>> STMT_VINFO_GATHER_SCATTER_P any more, so very few (one case) can be shared between them;
>> IMHO factoring it out seems like overkill.
>>
>> +
>> +         if (mask && !costing_p)
>> +           vec_mask = vec_masks[j];
>>
>> // It's merged out from j == 0 and j != 0
>>
>> +
>> +         gimple *new_stmt = NULL;
>> +         for (i = 0; i < vec_num; i++)
>> +           {
>> +             tree final_mask = NULL_TREE;
>> +             tree final_len = NULL_TREE;
>> +             tree bias = NULL_TREE;
>> +             if (!costing_p)
>> +               {
>> +                 if (loop_masks)
>> +                   final_mask
>> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +                                           vec_num * ncopies, vectype,
>> +                                           vec_num * j + i);
>> +                 if (vec_mask)
>> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +                                                  final_mask, vec_mask, gsi);
>> +
>> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                                  gsi, stmt_info, bump);
>> +               }
>>
>> // This part is directly copied from the original, the original gets updated by
>> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
>> this before, do you prefer me to factor this part out?
>>
>> +             if (gs_info.ifn != IFN_LAST)
>> +               {
>> ...
>> +               }
>> +             else
>> +               {
>> +                 /* Emulated gather-scatter.  */
>> ...
>>
>> // This part is just moved from the original.
>>
>> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +             /* DATA_REF is null if we've already built the statement.  */
>> +             if (data_ref)
>> +               {
>> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
>> +               }
>> +             new_temp = make_ssa_name (vec_dest, new_stmt);
>> +             gimple_set_lhs (new_stmt, new_temp);
>> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +
>> +             /* Store vector loads in the corresponding SLP_NODE.  */
>> +             if (slp)
>> +               slp_node->push_vec_def (new_stmt);
>> +
>> +         if (!slp && !costing_p)
>> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +       }
>> +
>> +      if (!slp && !costing_p)
>> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>
>> // This part is some subsequent handlings, it's duplicated from the original
>> but removing some more useless code.  I guess this part is not worth
>> being factored out?
>>
>> +      if (costing_p)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "vect_model_load_cost: inside_cost = %u, "
>> +                            "prologue_cost = %u .\n",
>> +                            inside_cost, prologue_cost);
>> +       }
>> +      return true;
>> +    }
>>
>> // Duplicating the dumping, I guess it's unnecessary to be factored out.
>>
>> oh, I just noticed that this should be shortened to
>> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
>> adopted for VMAT_LOAD_STORE_LANES dumping.
> 
> Just to mention, the original motivational idea was that even though we
> duplicate some
> code we make it overall more readable and thus maintainable.  In the end we
> might have vectorizable_load () for analysis but have not only
> load_vec_info_type but one for each VMAT_* which means multiple separate
> vect_transform_load () functions.  Currently vectorizable_load is structured
> very inconsistently, having the transforms all hang off a single
> switch (vmat-kind) {} would be an improvement IMHO.

Thanks for the comments!  With these two patches, the final loop nest now
only handles VMAT_CONTIGUOUS, VMAT_CONTIGUOUS_REVERSE and VMAT_CONTIGUOUS_PERMUTE.
IMHO their handlings are highly bundled; re-structuring them could introduce
more duplicated code and the risk of incomplete bug fixes that Richard pointed
out.  But if I read the above comments right, our final goal seems to be to
separate all of them?  I wonder if you both prefer to separate them further?

> 
> But sure some of our internal APIs are verbose and maybe badly factored,
> any improvement there is welcome.  Inventing new random APIs just to
> save a few lines of code without actually making the code more readable
> is IMHO bad.
> 
> But, if we can for example enhance prepare_vec_mask to handle both loop
> and conditional mask and handle querying the mask that would be fine
> (of course you need to check all uses to see if that makes sense).

OK, will keep in mind, also add the example to my TODO list. :)

BR,
Kewen
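
Picking up the prepare_vec_mask suggestion above, a minimal sketch of a
combined helper, reusing vect_get_loop_mask and prepare_vec_mask exactly as
they are called in the patch; the wrapper name vect_get_final_mask and its
parameter list are hypothetical:

/* Hypothetical wrapper: compute the effective mask for vector copy INDEX out
   of NVECTORS, combining the loop mask (when loop masking is in use) with
   the conditional mask VEC_MASK (when the access itself is conditional).  */
static tree
vect_get_final_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
                     vec_loop_masks *loop_masks, unsigned int nvectors,
                     tree vectype, tree mask_vectype, tree vec_mask,
                     unsigned int index)
{
  tree final_mask = NULL_TREE;
  if (loop_masks)
    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, nvectors,
                                     vectype, index);
  if (vec_mask)
    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
                                   vec_mask, gsi);
  return final_mask;
}

Each of the duplicated blocks in vectorizable_load would then reduce to a
single call such as

              final_mask = vect_get_final_mask (loop_vinfo, gsi, loop_masks,
                                                vec_num * ncopies, vectype,
                                                mask_vectype, vec_mask,
                                                vec_num * j + i);

Whether that is actually an improvement over open-coding it depends on
checking all uses, as noted above.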
  
Richard Biener Aug. 15, 2023, 12:07 p.m. UTC | #10
On Tue, Aug 15, 2023 at 1:47 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> on 2023/8/15 15:53, Richard Biener wrote:
> > On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> on 2023/8/14 22:16, Richard Sandiford wrote:
> >>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
> >>>> Hi Richard,
> >>>>
> >>>> on 2023/8/14 20:20, Richard Sandiford wrote:
> >>>>> Thanks for the clean-ups.  But...
> >>>>>
> >>>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
> >>>>>> Hi,
> >>>>>>
> >>>>>> Following Richi's suggestion [1], this patch is to move the
> >>>>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
> >>>>>> of function vectorizable_load to its own loop.  Basically
> >>>>>> it duplicates the final loop nest, clean up some useless
> >>>>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
> >>>>>> unreachable code.  Also remove the corresponding handlings
> >>>>>> in the final loop nest.
> >>>>>>
> >>>>>> Bootstrapped and regtested on x86_64-redhat-linux,
> >>>>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> >>>>>>
> >>>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
> >>>>>>
> >>>>>> Is it ok for trunk?
> >>>>>>
> >>>>>> BR,
> >>>>>> Kewen
> >>>>>> -----
> >>>>>>
> >>>>>> gcc/ChangeLog:
> >>>>>>
> >>>>>>    * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> >>>>>>    VMAT_GATHER_SCATTER in the final loop nest to its own loop,
> >>>>>>    and update the final nest accordingly.
> >>>>>> ---
> >>>>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
> >>>>>>  1 file changed, 219 insertions(+), 142 deletions(-)
> >>>>>
> >>>>> ...that seems like quite a lot of +s.  Is there nothing we can do to
> >>>>> avoid the cut-&-paste?
> >>>>
> >>>> Thanks for the comments!  I'm not sure if I get your question; if we
> >>>> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
> >>>> inevitable?  Your concern is mainly about git blame history?
> >>>
> >>> No, it was more that 219-142=77, so it seems like a lot of lines
> >>> are being duplicated rather than simply being moved.  (Unlike for
> >>> VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
> >>> was a clear improvement.)
> >>>
> >>> So I was just wondering if there was any obvious factoring-out that
> >>> could be done to reduce the duplication.
> >>
> >> ah, thanks for the clarification!
> >>
> >> I think the main duplication is at the loop body beginning and end,
> >> let's take a look at them in detail:
> >>
> >> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +    {
> >> +      gcc_assert (alignment_support_scheme == dr_aligned
> >> +                 || alignment_support_scheme == dr_unaligned_supported);
> >> +      gcc_assert (!grouped_load && !slp_perm);
> >> +
> >> +      unsigned int inside_cost = 0, prologue_cost = 0;
> >>
> >> // These above are newly added.
> >>
> >> +      for (j = 0; j < ncopies; j++)
> >> +       {
> >> +         /* 1. Create the vector or array pointer update chain.  */
> >> +         if (j == 0 && !costing_p)
> >> +           {
> >> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> >> +                                            slp_node, &gs_info, &dataref_ptr,
> >> +                                            &vec_offsets);
> >> +             else
> >> +               dataref_ptr
> >> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> >> +                                             at_loop, offset, &dummy, gsi,
> >> +                                             &ptr_incr, false, bump);
> >> +           }
> >> +         else if (!costing_p)
> >> +           {
> >> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> >> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> +                                              gsi, stmt_info, bump);
> >> +           }
> >>
> >> // These are for dataref_ptr; in the final loop nest we deal with more cases
> >> on simd_lane_access_p and diff_first_stmt_info, but don't handle
> >> STMT_VINFO_GATHER_SCATTER_P any more, so very few (one case) can be shared between them;
> >> IMHO factoring it out seems like overkill.
> >>
> >> +
> >> +         if (mask && !costing_p)
> >> +           vec_mask = vec_masks[j];
> >>
> >> // It's merged out from j == 0 and j != 0
> >>
> >> +
> >> +         gimple *new_stmt = NULL;
> >> +         for (i = 0; i < vec_num; i++)
> >> +           {
> >> +             tree final_mask = NULL_TREE;
> >> +             tree final_len = NULL_TREE;
> >> +             tree bias = NULL_TREE;
> >> +             if (!costing_p)
> >> +               {
> >> +                 if (loop_masks)
> >> +                   final_mask
> >> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> +                                           vec_num * ncopies, vectype,
> >> +                                           vec_num * j + i);
> >> +                 if (vec_mask)
> >> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> +                                                  final_mask, vec_mask, gsi);
> >> +
> >> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> +                                                  gsi, stmt_info, bump);
> >> +               }
> >>
> >> // This part is directly copied from the original, the original gets updated by
> >> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
> >> this before, do you prefer me to factor this part out?
> >>
> >> +             if (gs_info.ifn != IFN_LAST)
> >> +               {
> >> ...
> >> +               }
> >> +             else
> >> +               {
> >> +                 /* Emulated gather-scatter.  */
> >> ...
> >>
> >> // This part is just moved from the original.
> >>
> >> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> +             /* DATA_REF is null if we've already built the statement.  */
> >> +             if (data_ref)
> >> +               {
> >> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> +               }
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +
> >> +             /* Store vector loads in the corresponding SLP_NODE.  */
> >> +             if (slp)
> >> +               slp_node->push_vec_def (new_stmt);
> >> +
> >> +         if (!slp && !costing_p)
> >> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> +       }
> >> +
> >> +      if (!slp && !costing_p)
> >> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>
> >> // This part is some subsequent handlings, it's duplicated from the original
> >> but removing some more useless code.  I guess this part is not worth
> >> being factored out?
> >>
> >> +      if (costing_p)
> >> +       {
> >> +         if (dump_enabled_p ())
> >> +           dump_printf_loc (MSG_NOTE, vect_location,
> >> +                            "vect_model_load_cost: inside_cost = %u, "
> >> +                            "prologue_cost = %u .\n",
> >> +                            inside_cost, prologue_cost);
> >> +       }
> >> +      return true;
> >> +    }
> >>
> >> // Duplicating the dumping, I guess it's unnecessary to be factored out.
> >>
> >> oh, I just noticed that this should be shortened to
> >> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
> >> adopted for VMAT_LOAD_STORE_LANES dumping.
> >
> > Just to mention, the original motivational idea was that even though we
> > duplicate some
> > code we make it overall more readable and thus maintainable.  In the end we
> > might have vectorizable_load () for analysis but have not only
> > load_vec_info_type but one for each VMAT_* which means multiple separate
> > vect_transform_load () functions.  Currently vectorizable_load is structured
> > very inconsistently, having the transforms all hang off a single
> > switch (vmat-kind) {} would be an improvement IMHO.
>
> Thanks for the comments!  With these two patches, the final loop nest now
> only handles VMAT_CONTIGUOUS, VMAT_CONTIGUOUS_REVERSE and VMAT_CONTIGUOUS_PERMUTE.
> IMHO their handlings are highly bundled; re-structuring them could introduce
> more duplicated code and the risk of incomplete bug fixes that Richard pointed
> out.  But if I read the above comments right, our final goal seems to be to
> separate all of them?  I wonder if you both prefer to separate them further?

I'd leave those together, they share too much code.

> >
> > But sure some of our internal APIs are verbose and maybe badly factored,
> > any improvement there is welcome.  Inventing new random APIs just to
> > save a few lines of code without actually making the code more readable
> > is IMHO bad.
> >
> > But, if we can for example enhance prepare_vec_mask to handle both loop
> > and conditional mask and handle querying the mask that would be fine
> > (of course you need to check all uses to see if that makes sense).
>
> OK, will keep in mind, also add the example to my TODO list. :)
>
> BR,
> Kewen
  
Kewen.Lin Aug. 15, 2023, 12:13 p.m. UTC | #11
on 2023/8/15 20:07, Richard Biener wrote:
> On Tue, Aug 15, 2023 at 1:47 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> on 2023/8/15 15:53, Richard Biener wrote:
>>> On Tue, Aug 15, 2023 at 4:44 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>>>
>>>> on 2023/8/14 22:16, Richard Sandiford wrote:
>>>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>>>>> Hi Richard,
>>>>>>
>>>>>> on 2023/8/14 20:20, Richard Sandiford wrote:
>>>>>>> Thanks for the clean-ups.  But...
>>>>>>>
>>>>>>> "Kewen.Lin" <linkw@linux.ibm.com> writes:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> Following Richi's suggestion [1], this patch is to move the
>>>>>>>> handlings on VMAT_GATHER_SCATTER in the final loop nest
>>>>>>>> of function vectorizable_load to its own loop.  Basically
>>>>>>>> it duplicates the final loop nest, clean up some useless
>>>>>>>> set up code for the case of VMAT_GATHER_SCATTER, remove some
>>>>>>>> unreachable code.  Also remove the corresponding handlings
>>>>>>>> in the final loop nest.
>>>>>>>>
>>>>>>>> Bootstrapped and regtested on x86_64-redhat-linux,
>>>>>>>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>>>>>>>>
>>>>>>>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>>>>>>>
>>>>>>>> Is it ok for trunk?
>>>>>>>>
>>>>>>>> BR,
>>>>>>>> Kewen
>>>>>>>> -----
>>>>>>>>
>>>>>>>> gcc/ChangeLog:
>>>>>>>>
>>>>>>>>    * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>>>>>>>    VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>>>>>>>>    and update the final nest accordingly.
>>>>>>>> ---
>>>>>>>>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>>>>>>>>  1 file changed, 219 insertions(+), 142 deletions(-)
>>>>>>>
>>>>>>> ...that seems like quite a lot of +s.  Is there nothing we can do to
>>>>>>> avoid the cut-&-paste?
>>>>>>
>>>>>> Thanks for the comments!  I'm not sure if I get your question; if we
>>>>>> want to move out the handlings of VMAT_GATHER_SCATTER, the new +s seem
>>>>>> inevitable?  Your concern is mainly about git blame history?
>>>>>
>>>>> No, it was more that 219-142=77, so it seems like a lot of lines
>>>>> are being duplicated rather than simply being moved.  (Unlike for
>>>>> VMAT_LOAD_STORE_LANES, which was even a slight LOC saving, and so
>>>>> was a clear improvement.)
>>>>>
>>>>> So I was just wondering if there was any obvious factoring-out that
>>>>> could be done to reduce the duplication.
>>>>
>>>> ah, thanks for the clarification!
>>>>
>>>> I think the main duplication is at the loop body beginning and end,
>>>> let's take a look at them in detail:
>>>>
>>>> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>>>> +    {
>>>> +      gcc_assert (alignment_support_scheme == dr_aligned
>>>> +                 || alignment_support_scheme == dr_unaligned_supported);
>>>> +      gcc_assert (!grouped_load && !slp_perm);
>>>> +
>>>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>>>>
>>>> // These above are newly added.
>>>>
>>>> +      for (j = 0; j < ncopies; j++)
>>>> +       {
>>>> +         /* 1. Create the vector or array pointer update chain.  */
>>>> +         if (j == 0 && !costing_p)
>>>> +           {
>>>> +             if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>>>> +               vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
>>>> +                                            slp_node, &gs_info, &dataref_ptr,
>>>> +                                            &vec_offsets);
>>>> +             else
>>>> +               dataref_ptr
>>>> +                 = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>>>> +                                             at_loop, offset, &dummy, gsi,
>>>> +                                             &ptr_incr, false, bump);
>>>> +           }
>>>> +         else if (!costing_p)
>>>> +           {
>>>> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>>>> +             if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>>>> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>>>> +                                              gsi, stmt_info, bump);
>>>> +           }
>>>>
>>>> // These are for dataref_ptr; in the final loop nest we deal with more cases
>>>> on simd_lane_access_p and diff_first_stmt_info, but don't handle
>>>> STMT_VINFO_GATHER_SCATTER_P any more, so very few (one case) can be shared between them;
>>>> IMHO factoring it out seems like overkill.
>>>>
>>>> +
>>>> +         if (mask && !costing_p)
>>>> +           vec_mask = vec_masks[j];
>>>>
>>>> // It's merged out from j == 0 and j != 0
>>>>
>>>> +
>>>> +         gimple *new_stmt = NULL;
>>>> +         for (i = 0; i < vec_num; i++)
>>>> +           {
>>>> +             tree final_mask = NULL_TREE;
>>>> +             tree final_len = NULL_TREE;
>>>> +             tree bias = NULL_TREE;
>>>> +             if (!costing_p)
>>>> +               {
>>>> +                 if (loop_masks)
>>>> +                   final_mask
>>>> +                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>>>> +                                           vec_num * ncopies, vectype,
>>>> +                                           vec_num * j + i);
>>>> +                 if (vec_mask)
>>>> +                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>>>> +                                                  final_mask, vec_mask, gsi);
>>>> +
>>>> +                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>>>> +                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>>>> +                                                  gsi, stmt_info, bump);
>>>> +               }
>>>>
>>>> // This part is directly copied from the original, the original gets updated by
>>>> removing && !STMT_VINFO_GATHER_SCATTER_P.  Due to its size, I didn't consider
>>>> this before, do you prefer me to factor this part out?
>>>>
>>>> +             if (gs_info.ifn != IFN_LAST)
>>>> +               {
>>>> ...
>>>> +               }
>>>> +             else
>>>> +               {
>>>> +                 /* Emulated gather-scatter.  */
>>>> ...
>>>>
>>>> // This part is just moved from the original.
>>>>
>>>> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
>>>> +             /* DATA_REF is null if we've already built the statement.  */
>>>> +             if (data_ref)
>>>> +               {
>>>> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>>>> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
>>>> +               }
>>>> +             new_temp = make_ssa_name (vec_dest, new_stmt);
>>>> +             gimple_set_lhs (new_stmt, new_temp);
>>>> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>>>> +
>>>> +             /* Store vector loads in the corresponding SLP_NODE.  */
>>>> +             if (slp)
>>>> +               slp_node->push_vec_def (new_stmt);
>>>> +
>>>> +         if (!slp && !costing_p)
>>>> +           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>>> +       }
>>>> +
>>>> +      if (!slp && !costing_p)
>>>> +       *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>>>
>>>> // This part is some subsequent handlings, it's duplicated from the original
>>>> but removing some more useless code.  I guess this part is not worth
>>>> being factored out?
>>>>
>>>> +      if (costing_p)
>>>> +       {
>>>> +         if (dump_enabled_p ())
>>>> +           dump_printf_loc (MSG_NOTE, vect_location,
>>>> +                            "vect_model_load_cost: inside_cost = %u, "
>>>> +                            "prologue_cost = %u .\n",
>>>> +                            inside_cost, prologue_cost);
>>>> +       }
>>>> +      return true;
>>>> +    }
>>>>
>>>> // Duplicating the dumping, I guess it's unnecessary to be factored out.
>>>>
>>>> oh, I just noticed that this should be shortened to
>>>> "if (costing_p && dump_enabled_p ())" instead, just the same as what's
>>>> adopted for VMAT_LOAD_STORE_LANES dumping.
>>>
>>> Just to mention, the original motivational idea was that even though we
>>> duplicate some
>>> code we make it overall more readable and thus maintainable.  In the end we
>>> might have vectorizable_load () for analysis but have not only
>>> load_vec_info_type but one for each VMAT_* which means multiple separate
>>> vect_transform_load () functions.  Currently vectorizable_load is structured
>>> very inconsistently, having the transforms all hang off a single
>>> switch (vmat-kind) {} would be an improvement IMHO.
>>
>> Thanks for the comments!  With these two patches, the final loop nest now
>> only handles VMAT_CONTIGUOUS, VMAT_CONTIGUOUS_REVERSE and VMAT_CONTIGUOUS_PERMUTE.
>> IMHO their handlings are highly bundled; re-structuring them could introduce
>> more duplicated code and the risk of incomplete bug fixes that Richard pointed
>> out.  But if I read the above comments right, our final goal seems to be to
>> separate all of them?  I wonder if you both prefer to separate them further?
> 
> I'd leave those together, they share too much code.

Got it, thanks for clarifying, it matches my previous thought. :)

BR,
Kewen
  
Kewen.Lin Aug. 16, 2023, 2:38 a.m. UTC | #12
on 2023/8/15 17:13, Richard Sandiford wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>>> OK, fair enough.  So the idea is: see where we end up and then try to
>>> improve/factor the APIs in a less peephole way?
>>
>> Yeah, I think that's the only good way forward.
> 
> OK, no objection from me.  Sorry for holding the patch up.

This hasn't been approved yet (although the patch on VMAT_LOAD_STORE_LANES
was), so it wasn't held up; thanks for sharing your thoughts and drawing
attention to it. :)

From the discussions, it seems this looks good to both of you.  But I could
be wrong, so may I ask if it's ok for trunk?

BR,
Kewen
  
Richard Biener Aug. 17, 2023, 7:47 a.m. UTC | #13
On Wed, Aug 16, 2023 at 4:38 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> on 2023/8/15 17:13, Richard Sandiford wrote:
> > Richard Biener <richard.guenther@gmail.com> writes:
> >>> OK, fair enough.  So the idea is: see where we end up and then try to
> >>> improve/factor the APIs in a less peephole way?
> >>
> >> Yeah, I think that's the only good way forward.
> >
> > OK, no objection from me.  Sorry for holding the patch up.
>
> This hasn't been approved yet (although the patch on VMAT_LOAD_STORE_LANES
> was), so it wasn't held up; thanks for sharing your thoughts and drawing
> attention to it. :)
>
> From the discussions, it seems this looks good to both of you.  But I could
> be wrong, so may I ask if it's ok for trunk?

OK.

Richard.

> BR,
> Kewen
  

Patch

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c361e16cb7b..5e514eca19b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10455,6 +10455,218 @@  vectorizable_load (vec_info *vinfo,
       return true;
     }

+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      gcc_assert (alignment_support_scheme == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported);
+      gcc_assert (!grouped_load && !slp_perm);
+
+      unsigned int inside_cost = 0, prologue_cost = 0;
+      for (j = 0; j < ncopies; j++)
+	{
+	  /* 1. Create the vector or array pointer update chain.  */
+	  if (j == 0 && !costing_p)
+	    {
+	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+					     slp_node, &gs_info, &dataref_ptr,
+					     &vec_offsets);
+	      else
+		dataref_ptr
+		  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+					      at_loop, offset, &dummy, gsi,
+					      &ptr_incr, false, bump);
+	    }
+	  else if (!costing_p)
+	    {
+	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+	      if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+					       gsi, stmt_info, bump);
+	    }
+
+	  if (mask && !costing_p)
+	    vec_mask = vec_masks[j];
+
+	  gimple *new_stmt = NULL;
+	  for (i = 0; i < vec_num; i++)
+	    {
+	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
+	      if (!costing_p)
+		{
+		  if (loop_masks)
+		    final_mask
+		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					    vec_num * ncopies, vectype,
+					    vec_num * j + i);
+		  if (vec_mask)
+		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+						   final_mask, vec_mask, gsi);
+
+		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+						   gsi, stmt_info, bump);
+		}
+
+	      /* 2. Create the vector-load in the loop.  */
+	      unsigned HOST_WIDE_INT align;
+	      if (gs_info.ifn != IFN_LAST)
+		{
+		  if (costing_p)
+		    {
+		      unsigned int cnunits = vect_nunits_for_cost (vectype);
+		      inside_cost
+			= record_stmt_cost (cost_vec, cnunits, scalar_load,
+					    stmt_info, 0, vect_body);
+		      continue;
+		    }
+		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		    vec_offset = vec_offsets[vec_num * j + i];
+		  tree zero = build_zero_cst (vectype);
+		  tree scale = size_int (gs_info.scale);
+
+		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		    {
+		      if (loop_lens)
+			final_len
+			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       vec_num * ncopies, vectype,
+					       vec_num * j + i, 1);
+		      else
+			final_len
+			  = build_int_cst (sizetype,
+					   TYPE_VECTOR_SUBPARTS (vectype));
+		      signed char biasval
+			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		      bias = build_int_cst (intQI_type_node, biasval);
+		      if (!final_mask)
+			{
+			  mask_vectype = truth_type_for (vectype);
+			  final_mask = build_minus_one_cst (mask_vectype);
+			}
+		    }
+
+		  gcall *call;
+		  if (final_len && final_mask)
+		    call
+		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+						    dataref_ptr, vec_offset,
+						    scale, zero, final_mask,
+						    final_len, bias);
+		  else if (final_mask)
+		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
+						       dataref_ptr, vec_offset,
+						       scale, zero, final_mask);
+		  else
+		    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
+						       dataref_ptr, vec_offset,
+						       scale, zero);
+		  gimple_call_set_nothrow (call, true);
+		  new_stmt = call;
+		  data_ref = NULL_TREE;
+		}
+	      else
+		{
+		  /* Emulated gather-scatter.  */
+		  gcc_assert (!final_mask);
+		  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+		  if (costing_p)
+		    {
+		      /* For emulated gathers N offset vector element
+			 offset add is consumed by the load).  */
+		      inside_cost = record_stmt_cost (cost_vec, const_nunits,
+						      vec_to_scalar, stmt_info,
+						      0, vect_body);
+		      /* N scalar loads plus gathering them into a
+			 vector.  */
+		      inside_cost
+			= record_stmt_cost (cost_vec, const_nunits, scalar_load,
+					    stmt_info, 0, vect_body);
+		      inside_cost
+			= record_stmt_cost (cost_vec, 1, vec_construct,
+					    stmt_info, 0, vect_body);
+		      continue;
+		    }
+		  unsigned HOST_WIDE_INT const_offset_nunits
+		    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+			.to_constant ();
+		  vec<constructor_elt, va_gc> *ctor_elts;
+		  vec_alloc (ctor_elts, const_nunits);
+		  gimple_seq stmts = NULL;
+		  /* We support offset vectors with more elements
+		     than the data vector for now.  */
+		  unsigned HOST_WIDE_INT factor
+		    = const_offset_nunits / const_nunits;
+		  vec_offset = vec_offsets[j / factor];
+		  unsigned elt_offset = (j % factor) * const_nunits;
+		  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+		  tree scale = size_int (gs_info.scale);
+		  align = get_object_alignment (DR_REF (first_dr_info->dr));
+		  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
+		  for (unsigned k = 0; k < const_nunits; ++k)
+		    {
+		      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+					      bitsize_int (k + elt_offset));
+		      tree idx
+			= gimple_build (&stmts, BIT_FIELD_REF, idx_type,
+					vec_offset, TYPE_SIZE (idx_type), boff);
+		      idx = gimple_convert (&stmts, sizetype, idx);
+		      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
+					  scale);
+		      tree ptr = gimple_build (&stmts, PLUS_EXPR,
+					       TREE_TYPE (dataref_ptr),
+					       dataref_ptr, idx);
+		      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+		      tree elt = make_ssa_name (TREE_TYPE (vectype));
+		      tree ref = build2 (MEM_REF, ltype, ptr,
+					 build_int_cst (ref_type, 0));
+		      new_stmt = gimple_build_assign (elt, ref);
+		      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
+		      gimple_seq_add_stmt (&stmts, new_stmt);
+		      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+		    }
+		  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+		  new_stmt = gimple_build_assign (
+		    NULL_TREE, build_constructor (vectype, ctor_elts));
+		  data_ref = NULL_TREE;
+		}
+
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      /* DATA_REF is null if we've already built the statement.  */
+	      if (data_ref)
+		{
+		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		  new_stmt = gimple_build_assign (vec_dest, data_ref);
+		}
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	      /* Store vector loads in the corresponding SLP_NODE.  */
+	      if (slp)
+		slp_node->push_vec_def (new_stmt);
+	    }
+
+	  if (!slp && !costing_p)
+	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	}
+
+      if (!slp && !costing_p)
+	*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+
+      if (costing_p)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "vect_model_load_cost: inside_cost = %u, "
+			     "prologue_cost = %u .\n",
+			     inside_cost, prologue_cost);
+	}
+      return true;
+    }
+
   poly_uint64 group_elt = 0;
   unsigned int inside_cost = 0, prologue_cost = 0;
   for (j = 0; j < ncopies; j++)
@@ -10504,12 +10716,6 @@  vectorizable_load (vec_info *vinfo,
 		  gcc_assert (!compute_in_loop);
 		}
 	    }
-	  else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-	    {
-	      vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
-					   slp_node, &gs_info, &dataref_ptr,
-					   &vec_offsets);
-	    }
 	  else
 	    dataref_ptr
 	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -10525,7 +10731,7 @@  vectorizable_load (vec_info *vinfo,
 	  if (dataref_offset)
 	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
 					      bump);
-	  else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	  else
 	    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
 					   stmt_info, bump);
 	  if (mask)
@@ -10551,7 +10757,7 @@  vectorizable_load (vec_info *vinfo,
 		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
 					       final_mask, vec_mask, gsi);

-	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
 					       gsi, stmt_info, bump);
 	    }
@@ -10562,139 +10768,11 @@  vectorizable_load (vec_info *vinfo,
 	    case dr_aligned:
 	    case dr_unaligned_supported:
 	      {
-		unsigned int misalign;
-		unsigned HOST_WIDE_INT align;
-
-		if (memory_access_type == VMAT_GATHER_SCATTER
-		    && gs_info.ifn != IFN_LAST)
-		  {
-		    if (costing_p)
-		      {
-			unsigned int cnunits = vect_nunits_for_cost (vectype);
-			inside_cost
-			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
-					      stmt_info, 0, vect_body);
-			break;
-		      }
-		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-		      vec_offset = vec_offsets[vec_num * j + i];
-		    tree zero = build_zero_cst (vectype);
-		    tree scale = size_int (gs_info.scale);
-
-		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
-		      {
-			if (loop_lens)
-			  final_len
-			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-						 vec_num * ncopies, vectype,
-						 vec_num * j + i, 1);
-			else
-			  final_len
-			    = build_int_cst (sizetype,
-					     TYPE_VECTOR_SUBPARTS (vectype));
-			signed char biasval
-			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-			bias = build_int_cst (intQI_type_node, biasval);
-			if (!final_mask)
-			  {
-			    mask_vectype = truth_type_for (vectype);
-			    final_mask = build_minus_one_cst (mask_vectype);
-			  }
-		      }
-
-		    gcall *call;
-		    if (final_len && final_mask)
-		      call = gimple_build_call_internal (
-			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
-			scale, zero, final_mask, final_len, bias);
-		    else if (final_mask)
-		      call
-			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
-						      dataref_ptr, vec_offset,
-						      scale, zero, final_mask);
-		    else
-		      call
-			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
-						      dataref_ptr, vec_offset,
-						      scale, zero);
-		    gimple_call_set_nothrow (call, true);
-		    new_stmt = call;
-		    data_ref = NULL_TREE;
-		    break;
-		  }
-		else if (memory_access_type == VMAT_GATHER_SCATTER)
-		  {
-		    /* Emulated gather-scatter.  */
-		    gcc_assert (!final_mask);
-		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
-		    if (costing_p)
-		      {
-			/* For emulated gathers N offset vector element
-			   offset add is consumed by the load).  */
-			inside_cost
-			  = record_stmt_cost (cost_vec, const_nunits,
-					      vec_to_scalar, stmt_info, 0,
-					      vect_body);
-			/* N scalar loads plus gathering them into a
-			   vector.  */
-			inside_cost = record_stmt_cost (cost_vec, const_nunits,
-							scalar_load, stmt_info,
-							0, vect_body);
-			inside_cost
-			  = record_stmt_cost (cost_vec, 1, vec_construct,
-					      stmt_info, 0, vect_body);
-			break;
-		      }
-		    unsigned HOST_WIDE_INT const_offset_nunits
-		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
-			  .to_constant ();
-		    vec<constructor_elt, va_gc> *ctor_elts;
-		    vec_alloc (ctor_elts, const_nunits);
-		    gimple_seq stmts = NULL;
-		    /* We support offset vectors with more elements
-		       than the data vector for now.  */
-		    unsigned HOST_WIDE_INT factor
-		      = const_offset_nunits / const_nunits;
-		    vec_offset = vec_offsets[j / factor];
-		    unsigned elt_offset = (j % factor) * const_nunits;
-		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
-		    tree scale = size_int (gs_info.scale);
-		    align = get_object_alignment (DR_REF (first_dr_info->dr));
-		    tree ltype
-		      = build_aligned_type (TREE_TYPE (vectype), align);
-		    for (unsigned k = 0; k < const_nunits; ++k)
-		      {
-			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
-						bitsize_int (k + elt_offset));
-			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
-						 idx_type, vec_offset,
-						 TYPE_SIZE (idx_type), boff);
-			idx = gimple_convert (&stmts, sizetype, idx);
-			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
-					    scale);
-			tree ptr = gimple_build (&stmts, PLUS_EXPR,
-						 TREE_TYPE (dataref_ptr),
-						 dataref_ptr, idx);
-			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
-			tree elt = make_ssa_name (TREE_TYPE (vectype));
-			tree ref = build2 (MEM_REF, ltype, ptr,
-					   build_int_cst (ref_type, 0));
-			new_stmt = gimple_build_assign (elt, ref);
-			gimple_set_vuse (new_stmt,
-					 gimple_vuse (gsi_stmt (*gsi)));
-			gimple_seq_add_stmt (&stmts, new_stmt);
-			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
-		      }
-		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
-		    new_stmt = gimple_build_assign (
-		      NULL_TREE, build_constructor (vectype, ctor_elts));
-		    data_ref = NULL_TREE;
-		    break;
-		  }
-
 		if (costing_p)
 		  break;

+		unsigned int misalign;
+		unsigned HOST_WIDE_INT align;
 		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
 		if (alignment_support_scheme == dr_aligned)
 		  misalign = 0;
@@ -11156,10 +11234,9 @@  vectorizable_load (vec_info *vinfo,

   if (costing_p)
     {
-      gcc_assert (memory_access_type != VMAT_INVARIANT
-		  && memory_access_type != VMAT_ELEMENTWISE
-		  && memory_access_type != VMAT_STRIDED_SLP
-		  && memory_access_type != VMAT_LOAD_STORE_LANES);
+      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
+		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
+		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "vect_model_load_cost: inside_cost = %u, "