vect: Move VMAT_LOAD_STORE_LANES handling from final loop nest

Message ID b39e934e-869e-840d-eb7a-5b2de24146a8@linux.ibm.com
State Unresolved
Series vect: Move VMAT_LOAD_STORE_LANES handling from final loop nest

Checks

Context                 Check    Description
snail/gcc-patch-check   warning  Git am fail log

Commit Message

Kewen.Lin Aug. 14, 2023, 8:54 a.m. UTC
  Hi,

Following Richi's suggestion [1], this patch moves the handling
of VMAT_LOAD_STORE_LANES out of the final loop nest of function
vectorizable_load and into its own loop.  Basically it duplicates
the final loop nest, cleans up some useless setup code for the
VMAT_LOAD_STORE_LANES case, removes some unreachable code, and
also removes the corresponding handling from the final loop nest.

Bootstrapped and regtested on x86_64-redhat-linux,
aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
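
For illustration only (not part of the patch): below is a minimal
standalone sketch of the restructuring, using a hypothetical
load_before / load_after pair and a trimmed-down enum instead of the
real vectorizable_load interfaces.  The point is just to hoist the
VMAT_LOAD_STORE_LANES case into its own early loop that returns, so
the shared final loop nest no longer needs the branch or the setup
code that case never used.

  /* Hypothetical, simplified model; kind, ncopies and vec_num stand
     in for the real vectorizer state.  */
  #include <cstdio>

  enum memory_access_kind
  {
    VMAT_CONTIGUOUS,
    VMAT_LOAD_STORE_LANES,
    VMAT_GATHER_SCATTER
  };

  /* Before: the lanes case is a branch inside the shared loop nest,
     so every iteration also runs setup it does not need.  */
  static void
  load_before (memory_access_kind kind, int ncopies, int vec_num)
  {
    for (int j = 0; j < ncopies; ++j)
      {
        /* ... shared per-copy setup, useless for LOAD_STORE_LANES ... */
        if (kind == VMAT_LOAD_STORE_LANES)
          {
            std::printf ("copy %d: LOAD_LANES of %d vectors\n", j, vec_num);
            continue;
          }
        for (int i = 0; i < vec_num; ++i)
          std::printf ("copy %d, vector %d: regular load\n", j, i);
      }
  }

  /* After: the lanes case gets its own early loop and returns; the
     final loop nest keeps only the code the remaining cases reach.  */
  static void
  load_after (memory_access_kind kind, int ncopies, int vec_num)
  {
    if (kind == VMAT_LOAD_STORE_LANES)
      {
        for (int j = 0; j < ncopies; ++j)
          std::printf ("copy %d: LOAD_LANES of %d vectors\n", j, vec_num);
        return;
      }
    for (int j = 0; j < ncopies; ++j)
      for (int i = 0; i < vec_num; ++i)
        std::printf ("copy %d, vector %d: regular load\n", j, i);
  }

  int
  main ()
  {
    load_before (VMAT_LOAD_STORE_LANES, 2, 4);
    load_after (VMAT_LOAD_STORE_LANES, 2, 4);
    return 0;
  }

With the early dedicated loop in place, the large diff below comes
mostly from re-indenting the cases that remain in the shared nest.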

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Move the handling of
	VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
	and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
 1 file changed, 634 insertions(+), 641 deletions(-)

--
2.31.1
  

Comments

Richard Biener Aug. 14, 2023, 12:04 p.m. UTC | #1
On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi,
>
> Following Richi's suggestion [1], this patch moves the handling
> of VMAT_LOAD_STORE_LANES out of the final loop nest of function
> vectorizable_load and into its own loop.  Basically it duplicates
> the final loop nest, cleans up some useless setup code for the
> VMAT_LOAD_STORE_LANES case, removes some unreachable code, and
> also removes the corresponding handling from the final loop nest.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.

OK (I guess the big diff is mostly because of re-indenting).

Thanks,
Richard.

> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vectorizable_load): Move the handling of
>         VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
>         and update the final nest accordingly.
> ---
>  gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
>  1 file changed, 634 insertions(+), 641 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 4f2d088484c..c361e16cb7b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
>         vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
>                                        &vec_masks, mask_vectype);
>      }
> +
>    tree vec_mask = NULL_TREE;
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
> +    {
> +      gcc_assert (alignment_support_scheme == dr_aligned
> +                 || alignment_support_scheme == dr_unaligned_supported);
> +      gcc_assert (grouped_load && !slp);
> +
> +      unsigned int inside_cost = 0, prologue_cost = 0;
> +      for (j = 0; j < ncopies; j++)
> +       {
> +         if (costing_p)
> +           {
> +             /* An IFN_LOAD_LANES will load all its vector results,
> +                regardless of which ones we actually need.  Account
> +                for the cost of unused results.  */
> +             if (first_stmt_info == stmt_info)
> +               {
> +                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> +                 stmt_vec_info next_stmt_info = first_stmt_info;
> +                 do
> +                   {
> +                     gaps -= 1;
> +                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> +                   }
> +                 while (next_stmt_info);
> +                 if (gaps)
> +                   {
> +                     if (dump_enabled_p ())
> +                       dump_printf_loc (MSG_NOTE, vect_location,
> +                                        "vect_model_load_cost: %d "
> +                                        "unused vectors.\n",
> +                                        gaps);
> +                     vect_get_load_cost (vinfo, stmt_info, gaps,
> +                                         alignment_support_scheme,
> +                                         misalignment, false, &inside_cost,
> +                                         &prologue_cost, cost_vec, cost_vec,
> +                                         true);
> +                   }
> +               }
> +             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
> +                                 misalignment, false, &inside_cost,
> +                                 &prologue_cost, cost_vec, cost_vec, true);
> +             continue;
> +           }
> +
> +         /* 1. Create the vector or array pointer update chain.  */
> +         if (j == 0)
> +           dataref_ptr
> +             = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> +                                         at_loop, offset, &dummy, gsi,
> +                                         &ptr_incr, false, bump);
> +         else
> +           {
> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> +             dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> +                                            stmt_info, bump);
> +           }
> +         if (mask)
> +           vec_mask = vec_masks[j];
> +
> +         tree vec_array = create_vector_array (vectype, vec_num);
> +
> +         tree final_mask = NULL_TREE;
> +         if (loop_masks)
> +           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +                                            ncopies, vectype, j);
> +         if (vec_mask)
> +           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
> +                                          vec_mask, gsi);
> +
> +         gcall *call;
> +         if (final_mask)
> +           {
> +             /* Emit:
> +                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> +                                               VEC_MASK).  */
> +             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +             tree alias_ptr = build_int_cst (ref_type, align);
> +             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> +                                                dataref_ptr, alias_ptr,
> +                                                final_mask);
> +           }
> +         else
> +           {
> +             /* Emit:
> +                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
> +             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
> +             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
> +           }
> +         gimple_call_set_lhs (call, vec_array);
> +         gimple_call_set_nothrow (call, true);
> +         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> +
> +         dr_chain.create (vec_num);
> +         /* Extract each vector into an SSA_NAME.  */
> +         for (i = 0; i < vec_num; i++)
> +           {
> +             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
> +                                           vec_array, i);
> +             dr_chain.quick_push (new_temp);
> +           }
> +
> +         /* Record the mapping between SSA_NAMEs and statements.  */
> +         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> +
> +         /* Record that VEC_ARRAY is now dead.  */
> +         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> +
> +         dr_chain.release ();
> +
> +         *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +       }
> +
> +      if (costing_p && dump_enabled_p ())
> +       dump_printf_loc (MSG_NOTE, vect_location,
> +                        "vect_model_load_cost: inside_cost = %u, "
> +                        "prologue_cost = %u .\n",
> +                        inside_cost, prologue_cost);
> +
> +      return true;
> +    }
> +
>    poly_uint64 group_elt = 0;
>    unsigned int inside_cost = 0, prologue_cost = 0;
>    for (j = 0; j < ncopies; j++)
> @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
>         dr_chain.create (vec_num);
>
>        gimple *new_stmt = NULL;
> -      if (memory_access_type == VMAT_LOAD_STORE_LANES)
> +      for (i = 0; i < vec_num; i++)
>         {
> -         if (costing_p)
> -           {
> -             /* An IFN_LOAD_LANES will load all its vector results,
> -                regardless of which ones we actually need.  Account
> -                for the cost of unused results.  */
> -             if (grouped_load && first_stmt_info == stmt_info)
> -               {
> -                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> -                 stmt_vec_info next_stmt_info = first_stmt_info;
> -                 do
> -                   {
> -                     gaps -= 1;
> -                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> -                   }
> -                 while (next_stmt_info);
> -                 if (gaps)
> -                   {
> -                     if (dump_enabled_p ())
> -                       dump_printf_loc (MSG_NOTE, vect_location,
> -                                        "vect_model_load_cost: %d "
> -                                        "unused vectors.\n",
> -                                        gaps);
> -                     vect_get_load_cost (vinfo, stmt_info, gaps,
> -                                         alignment_support_scheme,
> -                                         misalignment, false, &inside_cost,
> -                                         &prologue_cost, cost_vec, cost_vec,
> -                                         true);
> -                   }
> -               }
> -             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
> -                                 misalignment, false, &inside_cost,
> -                                 &prologue_cost, cost_vec, cost_vec, true);
> -             continue;
> -           }
> -         tree vec_array;
> -
> -         vec_array = create_vector_array (vectype, vec_num);
> -
>           tree final_mask = NULL_TREE;
> -         if (loop_masks)
> -           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> -                                            ncopies, vectype, j);
> -         if (vec_mask)
> -           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> -                                          final_mask, vec_mask, gsi);
> -
> -         gcall *call;
> -         if (final_mask)
> -           {
> -             /* Emit:
> -                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> -                                               VEC_MASK).  */
> -             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> -             tree alias_ptr = build_int_cst (ref_type, align);
> -             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> -                                                dataref_ptr, alias_ptr,
> -                                                final_mask);
> -           }
> -         else
> +         tree final_len = NULL_TREE;
> +         tree bias = NULL_TREE;
> +         if (!costing_p)
>             {
> -             /* Emit:
> -                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
> -             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
> -             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
> -           }
> -         gimple_call_set_lhs (call, vec_array);
> -         gimple_call_set_nothrow (call, true);
> -         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> -         new_stmt = call;
> +             if (loop_masks)
> +               final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +                                                vec_num * ncopies, vectype,
> +                                                vec_num * j + i);
> +             if (vec_mask)
> +               final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +                                              final_mask, vec_mask, gsi);
>
> -         /* Extract each vector into an SSA_NAME.  */
> -         for (i = 0; i < vec_num; i++)
> -           {
> -             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
> -                                           vec_array, i);
> -             dr_chain.quick_push (new_temp);
> +             if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +                                              gsi, stmt_info, bump);
>             }
>
> -         /* Record the mapping between SSA_NAMEs and statements.  */
> -         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> -
> -         /* Record that VEC_ARRAY is now dead.  */
> -         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> -       }
> -      else
> -       {
> -         for (i = 0; i < vec_num; i++)
> +         /* 2. Create the vector-load in the loop.  */
> +         switch (alignment_support_scheme)
>             {
> -             tree final_mask = NULL_TREE;
> -             tree final_len = NULL_TREE;
> -             tree bias = NULL_TREE;
> -             if (!costing_p)
> -               {
> -                 if (loop_masks)
> -                   final_mask
> -                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> -                                           vec_num * ncopies, vectype,
> -                                           vec_num * j + i);
> -                 if (vec_mask)
> -                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> -                                                  final_mask, vec_mask, gsi);
> -
> -                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> -                                                  gsi, stmt_info, bump);
> -               }
> +           case dr_aligned:
> +           case dr_unaligned_supported:
> +             {
> +               unsigned int misalign;
> +               unsigned HOST_WIDE_INT align;
>
> -             /* 2. Create the vector-load in the loop.  */
> -             switch (alignment_support_scheme)
> -               {
> -               case dr_aligned:
> -               case dr_unaligned_supported:
> +               if (memory_access_type == VMAT_GATHER_SCATTER
> +                   && gs_info.ifn != IFN_LAST)
>                   {
> -                   unsigned int misalign;
> -                   unsigned HOST_WIDE_INT align;
> -
> -                   if (memory_access_type == VMAT_GATHER_SCATTER
> -                       && gs_info.ifn != IFN_LAST)
> +                   if (costing_p)
>                       {
> -                       if (costing_p)
> -                         {
> -                           unsigned int cnunits
> -                             = vect_nunits_for_cost (vectype);
> -                           inside_cost
> -                             = record_stmt_cost (cost_vec, cnunits,
> -                                                 scalar_load, stmt_info, 0,
> -                                                 vect_body);
> -                           break;
> -                         }
> -                       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -                         vec_offset = vec_offsets[vec_num * j + i];
> -                       tree zero = build_zero_cst (vectype);
> -                       tree scale = size_int (gs_info.scale);
> -
> -                       if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> -                         {
> -                           if (loop_lens)
> -                             final_len
> -                               = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> -                                                    vec_num * ncopies, vectype,
> -                                                    vec_num * j + i, 1);
> -                           else
> -                             final_len = build_int_cst (sizetype,
> -                                                        TYPE_VECTOR_SUBPARTS (
> -                                                          vectype));
> -                           signed char biasval
> -                             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -                           bias = build_int_cst (intQI_type_node, biasval);
> -                           if (!final_mask)
> -                             {
> -                               mask_vectype = truth_type_for (vectype);
> -                               final_mask = build_minus_one_cst (mask_vectype);
> -                             }
> -                         }
> -
> -                       gcall *call;
> -                       if (final_len && final_mask)
> -                         call = gimple_build_call_internal (
> -                           IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
> -                           vec_offset, scale, zero, final_mask, final_len,
> -                           bias);
> -                       else if (final_mask)
> -                         call = gimple_build_call_internal
> -                           (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> -                            vec_offset, scale, zero, final_mask);
> -                       else
> -                         call = gimple_build_call_internal
> -                           (IFN_GATHER_LOAD, 4, dataref_ptr,
> -                            vec_offset, scale, zero);
> -                       gimple_call_set_nothrow (call, true);
> -                       new_stmt = call;
> -                       data_ref = NULL_TREE;
> +                       unsigned int cnunits = vect_nunits_for_cost (vectype);
> +                       inside_cost
> +                         = record_stmt_cost (cost_vec, cnunits, scalar_load,
> +                                             stmt_info, 0, vect_body);
>                         break;
>                       }
> -                   else if (memory_access_type == VMAT_GATHER_SCATTER)
> +                   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                     vec_offset = vec_offsets[vec_num * j + i];
> +                   tree zero = build_zero_cst (vectype);
> +                   tree scale = size_int (gs_info.scale);
> +
> +                   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>                       {
> -                       /* Emulated gather-scatter.  */
> -                       gcc_assert (!final_mask);
> -                       unsigned HOST_WIDE_INT const_nunits
> -                         = nunits.to_constant ();
> -                       if (costing_p)
> -                         {
> -                           /* For emulated gathers N offset vector element
> -                              offset add is consumed by the load).  */
> -                           inside_cost
> -                             = record_stmt_cost (cost_vec, const_nunits,
> -                                                 vec_to_scalar, stmt_info, 0,
> -                                                 vect_body);
> -                           /* N scalar loads plus gathering them into a
> -                              vector.  */
> -                           inside_cost
> -                             = record_stmt_cost (cost_vec, const_nunits,
> -                                                 scalar_load, stmt_info, 0,
> -                                                 vect_body);
> -                           inside_cost
> -                             = record_stmt_cost (cost_vec, 1, vec_construct,
> -                                                 stmt_info, 0, vect_body);
> -                           break;
> -                         }
> -                       unsigned HOST_WIDE_INT const_offset_nunits
> -                         = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> -                             .to_constant ();
> -                       vec<constructor_elt, va_gc> *ctor_elts;
> -                       vec_alloc (ctor_elts, const_nunits);
> -                       gimple_seq stmts = NULL;
> -                       /* We support offset vectors with more elements
> -                          than the data vector for now.  */
> -                       unsigned HOST_WIDE_INT factor
> -                         = const_offset_nunits / const_nunits;
> -                       vec_offset = vec_offsets[j / factor];
> -                       unsigned elt_offset = (j % factor) * const_nunits;
> -                       tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> -                       tree scale = size_int (gs_info.scale);
> -                       align
> -                         = get_object_alignment (DR_REF (first_dr_info->dr));
> -                       tree ltype = build_aligned_type (TREE_TYPE (vectype),
> -                                                        align);
> -                       for (unsigned k = 0; k < const_nunits; ++k)
> +                       if (loop_lens)
> +                         final_len
> +                           = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                                vec_num * ncopies, vectype,
> +                                                vec_num * j + i, 1);
> +                       else
> +                         final_len
> +                           = build_int_cst (sizetype,
> +                                            TYPE_VECTOR_SUBPARTS (vectype));
> +                       signed char biasval
> +                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +                       bias = build_int_cst (intQI_type_node, biasval);
> +                       if (!final_mask)
>                           {
> -                           tree boff = size_binop (MULT_EXPR,
> -                                                   TYPE_SIZE (idx_type),
> -                                                   bitsize_int
> -                                                     (k + elt_offset));
> -                           tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> -                                                    idx_type, vec_offset,
> -                                                    TYPE_SIZE (idx_type),
> -                                                    boff);
> -                           idx = gimple_convert (&stmts, sizetype, idx);
> -                           idx = gimple_build (&stmts, MULT_EXPR,
> -                                               sizetype, idx, scale);
> -                           tree ptr = gimple_build (&stmts, PLUS_EXPR,
> -                                                    TREE_TYPE (dataref_ptr),
> -                                                    dataref_ptr, idx);
> -                           ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> -                           tree elt = make_ssa_name (TREE_TYPE (vectype));
> -                           tree ref = build2 (MEM_REF, ltype, ptr,
> -                                              build_int_cst (ref_type, 0));
> -                           new_stmt = gimple_build_assign (elt, ref);
> -                           gimple_set_vuse (new_stmt,
> -                                            gimple_vuse (gsi_stmt (*gsi)));
> -                           gimple_seq_add_stmt (&stmts, new_stmt);
> -                           CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> +                           mask_vectype = truth_type_for (vectype);
> +                           final_mask = build_minus_one_cst (mask_vectype);
>                           }
> -                       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> -                       new_stmt = gimple_build_assign (NULL_TREE,
> -                                                       build_constructor
> -                                                         (vectype, ctor_elts));
> -                       data_ref = NULL_TREE;
> -                       break;
>                       }
>
> -                   if (costing_p)
> -                     break;
> -
> -                   align =
> -                     known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
> -                   if (alignment_support_scheme == dr_aligned)
> -                     misalign = 0;
> -                   else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> -                     {
> -                       align = dr_alignment
> -                         (vect_dr_behavior (vinfo, first_dr_info));
> -                       misalign = 0;
> -                     }
> +                   gcall *call;
> +                   if (final_len && final_mask)
> +                     call = gimple_build_call_internal (
> +                       IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
> +                       scale, zero, final_mask, final_len, bias);
> +                   else if (final_mask)
> +                     call
> +                       = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> +                                                     dataref_ptr, vec_offset,
> +                                                     scale, zero, final_mask);
>                     else
> -                     misalign = misalignment;
> -                   if (dataref_offset == NULL_TREE
> -                       && TREE_CODE (dataref_ptr) == SSA_NAME)
> -                     set_ptr_info_alignment (get_ptr_info (dataref_ptr),
> -                                             align, misalign);
> -                   align = least_bit_hwi (misalign | align);
> -
> -                   /* Compute IFN when LOOP_LENS or final_mask valid.  */
> -                   machine_mode vmode = TYPE_MODE (vectype);
> -                   machine_mode new_vmode = vmode;
> -                   internal_fn partial_ifn = IFN_LAST;
> -                   if (loop_lens)
> +                     call
> +                       = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> +                                                     dataref_ptr, vec_offset,
> +                                                     scale, zero);
> +                   gimple_call_set_nothrow (call, true);
> +                   new_stmt = call;
> +                   data_ref = NULL_TREE;
> +                   break;
> +                 }
> +               else if (memory_access_type == VMAT_GATHER_SCATTER)
> +                 {
> +                   /* Emulated gather-scatter.  */
> +                   gcc_assert (!final_mask);
> +                   unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> +                   if (costing_p)
>                       {
> -                       opt_machine_mode new_ovmode
> -                         = get_len_load_store_mode (vmode, true,
> -                                                    &partial_ifn);
> -                       new_vmode = new_ovmode.require ();
> -                       unsigned factor = (new_ovmode == vmode)
> -                                           ? 1
> -                                           : GET_MODE_UNIT_SIZE (vmode);
> -                       final_len
> -                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> -                                              vec_num * ncopies, vectype,
> -                                              vec_num * j + i, factor);
> +                       /* For emulated gathers N offset vector element
> +                          offset add is consumed by the load).  */
> +                       inside_cost
> +                         = record_stmt_cost (cost_vec, const_nunits,
> +                                             vec_to_scalar, stmt_info, 0,
> +                                             vect_body);
> +                       /* N scalar loads plus gathering them into a
> +                          vector.  */
> +                       inside_cost = record_stmt_cost (cost_vec, const_nunits,
> +                                                       scalar_load, stmt_info,
> +                                                       0, vect_body);
> +                       inside_cost
> +                         = record_stmt_cost (cost_vec, 1, vec_construct,
> +                                             stmt_info, 0, vect_body);
> +                       break;
>                       }
> -                   else if (final_mask)
> +                   unsigned HOST_WIDE_INT const_offset_nunits
> +                     = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> +                         .to_constant ();
> +                   vec<constructor_elt, va_gc> *ctor_elts;
> +                   vec_alloc (ctor_elts, const_nunits);
> +                   gimple_seq stmts = NULL;
> +                   /* We support offset vectors with more elements
> +                      than the data vector for now.  */
> +                   unsigned HOST_WIDE_INT factor
> +                     = const_offset_nunits / const_nunits;
> +                   vec_offset = vec_offsets[j / factor];
> +                   unsigned elt_offset = (j % factor) * const_nunits;
> +                   tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +                   tree scale = size_int (gs_info.scale);
> +                   align = get_object_alignment (DR_REF (first_dr_info->dr));
> +                   tree ltype
> +                     = build_aligned_type (TREE_TYPE (vectype), align);
> +                   for (unsigned k = 0; k < const_nunits; ++k)
>                       {
> -                       if (!can_vec_mask_load_store_p (
> -                             vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
> -                             &partial_ifn))
> -                         gcc_unreachable ();
> +                       tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> +                                               bitsize_int (k + elt_offset));
> +                       tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> +                                                idx_type, vec_offset,
> +                                                TYPE_SIZE (idx_type), boff);
> +                       idx = gimple_convert (&stmts, sizetype, idx);
> +                       idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> +                                           scale);
> +                       tree ptr = gimple_build (&stmts, PLUS_EXPR,
> +                                                TREE_TYPE (dataref_ptr),
> +                                                dataref_ptr, idx);
> +                       ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +                       tree elt = make_ssa_name (TREE_TYPE (vectype));
> +                       tree ref = build2 (MEM_REF, ltype, ptr,
> +                                          build_int_cst (ref_type, 0));
> +                       new_stmt = gimple_build_assign (elt, ref);
> +                       gimple_set_vuse (new_stmt,
> +                                        gimple_vuse (gsi_stmt (*gsi)));
> +                       gimple_seq_add_stmt (&stmts, new_stmt);
> +                       CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>                       }
> +                   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +                   new_stmt = gimple_build_assign (
> +                     NULL_TREE, build_constructor (vectype, ctor_elts));
> +                   data_ref = NULL_TREE;
> +                   break;
> +                 }
>
> -                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> +               if (costing_p)
> +                 break;
> +
> +               align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
> +               if (alignment_support_scheme == dr_aligned)
> +                 misalign = 0;
> +               else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> +                 {
> +                   align
> +                     = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
> +                   misalign = 0;
> +                 }
> +               else
> +                 misalign = misalignment;
> +               if (dataref_offset == NULL_TREE
> +                   && TREE_CODE (dataref_ptr) == SSA_NAME)
> +                 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
> +                                         misalign);
> +               align = least_bit_hwi (misalign | align);
> +
> +               /* Compute IFN when LOOP_LENS or final_mask valid.  */
> +               machine_mode vmode = TYPE_MODE (vectype);
> +               machine_mode new_vmode = vmode;
> +               internal_fn partial_ifn = IFN_LAST;
> +               if (loop_lens)
> +                 {
> +                   opt_machine_mode new_ovmode
> +                     = get_len_load_store_mode (vmode, true, &partial_ifn);
> +                   new_vmode = new_ovmode.require ();
> +                   unsigned factor
> +                     = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
> +                   final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                                  vec_num * ncopies, vectype,
> +                                                  vec_num * j + i, factor);
> +                 }
> +               else if (final_mask)
> +                 {
> +                   if (!can_vec_mask_load_store_p (
> +                         vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
> +                         &partial_ifn))
> +                     gcc_unreachable ();
> +                 }
> +
> +               if (partial_ifn == IFN_MASK_LEN_LOAD)
> +                 {
> +                   if (!final_len)
>                       {
> -                       if (!final_len)
> -                         {
> -                           /* Pass VF value to 'len' argument of
> -                              MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> -                           final_len
> -                             = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> -                         }
> -                       if (!final_mask)
> -                         {
> -                           /* Pass all ones value to 'mask' argument of
> -                              MASK_LEN_LOAD if final_mask is invalid.  */
> -                           mask_vectype = truth_type_for (vectype);
> -                           final_mask = build_minus_one_cst (mask_vectype);
> -                         }
> +                       /* Pass VF value to 'len' argument of
> +                          MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> +                       final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>                       }
> -                   if (final_len)
> +                   if (!final_mask)
>                       {
> -                       signed char biasval
> -                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -
> -                       bias = build_int_cst (intQI_type_node, biasval);
> +                       /* Pass all ones value to 'mask' argument of
> +                          MASK_LEN_LOAD if final_mask is invalid.  */
> +                       mask_vectype = truth_type_for (vectype);
> +                       final_mask = build_minus_one_cst (mask_vectype);
>                       }
> +                 }
> +               if (final_len)
> +                 {
> +                   signed char biasval
> +                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>
> -                   if (final_len)
> +                   bias = build_int_cst (intQI_type_node, biasval);
> +                 }
> +
> +               if (final_len)
> +                 {
> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +                   gcall *call;
> +                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> +                     call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
> +                                                        dataref_ptr, ptr,
> +                                                        final_mask, final_len,
> +                                                        bias);
> +                   else
> +                     call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> +                                                        dataref_ptr, ptr,
> +                                                        final_len, bias);
> +                   gimple_call_set_nothrow (call, true);
> +                   new_stmt = call;
> +                   data_ref = NULL_TREE;
> +
> +                   /* Need conversion if it's wrapped with VnQI.  */
> +                   if (vmode != new_vmode)
>                       {
> -                       tree ptr
> -                         = build_int_cst (ref_type, align * BITS_PER_UNIT);
> -                       gcall *call;
> -                       if (partial_ifn == IFN_MASK_LEN_LOAD)
> -                         call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
> -                                                            5, dataref_ptr,
> -                                                            ptr, final_mask,
> -                                                            final_len, bias);
> -                       else
> -                         call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> -                                                            dataref_ptr, ptr,
> -                                                            final_len, bias);
> -                       gimple_call_set_nothrow (call, true);
> -                       new_stmt = call;
> -                       data_ref = NULL_TREE;
> -
> -                       /* Need conversion if it's wrapped with VnQI.  */
> -                       if (vmode != new_vmode)
> -                         {
> -                           tree new_vtype = build_vector_type_for_mode (
> -                             unsigned_intQI_type_node, new_vmode);
> -                           tree var = vect_get_new_ssa_name (new_vtype,
> -                                                             vect_simple_var);
> -                           gimple_set_lhs (call, var);
> -                           vect_finish_stmt_generation (vinfo, stmt_info, call,
> -                                                        gsi);
> -                           tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
> -                           new_stmt
> -                             = gimple_build_assign (vec_dest,
> -                                                    VIEW_CONVERT_EXPR, op);
> -                         }
> +                       tree new_vtype = build_vector_type_for_mode (
> +                         unsigned_intQI_type_node, new_vmode);
> +                       tree var
> +                         = vect_get_new_ssa_name (new_vtype, vect_simple_var);
> +                       gimple_set_lhs (call, var);
> +                       vect_finish_stmt_generation (vinfo, stmt_info, call,
> +                                                    gsi);
> +                       tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
> +                       new_stmt = gimple_build_assign (vec_dest,
> +                                                       VIEW_CONVERT_EXPR, op);
>                       }
> -                   else if (final_mask)
> +                 }
> +               else if (final_mask)
> +                 {
> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> +                   gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> +                                                             dataref_ptr, ptr,
> +                                                             final_mask);
> +                   gimple_call_set_nothrow (call, true);
> +                   new_stmt = call;
> +                   data_ref = NULL_TREE;
> +                 }
> +               else
> +                 {
> +                   tree ltype = vectype;
> +                   tree new_vtype = NULL_TREE;
> +                   unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
> +                   unsigned int vect_align
> +                     = vect_known_alignment_in_bytes (first_dr_info, vectype);
> +                   unsigned int scalar_dr_size
> +                     = vect_get_scalar_dr_size (first_dr_info);
> +                   /* If there's no peeling for gaps but we have a gap
> +                      with slp loads then load the lower half of the
> +                      vector only.  See get_group_load_store_type for
> +                      when we apply this optimization.  */
> +                   if (slp
> +                       && loop_vinfo
> +                       && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
> +                       && known_eq (nunits, (group_size - gap) * 2)
> +                       && known_eq (nunits, group_size)
> +                       && gap >= (vect_align / scalar_dr_size))
>                       {
> -                       tree ptr = build_int_cst (ref_type,
> -                                                 align * BITS_PER_UNIT);
> -                       gcall *call
> -                         = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> -                                                       dataref_ptr, ptr,
> -                                                       final_mask);
> -                       gimple_call_set_nothrow (call, true);
> -                       new_stmt = call;
> -                       data_ref = NULL_TREE;
> +                       tree half_vtype;
> +                       new_vtype
> +                         = vector_vector_composition_type (vectype, 2,
> +                                                           &half_vtype);
> +                       if (new_vtype != NULL_TREE)
> +                         ltype = half_vtype;
>                       }
> +                   tree offset
> +                     = (dataref_offset ? dataref_offset
> +                                       : build_int_cst (ref_type, 0));
> +                   if (ltype != vectype
> +                       && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> +                     {
> +                       unsigned HOST_WIDE_INT gap_offset
> +                         = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
> +                       tree gapcst = build_int_cst (ref_type, gap_offset);
> +                       offset = size_binop (PLUS_EXPR, offset, gapcst);
> +                     }
> +                   data_ref
> +                     = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
> +                   if (alignment_support_scheme == dr_aligned)
> +                     ;
>                     else
> +                     TREE_TYPE (data_ref)
> +                       = build_aligned_type (TREE_TYPE (data_ref),
> +                                             align * BITS_PER_UNIT);
> +                   if (ltype != vectype)
>                       {
> -                       tree ltype = vectype;
> -                       tree new_vtype = NULL_TREE;
> -                       unsigned HOST_WIDE_INT gap
> -                         = DR_GROUP_GAP (first_stmt_info);
> -                       unsigned int vect_align
> -                         = vect_known_alignment_in_bytes (first_dr_info,
> -                                                          vectype);
> -                       unsigned int scalar_dr_size
> -                         = vect_get_scalar_dr_size (first_dr_info);
> -                       /* If there's no peeling for gaps but we have a gap
> -                          with slp loads then load the lower half of the
> -                          vector only.  See get_group_load_store_type for
> -                          when we apply this optimization.  */
> -                       if (slp
> -                           && loop_vinfo
> -                           && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> -                           && gap != 0
> -                           && known_eq (nunits, (group_size - gap) * 2)
> -                           && known_eq (nunits, group_size)
> -                           && gap >= (vect_align / scalar_dr_size))
> +                       vect_copy_ref_info (data_ref,
> +                                           DR_REF (first_dr_info->dr));
> +                       tree tem = make_ssa_name (ltype);
> +                       new_stmt = gimple_build_assign (tem, data_ref);
> +                       vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
> +                                                    gsi);
> +                       data_ref = NULL;
> +                       vec<constructor_elt, va_gc> *v;
> +                       vec_alloc (v, 2);
> +                       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>                           {
> -                           tree half_vtype;
> -                           new_vtype
> -                             = vector_vector_composition_type (vectype, 2,
> -                                                               &half_vtype);
> -                           if (new_vtype != NULL_TREE)
> -                             ltype = half_vtype;
> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> +                                                   build_zero_cst (ltype));
> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
>                           }
> -                       tree offset
> -                         = (dataref_offset ? dataref_offset
> -                                           : build_int_cst (ref_type, 0));
> -                       if (ltype != vectype
> -                           && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> +                       else
>                           {
> -                           unsigned HOST_WIDE_INT gap_offset
> -                             = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
> -                           tree gapcst = build_int_cst (ref_type, gap_offset);
> -                           offset = size_binop (PLUS_EXPR, offset, gapcst);
> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> +                                                   build_zero_cst (ltype));
>                           }
> -                       data_ref
> -                         = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
> -                       if (alignment_support_scheme == dr_aligned)
> -                         ;
> +                       gcc_assert (new_vtype != NULL_TREE);
> +                       if (new_vtype == vectype)
> +                         new_stmt = gimple_build_assign (
> +                           vec_dest, build_constructor (vectype, v));
>                         else
> -                         TREE_TYPE (data_ref)
> -                           = build_aligned_type (TREE_TYPE (data_ref),
> -                                                 align * BITS_PER_UNIT);
> -                       if (ltype != vectype)
>                           {
> -                           vect_copy_ref_info (data_ref,
> -                                               DR_REF (first_dr_info->dr));
> -                           tree tem = make_ssa_name (ltype);
> -                           new_stmt = gimple_build_assign (tem, data_ref);
> +                           tree new_vname = make_ssa_name (new_vtype);
> +                           new_stmt = gimple_build_assign (
> +                             new_vname, build_constructor (new_vtype, v));
>                             vect_finish_stmt_generation (vinfo, stmt_info,
>                                                          new_stmt, gsi);
> -                           data_ref = NULL;
> -                           vec<constructor_elt, va_gc> *v;
> -                           vec_alloc (v, 2);
> -                           if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> -                             {
> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> -                                                       build_zero_cst (ltype));
> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> -                             }
> -                           else
> -                             {
> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> -                                                       build_zero_cst (ltype));
> -                             }
> -                           gcc_assert (new_vtype != NULL_TREE);
> -                           if (new_vtype == vectype)
> -                             new_stmt = gimple_build_assign (
> -                               vec_dest, build_constructor (vectype, v));
> -                           else
> -                             {
> -                               tree new_vname = make_ssa_name (new_vtype);
> -                               new_stmt = gimple_build_assign (
> -                                 new_vname, build_constructor (new_vtype, v));
> -                               vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                            new_stmt, gsi);
> -                               new_stmt = gimple_build_assign (
> -                                 vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
> -                                                   new_vname));
> -                             }
> +                           new_stmt = gimple_build_assign (
> +                             vec_dest,
> +                             build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
>                           }
>                       }
> -                   break;
>                   }
> -               case dr_explicit_realign:
> -                 {
> -                   if (costing_p)
> -                     break;
> -                   tree ptr, bump;
> -
> -                   tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +               break;
> +             }
> +           case dr_explicit_realign:
> +             {
> +               if (costing_p)
> +                 break;
> +               tree ptr, bump;
>
> -                   if (compute_in_loop)
> -                     msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
> -                                                   &realignment_token,
> -                                                   dr_explicit_realign,
> -                                                   dataref_ptr, NULL);
> +               tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>
> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> -                     ptr = copy_ssa_name (dataref_ptr);
> -                   else
> -                     ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> -                   // For explicit realign the target alignment should be
> -                   // known at compile time.
> -                   unsigned HOST_WIDE_INT align =
> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> -                   new_stmt = gimple_build_assign
> -                                (ptr, BIT_AND_EXPR, dataref_ptr,
> -                                 build_int_cst
> -                                 (TREE_TYPE (dataref_ptr),
> -                                  -(HOST_WIDE_INT) align));
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                   data_ref
> -                     = build2 (MEM_REF, vectype, ptr,
> -                               build_int_cst (ref_type, 0));
> -                   vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> -                   vec_dest = vect_create_destination_var (scalar_dest,
> -                                                           vectype);
> -                   new_stmt = gimple_build_assign (vec_dest, data_ref);
> -                   new_temp = make_ssa_name (vec_dest, new_stmt);
> -                   gimple_assign_set_lhs (new_stmt, new_temp);
> -                   gimple_move_vops (new_stmt, stmt_info->stmt);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                   msq = new_temp;
> -
> -                   bump = size_binop (MULT_EXPR, vs,
> -                                      TYPE_SIZE_UNIT (elem_type));
> -                   bump = size_binop (MINUS_EXPR, bump, size_one_node);
> -                   ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
> -                                          stmt_info, bump);
> -                   new_stmt = gimple_build_assign
> -                                (NULL_TREE, BIT_AND_EXPR, ptr,
> -                                 build_int_cst
> -                                 (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
> -                   if (TREE_CODE (ptr) == SSA_NAME)
> -                     ptr = copy_ssa_name (ptr, new_stmt);
> -                   else
> -                     ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> -                   gimple_assign_set_lhs (new_stmt, ptr);
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                   data_ref
> -                     = build2 (MEM_REF, vectype, ptr,
> -                               build_int_cst (ref_type, 0));
> -                   break;
> -                 }
> -               case dr_explicit_realign_optimized:
> -                 {
> -                   if (costing_p)
> -                     break;
> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> -                     new_temp = copy_ssa_name (dataref_ptr);
> -                   else
> -                     new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> -                   // We should only be doing this if we know the target
> -                   // alignment at compile time.
> -                   unsigned HOST_WIDE_INT align =
> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> -                   new_stmt = gimple_build_assign
> -                     (new_temp, BIT_AND_EXPR, dataref_ptr,
> -                      build_int_cst (TREE_TYPE (dataref_ptr),
> -                                    -(HOST_WIDE_INT) align));
> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> -                                                new_stmt, gsi);
> -                   data_ref
> -                     = build2 (MEM_REF, vectype, new_temp,
> -                               build_int_cst (ref_type, 0));
> -                   break;
> -                 }
> -               default:
> -                 gcc_unreachable ();
> -               }
> +               if (compute_in_loop)
> +                 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
> +                                               &realignment_token,
> +                                               dr_explicit_realign,
> +                                               dataref_ptr, NULL);
> +
> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> +                 ptr = copy_ssa_name (dataref_ptr);
> +               else
> +                 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> +               // For explicit realign the target alignment should be
> +               // known at compile time.
> +               unsigned HOST_WIDE_INT align
> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> +               new_stmt = gimple_build_assign (
> +                 ptr, BIT_AND_EXPR, dataref_ptr,
> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> +                                -(HOST_WIDE_INT) align));
> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +               data_ref
> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
> +               vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +               vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +               new_stmt = gimple_build_assign (vec_dest, data_ref);
> +               new_temp = make_ssa_name (vec_dest, new_stmt);
> +               gimple_assign_set_lhs (new_stmt, new_temp);
> +               gimple_move_vops (new_stmt, stmt_info->stmt);
> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +               msq = new_temp;
> +
> +               bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
> +               bump = size_binop (MINUS_EXPR, bump, size_one_node);
> +               ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
> +                                      bump);
> +               new_stmt = gimple_build_assign (
> +                 NULL_TREE, BIT_AND_EXPR, ptr,
> +                 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
> +               if (TREE_CODE (ptr) == SSA_NAME)
> +                 ptr = copy_ssa_name (ptr, new_stmt);
> +               else
> +                 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> +               gimple_assign_set_lhs (new_stmt, ptr);
> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +               data_ref
> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
> +               break;
> +             }
> +           case dr_explicit_realign_optimized:
> +             {
> +               if (costing_p)
> +                 break;
> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> +                 new_temp = copy_ssa_name (dataref_ptr);
> +               else
> +                 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> +               // We should only be doing this if we know the target
> +               // alignment at compile time.
> +               unsigned HOST_WIDE_INT align
> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> +               new_stmt = gimple_build_assign (
> +                 new_temp, BIT_AND_EXPR, dataref_ptr,
> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> +                                -(HOST_WIDE_INT) align));
> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +               data_ref = build2 (MEM_REF, vectype, new_temp,
> +                                  build_int_cst (ref_type, 0));
> +               break;
> +             }
> +           default:
> +             gcc_unreachable ();
> +           }
>
> -             /* One common place to cost the above vect load for different
> -                alignment support schemes.  */
> -             if (costing_p)
> -               {
> -                 /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> -                    only need to take care of the first stmt, whose
> -                    stmt_info is first_stmt_info, vec_num iterating on it
> -                    will cover the cost for the remaining, it's consistent
> -                    with transforming.  For the prologue cost for realign,
> -                    we only need to count it once for the whole group.  */
> -                 bool first_stmt_info_p = first_stmt_info == stmt_info;
> -                 bool add_realign_cost = first_stmt_info_p && i == 0;
> -                 if (memory_access_type == VMAT_CONTIGUOUS
> -                     || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> -                     || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> -                         && (!grouped_load || first_stmt_info_p)))
> -                   vect_get_load_cost (vinfo, stmt_info, 1,
> -                                       alignment_support_scheme, misalignment,
> -                                       add_realign_cost, &inside_cost,
> -                                       &prologue_cost, cost_vec, cost_vec,
> -                                       true);
> -               }
> -             else
> +         /* One common place to cost the above vect load for different
> +            alignment support schemes.  */
> +         if (costing_p)
> +           {
> +             /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> +                only need to take care of the first stmt, whose
> +                stmt_info is first_stmt_info, vec_num iterating on it
> +                will cover the cost for the remaining, it's consistent
> +                with transforming.  For the prologue cost for realign,
> +                we only need to count it once for the whole group.  */
> +             bool first_stmt_info_p = first_stmt_info == stmt_info;
> +             bool add_realign_cost = first_stmt_info_p && i == 0;
> +             if (memory_access_type == VMAT_CONTIGUOUS
> +                 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> +                 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> +                     && (!grouped_load || first_stmt_info_p)))
> +               vect_get_load_cost (vinfo, stmt_info, 1,
> +                                   alignment_support_scheme, misalignment,
> +                                   add_realign_cost, &inside_cost,
> +                                   &prologue_cost, cost_vec, cost_vec, true);
> +           }
> +         else
> +           {
> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +             /* DATA_REF is null if we've already built the statement.  */
> +             if (data_ref)
>                 {
> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
> -                 /* DATA_REF is null if we've already built the statement.  */
> -                 if (data_ref)
> -                   {
> -                     vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> -                     new_stmt = gimple_build_assign (vec_dest, data_ref);
> -                   }
> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> -                 gimple_set_lhs (new_stmt, new_temp);
> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
>                 }
> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> +             gimple_set_lhs (new_stmt, new_temp);
> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +           }
>
> -             /* 3. Handle explicit realignment if necessary/supported.
> -                Create in loop:
> -                  vec_dest = realign_load (msq, lsq, realignment_token)  */
> -             if (!costing_p
> -                 && (alignment_support_scheme == dr_explicit_realign_optimized
> -                     || alignment_support_scheme == dr_explicit_realign))
> -               {
> -                 lsq = gimple_assign_lhs (new_stmt);
> -                 if (!realignment_token)
> -                   realignment_token = dataref_ptr;
> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
> -                 new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
> -                                                 msq, lsq, realignment_token);
> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> -                 gimple_assign_set_lhs (new_stmt, new_temp);
> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +         /* 3. Handle explicit realignment if necessary/supported.
> +            Create in loop:
> +              vec_dest = realign_load (msq, lsq, realignment_token)  */
> +         if (!costing_p
> +             && (alignment_support_scheme == dr_explicit_realign_optimized
> +                 || alignment_support_scheme == dr_explicit_realign))
> +           {
> +             lsq = gimple_assign_lhs (new_stmt);
> +             if (!realignment_token)
> +               realignment_token = dataref_ptr;
> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +             new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
> +                                             lsq, realignment_token);
> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> +             gimple_assign_set_lhs (new_stmt, new_temp);
> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>
> -                 if (alignment_support_scheme == dr_explicit_realign_optimized)
> -                   {
> -                     gcc_assert (phi);
> -                     if (i == vec_num - 1 && j == ncopies - 1)
> -                       add_phi_arg (phi, lsq,
> -                                    loop_latch_edge (containing_loop),
> -                                    UNKNOWN_LOCATION);
> -                     msq = lsq;
> -                   }
> +             if (alignment_support_scheme == dr_explicit_realign_optimized)
> +               {
> +                 gcc_assert (phi);
> +                 if (i == vec_num - 1 && j == ncopies - 1)
> +                   add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
> +                                UNKNOWN_LOCATION);
> +                 msq = lsq;
>                 }
> +           }
>
> -             if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> +         if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> +           {
> +             if (costing_p)
> +               inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> +                                               stmt_info, 0, vect_body);
> +             else
>                 {
> -                 if (costing_p)
> -                   inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> -                                                   stmt_info, 0, vect_body);
> -                 else
> -                   {
> -                     tree perm_mask = perm_mask_for_reverse (vectype);
> -                     new_temp
> -                       = permute_vec_elements (vinfo, new_temp, new_temp,
> -                                               perm_mask, stmt_info, gsi);
> -                     new_stmt = SSA_NAME_DEF_STMT (new_temp);
> -                   }
> +                 tree perm_mask = perm_mask_for_reverse (vectype);
> +                 new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
> +                                                  perm_mask, stmt_info, gsi);
> +                 new_stmt = SSA_NAME_DEF_STMT (new_temp);
>                 }
> +           }
>
> -             /* Collect vector loads and later create their permutation in
> -                vect_transform_grouped_load ().  */
> -             if (!costing_p && (grouped_load || slp_perm))
> -               dr_chain.quick_push (new_temp);
> +         /* Collect vector loads and later create their permutation in
> +            vect_transform_grouped_load ().  */
> +         if (!costing_p && (grouped_load || slp_perm))
> +           dr_chain.quick_push (new_temp);
>
> -             /* Store vector loads in the corresponding SLP_NODE.  */
> -             if (!costing_p && slp && !slp_perm)
> -               slp_node->push_vec_def (new_stmt);
> +         /* Store vector loads in the corresponding SLP_NODE.  */
> +         if (!costing_p && slp && !slp_perm)
> +           slp_node->push_vec_def (new_stmt);
>
> -             /* With SLP permutation we load the gaps as well, without
> -                we need to skip the gaps after we manage to fully load
> -                all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> -             group_elt += nunits;
> -             if (!costing_p
> -                 && maybe_ne (group_gap_adj, 0U)
> -                 && !slp_perm
> -                 && known_eq (group_elt, group_size - group_gap_adj))
> -               {
> -                 poly_wide_int bump_val
> -                   = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> -                      * group_gap_adj);
> -                 if (tree_int_cst_sgn
> -                       (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> -                   bump_val = -bump_val;
> -                 tree bump = wide_int_to_tree (sizetype, bump_val);
> -                 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> -                                                gsi, stmt_info, bump);
> -                 group_elt = 0;
> -               }
> -           }
> -         /* Bump the vector pointer to account for a gap or for excess
> -            elements loaded for a permuted SLP load.  */
> +         /* With SLP permutation we load the gaps as well, without
> +            we need to skip the gaps after we manage to fully load
> +            all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> +         group_elt += nunits;
>           if (!costing_p
>               && maybe_ne (group_gap_adj, 0U)
> -             && slp_perm)
> +             && !slp_perm
> +             && known_eq (group_elt, group_size - group_gap_adj))
>             {
>               poly_wide_int bump_val
> -               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> -                  * group_gap_adj);
> -             if (tree_int_cst_sgn
> -                   (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> +               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> +             if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
> +                 == -1)
>                 bump_val = -bump_val;
>               tree bump = wide_int_to_tree (sizetype, bump_val);
>               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>                                              stmt_info, bump);
> +             group_elt = 0;
>             }
>         }
> +      /* Bump the vector pointer to account for a gap or for excess
> +        elements loaded for a permuted SLP load.  */
> +      if (!costing_p
> +         && maybe_ne (group_gap_adj, 0U)
> +         && slp_perm)
> +       {
> +         poly_wide_int bump_val
> +           = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> +         if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> +           bump_val = -bump_val;
> +         tree bump = wide_int_to_tree (sizetype, bump_val);
> +         dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> +                                        stmt_info, bump);
> +       }
>
>        if (slp && !slp_perm)
>         continue;
> @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
>             }
>         }
>        else
> -        {
> -          if (grouped_load)
> -           {
> -             if (memory_access_type != VMAT_LOAD_STORE_LANES)
> +       {
> +         if (grouped_load)
> +           {
> +             gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
> +             /* We assume that the cost of a single load-lanes instruction
> +                is equivalent to the cost of DR_GROUP_SIZE separate loads.
> +                If a grouped access is instead being provided by a
> +                load-and-permute operation, include the cost of the
> +                permutes.  */
> +             if (costing_p && first_stmt_info == stmt_info)
>                 {
> -                 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
> -                 /* We assume that the cost of a single load-lanes instruction
> -                    is equivalent to the cost of DR_GROUP_SIZE separate loads.
> -                    If a grouped access is instead being provided by a
> -                    load-and-permute operation, include the cost of the
> -                    permutes.  */
> -                 if (costing_p && first_stmt_info == stmt_info)
> -                   {
> -                     /* Uses an even and odd extract operations or shuffle
> -                        operations for each needed permute.  */
> -                     int group_size = DR_GROUP_SIZE (first_stmt_info);
> -                     int nstmts = ceil_log2 (group_size) * group_size;
> -                     inside_cost
> -                       += record_stmt_cost (cost_vec, nstmts, vec_perm,
> -                                            stmt_info, 0, vect_body);
> +                 /* Uses an even and odd extract operations or shuffle
> +                    operations for each needed permute.  */
> +                 int group_size = DR_GROUP_SIZE (first_stmt_info);
> +                 int nstmts = ceil_log2 (group_size) * group_size;
> +                 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
> +                                                  stmt_info, 0, vect_body);
>
> -                     if (dump_enabled_p ())
> -                       dump_printf_loc (
> -                         MSG_NOTE, vect_location,
> -                         "vect_model_load_cost: strided group_size = %d .\n",
> -                         group_size);
> -                   }
> -                 else if (!costing_p)
> -                   vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
> -                                                group_size, gsi);
> +                 if (dump_enabled_p ())
> +                   dump_printf_loc (MSG_NOTE, vect_location,
> +                                    "vect_model_load_cost:"
> +                                    "strided group_size = %d .\n",
> +                                    group_size);
> +               }
> +             else if (!costing_p)
> +               {
> +                 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
> +                                              group_size, gsi);
> +                 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>                 }
> -             if (!costing_p)
> -               *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>             }
>           else if (!costing_p)
>             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
>      {
>        gcc_assert (memory_access_type != VMAT_INVARIANT
>                   && memory_access_type != VMAT_ELEMENTWISE
> -                 && memory_access_type != VMAT_STRIDED_SLP);
> +                 && memory_access_type != VMAT_STRIDED_SLP
> +                 && memory_access_type != VMAT_LOAD_STORE_LANES);
>        if (dump_enabled_p ())
>         dump_printf_loc (MSG_NOTE, vect_location,
>                          "vect_model_load_cost: inside_cost = %u, "
> --
> 2.31.1
  
Kewen.Lin Aug. 14, 2023, 12:49 p.m. UTC | #2
Hi Richi,

on 2023/8/14 20:04, Richard Biener wrote:
> On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
>>
>> Hi,
>>
>> Following Richi's suggestion [1], this patch is to move the
>> handlings on VMAT_LOAD_STORE_LANES in the final loop nest
>> of function vectorizable_load to its own loop.  Basically
>> it duplicates the final loop nest, clean up some useless
>> set up code for the case of VMAT_LOAD_STORE_LANES, remove
>> some unreachable code.  Also remove the corresponding
>> handlings in the final loop nest.
>>
>> Bootstrapped and regtested on x86_64-redhat-linux,
>> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> 
> OK (I guess the big diff is mostly because of re-indenting).

Thanks!  Yes, there is some code in the original final loop nest like:

if (memory_access_type == VMAT_LOAD_STORE_LANES)
  {
    ...
  }
else
  {
    ...
  }

Once the VMAT_LOAD_STORE_LANES handling is moved out to its own loop,
the else arm becomes unconditional and is fully re-indented.
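
To make the shape of the change concrete, here is a small standalone
sketch of the same refactoring (generic names only, nothing taken from
tree-vect-stmts.cc; process_before, process_after and access_kind are
made up for illustration): the special case gets its own loop that
returns early, and what used to be the else arm becomes the
unconditional body of the remaining loop, which is why the textual
diff is dominated by re-indentation.

#include <vector>

enum access_kind { LOAD_STORE_LANES, OTHER };

/* Before: one loop whose body branches on the access kind.  */
int
process_before (access_kind kind, const std::vector<int> &v)
{
  int sum = 0;
  for (int x : v)
    {
      if (kind == LOAD_STORE_LANES)
        sum += 2 * x;   /* special-case handling */
      else
        sum += x;       /* general handling, the "else arm" */
    }
  return sum;
}

/* After: the special case is hoisted into its own loop that returns
   early, so the general handling is no longer nested under an else.  */
int
process_after (access_kind kind, const std::vector<int> &v)
{
  if (kind == LOAD_STORE_LANES)
    {
      int sum = 0;
      for (int x : v)
        sum += 2 * x;
      return sum;
    }

  int sum = 0;
  for (int x : v)
    sum += x;           /* formerly the else arm, now re-indented */
  return sum;
}

int
main ()
{
  std::vector<int> v = {1, 2, 3};
  return process_before (OTHER, v) == process_after (OTHER, v) ? 0 : 1;
}

Both functions compute the same result; only the control structure
changes, so the diff looks large while the logic stays the same.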

The other patch on VMAT_GATHER_SCATTER looks a bit better since
it doesn't need re-indenting.

BR,
Kewen

> 
> Thanks,
> Richard.
> 
>> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>>
>> gcc/ChangeLog:
>>
>>         * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
>>         VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
>>         and update the final nest accordingly.
>> ---
>>  gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
>>  1 file changed, 634 insertions(+), 641 deletions(-)
>>
>> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
>> index 4f2d088484c..c361e16cb7b 100644
>> --- a/gcc/tree-vect-stmts.cc
>> +++ b/gcc/tree-vect-stmts.cc
>> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
>>         vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
>>                                        &vec_masks, mask_vectype);
>>      }
>> +
>>    tree vec_mask = NULL_TREE;
>> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>> +    {
>> +      gcc_assert (alignment_support_scheme == dr_aligned
>> +                 || alignment_support_scheme == dr_unaligned_supported);
>> +      gcc_assert (grouped_load && !slp);
>> +
>> +      unsigned int inside_cost = 0, prologue_cost = 0;
>> +      for (j = 0; j < ncopies; j++)
>> +       {
>> +         if (costing_p)
>> +           {
>> +             /* An IFN_LOAD_LANES will load all its vector results,
>> +                regardless of which ones we actually need.  Account
>> +                for the cost of unused results.  */
>> +             if (first_stmt_info == stmt_info)
>> +               {
>> +                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
>> +                 stmt_vec_info next_stmt_info = first_stmt_info;
>> +                 do
>> +                   {
>> +                     gaps -= 1;
>> +                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
>> +                   }
>> +                 while (next_stmt_info);
>> +                 if (gaps)
>> +                   {
>> +                     if (dump_enabled_p ())
>> +                       dump_printf_loc (MSG_NOTE, vect_location,
>> +                                        "vect_model_load_cost: %d "
>> +                                        "unused vectors.\n",
>> +                                        gaps);
>> +                     vect_get_load_cost (vinfo, stmt_info, gaps,
>> +                                         alignment_support_scheme,
>> +                                         misalignment, false, &inside_cost,
>> +                                         &prologue_cost, cost_vec, cost_vec,
>> +                                         true);
>> +                   }
>> +               }
>> +             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
>> +                                 misalignment, false, &inside_cost,
>> +                                 &prologue_cost, cost_vec, cost_vec, true);
>> +             continue;
>> +           }
>> +
>> +         /* 1. Create the vector or array pointer update chain.  */
>> +         if (j == 0)
>> +           dataref_ptr
>> +             = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
>> +                                         at_loop, offset, &dummy, gsi,
>> +                                         &ptr_incr, false, bump);
>> +         else
>> +           {
>> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
>> +             dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>> +                                            stmt_info, bump);
>> +           }
>> +         if (mask)
>> +           vec_mask = vec_masks[j];
>> +
>> +         tree vec_array = create_vector_array (vectype, vec_num);
>> +
>> +         tree final_mask = NULL_TREE;
>> +         if (loop_masks)
>> +           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +                                            ncopies, vectype, j);
>> +         if (vec_mask)
>> +           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
>> +                                          vec_mask, gsi);
>> +
>> +         gcall *call;
>> +         if (final_mask)
>> +           {
>> +             /* Emit:
>> +                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
>> +                                               VEC_MASK).  */
>> +             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
>> +             tree alias_ptr = build_int_cst (ref_type, align);
>> +             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
>> +                                                dataref_ptr, alias_ptr,
>> +                                                final_mask);
>> +           }
>> +         else
>> +           {
>> +             /* Emit:
>> +                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
>> +             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
>> +             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
>> +           }
>> +         gimple_call_set_lhs (call, vec_array);
>> +         gimple_call_set_nothrow (call, true);
>> +         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>> +
>> +         dr_chain.create (vec_num);
>> +         /* Extract each vector into an SSA_NAME.  */
>> +         for (i = 0; i < vec_num; i++)
>> +           {
>> +             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
>> +                                           vec_array, i);
>> +             dr_chain.quick_push (new_temp);
>> +           }
>> +
>> +         /* Record the mapping between SSA_NAMEs and statements.  */
>> +         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
>> +
>> +         /* Record that VEC_ARRAY is now dead.  */
>> +         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
>> +
>> +         dr_chain.release ();
>> +
>> +         *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>> +       }
>> +
>> +      if (costing_p && dump_enabled_p ())
>> +       dump_printf_loc (MSG_NOTE, vect_location,
>> +                        "vect_model_load_cost: inside_cost = %u, "
>> +                        "prologue_cost = %u .\n",
>> +                        inside_cost, prologue_cost);
>> +
>> +      return true;
>> +    }
>> +
>>    poly_uint64 group_elt = 0;
>>    unsigned int inside_cost = 0, prologue_cost = 0;
>>    for (j = 0; j < ncopies; j++)
>> @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
>>         dr_chain.create (vec_num);
>>
>>        gimple *new_stmt = NULL;
>> -      if (memory_access_type == VMAT_LOAD_STORE_LANES)
>> +      for (i = 0; i < vec_num; i++)
>>         {
>> -         if (costing_p)
>> -           {
>> -             /* An IFN_LOAD_LANES will load all its vector results,
>> -                regardless of which ones we actually need.  Account
>> -                for the cost of unused results.  */
>> -             if (grouped_load && first_stmt_info == stmt_info)
>> -               {
>> -                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
>> -                 stmt_vec_info next_stmt_info = first_stmt_info;
>> -                 do
>> -                   {
>> -                     gaps -= 1;
>> -                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
>> -                   }
>> -                 while (next_stmt_info);
>> -                 if (gaps)
>> -                   {
>> -                     if (dump_enabled_p ())
>> -                       dump_printf_loc (MSG_NOTE, vect_location,
>> -                                        "vect_model_load_cost: %d "
>> -                                        "unused vectors.\n",
>> -                                        gaps);
>> -                     vect_get_load_cost (vinfo, stmt_info, gaps,
>> -                                         alignment_support_scheme,
>> -                                         misalignment, false, &inside_cost,
>> -                                         &prologue_cost, cost_vec, cost_vec,
>> -                                         true);
>> -                   }
>> -               }
>> -             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
>> -                                 misalignment, false, &inside_cost,
>> -                                 &prologue_cost, cost_vec, cost_vec, true);
>> -             continue;
>> -           }
>> -         tree vec_array;
>> -
>> -         vec_array = create_vector_array (vectype, vec_num);
>> -
>>           tree final_mask = NULL_TREE;
>> -         if (loop_masks)
>> -           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> -                                            ncopies, vectype, j);
>> -         if (vec_mask)
>> -           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> -                                          final_mask, vec_mask, gsi);
>> -
>> -         gcall *call;
>> -         if (final_mask)
>> -           {
>> -             /* Emit:
>> -                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
>> -                                               VEC_MASK).  */
>> -             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
>> -             tree alias_ptr = build_int_cst (ref_type, align);
>> -             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
>> -                                                dataref_ptr, alias_ptr,
>> -                                                final_mask);
>> -           }
>> -         else
>> +         tree final_len = NULL_TREE;
>> +         tree bias = NULL_TREE;
>> +         if (!costing_p)
>>             {
>> -             /* Emit:
>> -                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
>> -             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
>> -             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
>> -           }
>> -         gimple_call_set_lhs (call, vec_array);
>> -         gimple_call_set_nothrow (call, true);
>> -         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>> -         new_stmt = call;
>> +             if (loop_masks)
>> +               final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> +                                                vec_num * ncopies, vectype,
>> +                                                vec_num * j + i);
>> +             if (vec_mask)
>> +               final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> +                                              final_mask, vec_mask, gsi);
>>
>> -         /* Extract each vector into an SSA_NAME.  */
>> -         for (i = 0; i < vec_num; i++)
>> -           {
>> -             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
>> -                                           vec_array, i);
>> -             dr_chain.quick_push (new_temp);
>> +             if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> +                                              gsi, stmt_info, bump);
>>             }
>>
>> -         /* Record the mapping between SSA_NAMEs and statements.  */
>> -         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
>> -
>> -         /* Record that VEC_ARRAY is now dead.  */
>> -         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
>> -       }
>> -      else
>> -       {
>> -         for (i = 0; i < vec_num; i++)
>> +         /* 2. Create the vector-load in the loop.  */
>> +         switch (alignment_support_scheme)
>>             {
>> -             tree final_mask = NULL_TREE;
>> -             tree final_len = NULL_TREE;
>> -             tree bias = NULL_TREE;
>> -             if (!costing_p)
>> -               {
>> -                 if (loop_masks)
>> -                   final_mask
>> -                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>> -                                           vec_num * ncopies, vectype,
>> -                                           vec_num * j + i);
>> -                 if (vec_mask)
>> -                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>> -                                                  final_mask, vec_mask, gsi);
>> -
>> -                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> -                                                  gsi, stmt_info, bump);
>> -               }
>> +           case dr_aligned:
>> +           case dr_unaligned_supported:
>> +             {
>> +               unsigned int misalign;
>> +               unsigned HOST_WIDE_INT align;
>>
>> -             /* 2. Create the vector-load in the loop.  */
>> -             switch (alignment_support_scheme)
>> -               {
>> -               case dr_aligned:
>> -               case dr_unaligned_supported:
>> +               if (memory_access_type == VMAT_GATHER_SCATTER
>> +                   && gs_info.ifn != IFN_LAST)
>>                   {
>> -                   unsigned int misalign;
>> -                   unsigned HOST_WIDE_INT align;
>> -
>> -                   if (memory_access_type == VMAT_GATHER_SCATTER
>> -                       && gs_info.ifn != IFN_LAST)
>> +                   if (costing_p)
>>                       {
>> -                       if (costing_p)
>> -                         {
>> -                           unsigned int cnunits
>> -                             = vect_nunits_for_cost (vectype);
>> -                           inside_cost
>> -                             = record_stmt_cost (cost_vec, cnunits,
>> -                                                 scalar_load, stmt_info, 0,
>> -                                                 vect_body);
>> -                           break;
>> -                         }
>> -                       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> -                         vec_offset = vec_offsets[vec_num * j + i];
>> -                       tree zero = build_zero_cst (vectype);
>> -                       tree scale = size_int (gs_info.scale);
>> -
>> -                       if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>> -                         {
>> -                           if (loop_lens)
>> -                             final_len
>> -                               = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> -                                                    vec_num * ncopies, vectype,
>> -                                                    vec_num * j + i, 1);
>> -                           else
>> -                             final_len = build_int_cst (sizetype,
>> -                                                        TYPE_VECTOR_SUBPARTS (
>> -                                                          vectype));
>> -                           signed char biasval
>> -                             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> -                           bias = build_int_cst (intQI_type_node, biasval);
>> -                           if (!final_mask)
>> -                             {
>> -                               mask_vectype = truth_type_for (vectype);
>> -                               final_mask = build_minus_one_cst (mask_vectype);
>> -                             }
>> -                         }
>> -
>> -                       gcall *call;
>> -                       if (final_len && final_mask)
>> -                         call = gimple_build_call_internal (
>> -                           IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
>> -                           vec_offset, scale, zero, final_mask, final_len,
>> -                           bias);
>> -                       else if (final_mask)
>> -                         call = gimple_build_call_internal
>> -                           (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
>> -                            vec_offset, scale, zero, final_mask);
>> -                       else
>> -                         call = gimple_build_call_internal
>> -                           (IFN_GATHER_LOAD, 4, dataref_ptr,
>> -                            vec_offset, scale, zero);
>> -                       gimple_call_set_nothrow (call, true);
>> -                       new_stmt = call;
>> -                       data_ref = NULL_TREE;
>> +                       unsigned int cnunits = vect_nunits_for_cost (vectype);
>> +                       inside_cost
>> +                         = record_stmt_cost (cost_vec, cnunits, scalar_load,
>> +                                             stmt_info, 0, vect_body);
>>                         break;
>>                       }
>> -                   else if (memory_access_type == VMAT_GATHER_SCATTER)
>> +                   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>> +                     vec_offset = vec_offsets[vec_num * j + i];
>> +                   tree zero = build_zero_cst (vectype);
>> +                   tree scale = size_int (gs_info.scale);
>> +
>> +                   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
>>                       {
>> -                       /* Emulated gather-scatter.  */
>> -                       gcc_assert (!final_mask);
>> -                       unsigned HOST_WIDE_INT const_nunits
>> -                         = nunits.to_constant ();
>> -                       if (costing_p)
>> -                         {
>> -                           /* For emulated gathers N offset vector element
>> -                              offset add is consumed by the load).  */
>> -                           inside_cost
>> -                             = record_stmt_cost (cost_vec, const_nunits,
>> -                                                 vec_to_scalar, stmt_info, 0,
>> -                                                 vect_body);
>> -                           /* N scalar loads plus gathering them into a
>> -                              vector.  */
>> -                           inside_cost
>> -                             = record_stmt_cost (cost_vec, const_nunits,
>> -                                                 scalar_load, stmt_info, 0,
>> -                                                 vect_body);
>> -                           inside_cost
>> -                             = record_stmt_cost (cost_vec, 1, vec_construct,
>> -                                                 stmt_info, 0, vect_body);
>> -                           break;
>> -                         }
>> -                       unsigned HOST_WIDE_INT const_offset_nunits
>> -                         = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> -                             .to_constant ();
>> -                       vec<constructor_elt, va_gc> *ctor_elts;
>> -                       vec_alloc (ctor_elts, const_nunits);
>> -                       gimple_seq stmts = NULL;
>> -                       /* We support offset vectors with more elements
>> -                          than the data vector for now.  */
>> -                       unsigned HOST_WIDE_INT factor
>> -                         = const_offset_nunits / const_nunits;
>> -                       vec_offset = vec_offsets[j / factor];
>> -                       unsigned elt_offset = (j % factor) * const_nunits;
>> -                       tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> -                       tree scale = size_int (gs_info.scale);
>> -                       align
>> -                         = get_object_alignment (DR_REF (first_dr_info->dr));
>> -                       tree ltype = build_aligned_type (TREE_TYPE (vectype),
>> -                                                        align);
>> -                       for (unsigned k = 0; k < const_nunits; ++k)
>> +                       if (loop_lens)
>> +                         final_len
>> +                           = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> +                                                vec_num * ncopies, vectype,
>> +                                                vec_num * j + i, 1);
>> +                       else
>> +                         final_len
>> +                           = build_int_cst (sizetype,
>> +                                            TYPE_VECTOR_SUBPARTS (vectype));
>> +                       signed char biasval
>> +                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> +                       bias = build_int_cst (intQI_type_node, biasval);
>> +                       if (!final_mask)
>>                           {
>> -                           tree boff = size_binop (MULT_EXPR,
>> -                                                   TYPE_SIZE (idx_type),
>> -                                                   bitsize_int
>> -                                                     (k + elt_offset));
>> -                           tree idx = gimple_build (&stmts, BIT_FIELD_REF,
>> -                                                    idx_type, vec_offset,
>> -                                                    TYPE_SIZE (idx_type),
>> -                                                    boff);
>> -                           idx = gimple_convert (&stmts, sizetype, idx);
>> -                           idx = gimple_build (&stmts, MULT_EXPR,
>> -                                               sizetype, idx, scale);
>> -                           tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> -                                                    TREE_TYPE (dataref_ptr),
>> -                                                    dataref_ptr, idx);
>> -                           ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> -                           tree elt = make_ssa_name (TREE_TYPE (vectype));
>> -                           tree ref = build2 (MEM_REF, ltype, ptr,
>> -                                              build_int_cst (ref_type, 0));
>> -                           new_stmt = gimple_build_assign (elt, ref);
>> -                           gimple_set_vuse (new_stmt,
>> -                                            gimple_vuse (gsi_stmt (*gsi)));
>> -                           gimple_seq_add_stmt (&stmts, new_stmt);
>> -                           CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>> +                           mask_vectype = truth_type_for (vectype);
>> +                           final_mask = build_minus_one_cst (mask_vectype);
>>                           }
>> -                       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> -                       new_stmt = gimple_build_assign (NULL_TREE,
>> -                                                       build_constructor
>> -                                                         (vectype, ctor_elts));
>> -                       data_ref = NULL_TREE;
>> -                       break;
>>                       }
>>
>> -                   if (costing_p)
>> -                     break;
>> -
>> -                   align =
>> -                     known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>> -                   if (alignment_support_scheme == dr_aligned)
>> -                     misalign = 0;
>> -                   else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
>> -                     {
>> -                       align = dr_alignment
>> -                         (vect_dr_behavior (vinfo, first_dr_info));
>> -                       misalign = 0;
>> -                     }
>> +                   gcall *call;
>> +                   if (final_len && final_mask)
>> +                     call = gimple_build_call_internal (
>> +                       IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
>> +                       scale, zero, final_mask, final_len, bias);
>> +                   else if (final_mask)
>> +                     call
>> +                       = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
>> +                                                     dataref_ptr, vec_offset,
>> +                                                     scale, zero, final_mask);
>>                     else
>> -                     misalign = misalignment;
>> -                   if (dataref_offset == NULL_TREE
>> -                       && TREE_CODE (dataref_ptr) == SSA_NAME)
>> -                     set_ptr_info_alignment (get_ptr_info (dataref_ptr),
>> -                                             align, misalign);
>> -                   align = least_bit_hwi (misalign | align);
>> -
>> -                   /* Compute IFN when LOOP_LENS or final_mask valid.  */
>> -                   machine_mode vmode = TYPE_MODE (vectype);
>> -                   machine_mode new_vmode = vmode;
>> -                   internal_fn partial_ifn = IFN_LAST;
>> -                   if (loop_lens)
>> +                     call
>> +                       = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
>> +                                                     dataref_ptr, vec_offset,
>> +                                                     scale, zero);
>> +                   gimple_call_set_nothrow (call, true);
>> +                   new_stmt = call;
>> +                   data_ref = NULL_TREE;
>> +                   break;
>> +                 }
>> +               else if (memory_access_type == VMAT_GATHER_SCATTER)
>> +                 {
>> +                   /* Emulated gather-scatter.  */
>> +                   gcc_assert (!final_mask);
>> +                   unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
>> +                   if (costing_p)
>>                       {
>> -                       opt_machine_mode new_ovmode
>> -                         = get_len_load_store_mode (vmode, true,
>> -                                                    &partial_ifn);
>> -                       new_vmode = new_ovmode.require ();
>> -                       unsigned factor = (new_ovmode == vmode)
>> -                                           ? 1
>> -                                           : GET_MODE_UNIT_SIZE (vmode);
>> -                       final_len
>> -                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> -                                              vec_num * ncopies, vectype,
>> -                                              vec_num * j + i, factor);
>> +                       /* For emulated gathers N offset vector element
>> +                          offset add is consumed by the load).  */
>> +                       inside_cost
>> +                         = record_stmt_cost (cost_vec, const_nunits,
>> +                                             vec_to_scalar, stmt_info, 0,
>> +                                             vect_body);
>> +                       /* N scalar loads plus gathering them into a
>> +                          vector.  */
>> +                       inside_cost = record_stmt_cost (cost_vec, const_nunits,
>> +                                                       scalar_load, stmt_info,
>> +                                                       0, vect_body);
>> +                       inside_cost
>> +                         = record_stmt_cost (cost_vec, 1, vec_construct,
>> +                                             stmt_info, 0, vect_body);
>> +                       break;
>>                       }
>> -                   else if (final_mask)
>> +                   unsigned HOST_WIDE_INT const_offset_nunits
>> +                     = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
>> +                         .to_constant ();
>> +                   vec<constructor_elt, va_gc> *ctor_elts;
>> +                   vec_alloc (ctor_elts, const_nunits);
>> +                   gimple_seq stmts = NULL;
>> +                   /* We support offset vectors with more elements
>> +                      than the data vector for now.  */
>> +                   unsigned HOST_WIDE_INT factor
>> +                     = const_offset_nunits / const_nunits;
>> +                   vec_offset = vec_offsets[j / factor];
>> +                   unsigned elt_offset = (j % factor) * const_nunits;
>> +                   tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
>> +                   tree scale = size_int (gs_info.scale);
>> +                   align = get_object_alignment (DR_REF (first_dr_info->dr));
>> +                   tree ltype
>> +                     = build_aligned_type (TREE_TYPE (vectype), align);
>> +                   for (unsigned k = 0; k < const_nunits; ++k)
>>                       {
>> -                       if (!can_vec_mask_load_store_p (
>> -                             vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
>> -                             &partial_ifn))
>> -                         gcc_unreachable ();
>> +                       tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
>> +                                               bitsize_int (k + elt_offset));
>> +                       tree idx = gimple_build (&stmts, BIT_FIELD_REF,
>> +                                                idx_type, vec_offset,
>> +                                                TYPE_SIZE (idx_type), boff);
>> +                       idx = gimple_convert (&stmts, sizetype, idx);
>> +                       idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
>> +                                           scale);
>> +                       tree ptr = gimple_build (&stmts, PLUS_EXPR,
>> +                                                TREE_TYPE (dataref_ptr),
>> +                                                dataref_ptr, idx);
>> +                       ptr = gimple_convert (&stmts, ptr_type_node, ptr);
>> +                       tree elt = make_ssa_name (TREE_TYPE (vectype));
>> +                       tree ref = build2 (MEM_REF, ltype, ptr,
>> +                                          build_int_cst (ref_type, 0));
>> +                       new_stmt = gimple_build_assign (elt, ref);
>> +                       gimple_set_vuse (new_stmt,
>> +                                        gimple_vuse (gsi_stmt (*gsi)));
>> +                       gimple_seq_add_stmt (&stmts, new_stmt);
>> +                       CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
>>                       }
>> +                   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
>> +                   new_stmt = gimple_build_assign (
>> +                     NULL_TREE, build_constructor (vectype, ctor_elts));
>> +                   data_ref = NULL_TREE;
>> +                   break;
>> +                 }
>>
>> -                   if (partial_ifn == IFN_MASK_LEN_LOAD)
>> +               if (costing_p)
>> +                 break;
>> +
>> +               align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>> +               if (alignment_support_scheme == dr_aligned)
>> +                 misalign = 0;
>> +               else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
>> +                 {
>> +                   align
>> +                     = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
>> +                   misalign = 0;
>> +                 }
>> +               else
>> +                 misalign = misalignment;
>> +               if (dataref_offset == NULL_TREE
>> +                   && TREE_CODE (dataref_ptr) == SSA_NAME)
>> +                 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
>> +                                         misalign);
>> +               align = least_bit_hwi (misalign | align);
>> +
>> +               /* Compute IFN when LOOP_LENS or final_mask valid.  */
>> +               machine_mode vmode = TYPE_MODE (vectype);
>> +               machine_mode new_vmode = vmode;
>> +               internal_fn partial_ifn = IFN_LAST;
>> +               if (loop_lens)
>> +                 {
>> +                   opt_machine_mode new_ovmode
>> +                     = get_len_load_store_mode (vmode, true, &partial_ifn);
>> +                   new_vmode = new_ovmode.require ();
>> +                   unsigned factor
>> +                     = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
>> +                   final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
>> +                                                  vec_num * ncopies, vectype,
>> +                                                  vec_num * j + i, factor);
>> +                 }
>> +               else if (final_mask)
>> +                 {
>> +                   if (!can_vec_mask_load_store_p (
>> +                         vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
>> +                         &partial_ifn))
>> +                     gcc_unreachable ();
>> +                 }
>> +
>> +               if (partial_ifn == IFN_MASK_LEN_LOAD)
>> +                 {
>> +                   if (!final_len)
>>                       {
>> -                       if (!final_len)
>> -                         {
>> -                           /* Pass VF value to 'len' argument of
>> -                              MASK_LEN_LOAD if LOOP_LENS is invalid.  */
>> -                           final_len
>> -                             = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>> -                         }
>> -                       if (!final_mask)
>> -                         {
>> -                           /* Pass all ones value to 'mask' argument of
>> -                              MASK_LEN_LOAD if final_mask is invalid.  */
>> -                           mask_vectype = truth_type_for (vectype);
>> -                           final_mask = build_minus_one_cst (mask_vectype);
>> -                         }
>> +                       /* Pass VF value to 'len' argument of
>> +                          MASK_LEN_LOAD if LOOP_LENS is invalid.  */
>> +                       final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>>                       }
>> -                   if (final_len)
>> +                   if (!final_mask)
>>                       {
>> -                       signed char biasval
>> -                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>> -
>> -                       bias = build_int_cst (intQI_type_node, biasval);
>> +                       /* Pass all ones value to 'mask' argument of
>> +                          MASK_LEN_LOAD if final_mask is invalid.  */
>> +                       mask_vectype = truth_type_for (vectype);
>> +                       final_mask = build_minus_one_cst (mask_vectype);
>>                       }
>> +                 }
>> +               if (final_len)
>> +                 {
>> +                   signed char biasval
>> +                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
>>
>> -                   if (final_len)
>> +                   bias = build_int_cst (intQI_type_node, biasval);
>> +                 }
>> +
>> +               if (final_len)
>> +                 {
>> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>> +                   gcall *call;
>> +                   if (partial_ifn == IFN_MASK_LEN_LOAD)
>> +                     call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
>> +                                                        dataref_ptr, ptr,
>> +                                                        final_mask, final_len,
>> +                                                        bias);
>> +                   else
>> +                     call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>> +                                                        dataref_ptr, ptr,
>> +                                                        final_len, bias);
>> +                   gimple_call_set_nothrow (call, true);
>> +                   new_stmt = call;
>> +                   data_ref = NULL_TREE;
>> +
>> +                   /* Need conversion if it's wrapped with VnQI.  */
>> +                   if (vmode != new_vmode)
>>                       {
>> -                       tree ptr
>> -                         = build_int_cst (ref_type, align * BITS_PER_UNIT);
>> -                       gcall *call;
>> -                       if (partial_ifn == IFN_MASK_LEN_LOAD)
>> -                         call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
>> -                                                            5, dataref_ptr,
>> -                                                            ptr, final_mask,
>> -                                                            final_len, bias);
>> -                       else
>> -                         call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>> -                                                            dataref_ptr, ptr,
>> -                                                            final_len, bias);
>> -                       gimple_call_set_nothrow (call, true);
>> -                       new_stmt = call;
>> -                       data_ref = NULL_TREE;
>> -
>> -                       /* Need conversion if it's wrapped with VnQI.  */
>> -                       if (vmode != new_vmode)
>> -                         {
>> -                           tree new_vtype = build_vector_type_for_mode (
>> -                             unsigned_intQI_type_node, new_vmode);
>> -                           tree var = vect_get_new_ssa_name (new_vtype,
>> -                                                             vect_simple_var);
>> -                           gimple_set_lhs (call, var);
>> -                           vect_finish_stmt_generation (vinfo, stmt_info, call,
>> -                                                        gsi);
>> -                           tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
>> -                           new_stmt
>> -                             = gimple_build_assign (vec_dest,
>> -                                                    VIEW_CONVERT_EXPR, op);
>> -                         }
>> +                       tree new_vtype = build_vector_type_for_mode (
>> +                         unsigned_intQI_type_node, new_vmode);
>> +                       tree var
>> +                         = vect_get_new_ssa_name (new_vtype, vect_simple_var);
>> +                       gimple_set_lhs (call, var);
>> +                       vect_finish_stmt_generation (vinfo, stmt_info, call,
>> +                                                    gsi);
>> +                       tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
>> +                       new_stmt = gimple_build_assign (vec_dest,
>> +                                                       VIEW_CONVERT_EXPR, op);
>>                       }
>> -                   else if (final_mask)
>> +                 }
>> +               else if (final_mask)
>> +                 {
>> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
>> +                   gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
>> +                                                             dataref_ptr, ptr,
>> +                                                             final_mask);
>> +                   gimple_call_set_nothrow (call, true);
>> +                   new_stmt = call;
>> +                   data_ref = NULL_TREE;
>> +                 }
>> +               else
>> +                 {
>> +                   tree ltype = vectype;
>> +                   tree new_vtype = NULL_TREE;
>> +                   unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
>> +                   unsigned int vect_align
>> +                     = vect_known_alignment_in_bytes (first_dr_info, vectype);
>> +                   unsigned int scalar_dr_size
>> +                     = vect_get_scalar_dr_size (first_dr_info);
>> +                   /* If there's no peeling for gaps but we have a gap
>> +                      with slp loads then load the lower half of the
>> +                      vector only.  See get_group_load_store_type for
>> +                      when we apply this optimization.  */
>> +                   if (slp
>> +                       && loop_vinfo
>> +                       && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
>> +                       && known_eq (nunits, (group_size - gap) * 2)
>> +                       && known_eq (nunits, group_size)
>> +                       && gap >= (vect_align / scalar_dr_size))
>>                       {
>> -                       tree ptr = build_int_cst (ref_type,
>> -                                                 align * BITS_PER_UNIT);
>> -                       gcall *call
>> -                         = gimple_build_call_internal (IFN_MASK_LOAD, 3,
>> -                                                       dataref_ptr, ptr,
>> -                                                       final_mask);
>> -                       gimple_call_set_nothrow (call, true);
>> -                       new_stmt = call;
>> -                       data_ref = NULL_TREE;
>> +                       tree half_vtype;
>> +                       new_vtype
>> +                         = vector_vector_composition_type (vectype, 2,
>> +                                                           &half_vtype);
>> +                       if (new_vtype != NULL_TREE)
>> +                         ltype = half_vtype;
>>                       }
>> +                   tree offset
>> +                     = (dataref_offset ? dataref_offset
>> +                                       : build_int_cst (ref_type, 0));
>> +                   if (ltype != vectype
>> +                       && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +                     {
>> +                       unsigned HOST_WIDE_INT gap_offset
>> +                         = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
>> +                       tree gapcst = build_int_cst (ref_type, gap_offset);
>> +                       offset = size_binop (PLUS_EXPR, offset, gapcst);
>> +                     }
>> +                   data_ref
>> +                     = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
>> +                   if (alignment_support_scheme == dr_aligned)
>> +                     ;
>>                     else
>> +                     TREE_TYPE (data_ref)
>> +                       = build_aligned_type (TREE_TYPE (data_ref),
>> +                                             align * BITS_PER_UNIT);
>> +                   if (ltype != vectype)
>>                       {
>> -                       tree ltype = vectype;
>> -                       tree new_vtype = NULL_TREE;
>> -                       unsigned HOST_WIDE_INT gap
>> -                         = DR_GROUP_GAP (first_stmt_info);
>> -                       unsigned int vect_align
>> -                         = vect_known_alignment_in_bytes (first_dr_info,
>> -                                                          vectype);
>> -                       unsigned int scalar_dr_size
>> -                         = vect_get_scalar_dr_size (first_dr_info);
>> -                       /* If there's no peeling for gaps but we have a gap
>> -                          with slp loads then load the lower half of the
>> -                          vector only.  See get_group_load_store_type for
>> -                          when we apply this optimization.  */
>> -                       if (slp
>> -                           && loop_vinfo
>> -                           && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
>> -                           && gap != 0
>> -                           && known_eq (nunits, (group_size - gap) * 2)
>> -                           && known_eq (nunits, group_size)
>> -                           && gap >= (vect_align / scalar_dr_size))
>> +                       vect_copy_ref_info (data_ref,
>> +                                           DR_REF (first_dr_info->dr));
>> +                       tree tem = make_ssa_name (ltype);
>> +                       new_stmt = gimple_build_assign (tem, data_ref);
>> +                       vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
>> +                                                    gsi);
>> +                       data_ref = NULL;
>> +                       vec<constructor_elt, va_gc> *v;
>> +                       vec_alloc (v, 2);
>> +                       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>>                           {
>> -                           tree half_vtype;
>> -                           new_vtype
>> -                             = vector_vector_composition_type (vectype, 2,
>> -                                                               &half_vtype);
>> -                           if (new_vtype != NULL_TREE)
>> -                             ltype = half_vtype;
>> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
>> +                                                   build_zero_cst (ltype));
>> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
>>                           }
>> -                       tree offset
>> -                         = (dataref_offset ? dataref_offset
>> -                                           : build_int_cst (ref_type, 0));
>> -                       if (ltype != vectype
>> -                           && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +                       else
>>                           {
>> -                           unsigned HOST_WIDE_INT gap_offset
>> -                             = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
>> -                           tree gapcst = build_int_cst (ref_type, gap_offset);
>> -                           offset = size_binop (PLUS_EXPR, offset, gapcst);
>> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
>> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
>> +                                                   build_zero_cst (ltype));
>>                           }
>> -                       data_ref
>> -                         = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
>> -                       if (alignment_support_scheme == dr_aligned)
>> -                         ;
>> +                       gcc_assert (new_vtype != NULL_TREE);
>> +                       if (new_vtype == vectype)
>> +                         new_stmt = gimple_build_assign (
>> +                           vec_dest, build_constructor (vectype, v));
>>                         else
>> -                         TREE_TYPE (data_ref)
>> -                           = build_aligned_type (TREE_TYPE (data_ref),
>> -                                                 align * BITS_PER_UNIT);
>> -                       if (ltype != vectype)
>>                           {
>> -                           vect_copy_ref_info (data_ref,
>> -                                               DR_REF (first_dr_info->dr));
>> -                           tree tem = make_ssa_name (ltype);
>> -                           new_stmt = gimple_build_assign (tem, data_ref);
>> +                           tree new_vname = make_ssa_name (new_vtype);
>> +                           new_stmt = gimple_build_assign (
>> +                             new_vname, build_constructor (new_vtype, v));
>>                             vect_finish_stmt_generation (vinfo, stmt_info,
>>                                                          new_stmt, gsi);
>> -                           data_ref = NULL;
>> -                           vec<constructor_elt, va_gc> *v;
>> -                           vec_alloc (v, 2);
>> -                           if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> -                             {
>> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
>> -                                                       build_zero_cst (ltype));
>> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
>> -                             }
>> -                           else
>> -                             {
>> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
>> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
>> -                                                       build_zero_cst (ltype));
>> -                             }
>> -                           gcc_assert (new_vtype != NULL_TREE);
>> -                           if (new_vtype == vectype)
>> -                             new_stmt = gimple_build_assign (
>> -                               vec_dest, build_constructor (vectype, v));
>> -                           else
>> -                             {
>> -                               tree new_vname = make_ssa_name (new_vtype);
>> -                               new_stmt = gimple_build_assign (
>> -                                 new_vname, build_constructor (new_vtype, v));
>> -                               vect_finish_stmt_generation (vinfo, stmt_info,
>> -                                                            new_stmt, gsi);
>> -                               new_stmt = gimple_build_assign (
>> -                                 vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
>> -                                                   new_vname));
>> -                             }
>> +                           new_stmt = gimple_build_assign (
>> +                             vec_dest,
>> +                             build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
>>                           }
>>                       }
>> -                   break;
>>                   }
>> -               case dr_explicit_realign:
>> -                 {
>> -                   if (costing_p)
>> -                     break;
>> -                   tree ptr, bump;
>> -
>> -                   tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>> +               break;
>> +             }
>> +           case dr_explicit_realign:
>> +             {
>> +               if (costing_p)
>> +                 break;
>> +               tree ptr, bump;
>>
>> -                   if (compute_in_loop)
>> -                     msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
>> -                                                   &realignment_token,
>> -                                                   dr_explicit_realign,
>> -                                                   dataref_ptr, NULL);
>> +               tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>>
>> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
>> -                     ptr = copy_ssa_name (dataref_ptr);
>> -                   else
>> -                     ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
>> -                   // For explicit realign the target alignment should be
>> -                   // known at compile time.
>> -                   unsigned HOST_WIDE_INT align =
>> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
>> -                   new_stmt = gimple_build_assign
>> -                                (ptr, BIT_AND_EXPR, dataref_ptr,
>> -                                 build_int_cst
>> -                                 (TREE_TYPE (dataref_ptr),
>> -                                  -(HOST_WIDE_INT) align));
>> -                   vect_finish_stmt_generation (vinfo, stmt_info,
>> -                                                new_stmt, gsi);
>> -                   data_ref
>> -                     = build2 (MEM_REF, vectype, ptr,
>> -                               build_int_cst (ref_type, 0));
>> -                   vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> -                   vec_dest = vect_create_destination_var (scalar_dest,
>> -                                                           vectype);
>> -                   new_stmt = gimple_build_assign (vec_dest, data_ref);
>> -                   new_temp = make_ssa_name (vec_dest, new_stmt);
>> -                   gimple_assign_set_lhs (new_stmt, new_temp);
>> -                   gimple_move_vops (new_stmt, stmt_info->stmt);
>> -                   vect_finish_stmt_generation (vinfo, stmt_info,
>> -                                                new_stmt, gsi);
>> -                   msq = new_temp;
>> -
>> -                   bump = size_binop (MULT_EXPR, vs,
>> -                                      TYPE_SIZE_UNIT (elem_type));
>> -                   bump = size_binop (MINUS_EXPR, bump, size_one_node);
>> -                   ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
>> -                                          stmt_info, bump);
>> -                   new_stmt = gimple_build_assign
>> -                                (NULL_TREE, BIT_AND_EXPR, ptr,
>> -                                 build_int_cst
>> -                                 (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
>> -                   if (TREE_CODE (ptr) == SSA_NAME)
>> -                     ptr = copy_ssa_name (ptr, new_stmt);
>> -                   else
>> -                     ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
>> -                   gimple_assign_set_lhs (new_stmt, ptr);
>> -                   vect_finish_stmt_generation (vinfo, stmt_info,
>> -                                                new_stmt, gsi);
>> -                   data_ref
>> -                     = build2 (MEM_REF, vectype, ptr,
>> -                               build_int_cst (ref_type, 0));
>> -                   break;
>> -                 }
>> -               case dr_explicit_realign_optimized:
>> -                 {
>> -                   if (costing_p)
>> -                     break;
>> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
>> -                     new_temp = copy_ssa_name (dataref_ptr);
>> -                   else
>> -                     new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
>> -                   // We should only be doing this if we know the target
>> -                   // alignment at compile time.
>> -                   unsigned HOST_WIDE_INT align =
>> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
>> -                   new_stmt = gimple_build_assign
>> -                     (new_temp, BIT_AND_EXPR, dataref_ptr,
>> -                      build_int_cst (TREE_TYPE (dataref_ptr),
>> -                                    -(HOST_WIDE_INT) align));
>> -                   vect_finish_stmt_generation (vinfo, stmt_info,
>> -                                                new_stmt, gsi);
>> -                   data_ref
>> -                     = build2 (MEM_REF, vectype, new_temp,
>> -                               build_int_cst (ref_type, 0));
>> -                   break;
>> -                 }
>> -               default:
>> -                 gcc_unreachable ();
>> -               }
>> +               if (compute_in_loop)
>> +                 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
>> +                                               &realignment_token,
>> +                                               dr_explicit_realign,
>> +                                               dataref_ptr, NULL);
>> +
>> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
>> +                 ptr = copy_ssa_name (dataref_ptr);
>> +               else
>> +                 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
>> +               // For explicit realign the target alignment should be
>> +               // known at compile time.
>> +               unsigned HOST_WIDE_INT align
>> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
>> +               new_stmt = gimple_build_assign (
>> +                 ptr, BIT_AND_EXPR, dataref_ptr,
>> +                 build_int_cst (TREE_TYPE (dataref_ptr),
>> +                                -(HOST_WIDE_INT) align));
>> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +               data_ref
>> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
>> +               vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +               vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +               new_stmt = gimple_build_assign (vec_dest, data_ref);
>> +               new_temp = make_ssa_name (vec_dest, new_stmt);
>> +               gimple_assign_set_lhs (new_stmt, new_temp);
>> +               gimple_move_vops (new_stmt, stmt_info->stmt);
>> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +               msq = new_temp;
>> +
>> +               bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
>> +               bump = size_binop (MINUS_EXPR, bump, size_one_node);
>> +               ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
>> +                                      bump);
>> +               new_stmt = gimple_build_assign (
>> +                 NULL_TREE, BIT_AND_EXPR, ptr,
>> +                 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
>> +               if (TREE_CODE (ptr) == SSA_NAME)
>> +                 ptr = copy_ssa_name (ptr, new_stmt);
>> +               else
>> +                 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
>> +               gimple_assign_set_lhs (new_stmt, ptr);
>> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +               data_ref
>> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
>> +               break;
>> +             }
>> +           case dr_explicit_realign_optimized:
>> +             {
>> +               if (costing_p)
>> +                 break;
>> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
>> +                 new_temp = copy_ssa_name (dataref_ptr);
>> +               else
>> +                 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
>> +               // We should only be doing this if we know the target
>> +               // alignment at compile time.
>> +               unsigned HOST_WIDE_INT align
>> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
>> +               new_stmt = gimple_build_assign (
>> +                 new_temp, BIT_AND_EXPR, dataref_ptr,
>> +                 build_int_cst (TREE_TYPE (dataref_ptr),
>> +                                -(HOST_WIDE_INT) align));
>> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +               data_ref = build2 (MEM_REF, vectype, new_temp,
>> +                                  build_int_cst (ref_type, 0));
>> +               break;
>> +             }
>> +           default:
>> +             gcc_unreachable ();
>> +           }
>>
>> -             /* One common place to cost the above vect load for different
>> -                alignment support schemes.  */
>> -             if (costing_p)
>> -               {
>> -                 /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
>> -                    only need to take care of the first stmt, whose
>> -                    stmt_info is first_stmt_info, vec_num iterating on it
>> -                    will cover the cost for the remaining, it's consistent
>> -                    with transforming.  For the prologue cost for realign,
>> -                    we only need to count it once for the whole group.  */
>> -                 bool first_stmt_info_p = first_stmt_info == stmt_info;
>> -                 bool add_realign_cost = first_stmt_info_p && i == 0;
>> -                 if (memory_access_type == VMAT_CONTIGUOUS
>> -                     || memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> -                     || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
>> -                         && (!grouped_load || first_stmt_info_p)))
>> -                   vect_get_load_cost (vinfo, stmt_info, 1,
>> -                                       alignment_support_scheme, misalignment,
>> -                                       add_realign_cost, &inside_cost,
>> -                                       &prologue_cost, cost_vec, cost_vec,
>> -                                       true);
>> -               }
>> -             else
>> +         /* One common place to cost the above vect load for different
>> +            alignment support schemes.  */
>> +         if (costing_p)
>> +           {
>> +             /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
>> +                only need to take care of the first stmt, whose
>> +                stmt_info is first_stmt_info, vec_num iterating on it
>> +                will cover the cost for the remaining, it's consistent
>> +                with transforming.  For the prologue cost for realign,
>> +                we only need to count it once for the whole group.  */
>> +             bool first_stmt_info_p = first_stmt_info == stmt_info;
>> +             bool add_realign_cost = first_stmt_info_p && i == 0;
>> +             if (memory_access_type == VMAT_CONTIGUOUS
>> +                 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
>> +                 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
>> +                     && (!grouped_load || first_stmt_info_p)))
>> +               vect_get_load_cost (vinfo, stmt_info, 1,
>> +                                   alignment_support_scheme, misalignment,
>> +                                   add_realign_cost, &inside_cost,
>> +                                   &prologue_cost, cost_vec, cost_vec, true);
>> +           }
>> +         else
>> +           {
>> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +             /* DATA_REF is null if we've already built the statement.  */
>> +             if (data_ref)
>>                 {
>> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> -                 /* DATA_REF is null if we've already built the statement.  */
>> -                 if (data_ref)
>> -                   {
>> -                     vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> -                     new_stmt = gimple_build_assign (vec_dest, data_ref);
>> -                   }
>> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
>> -                 gimple_set_lhs (new_stmt, new_temp);
>> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
>> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
>>                 }
>> +             new_temp = make_ssa_name (vec_dest, new_stmt);
>> +             gimple_set_lhs (new_stmt, new_temp);
>> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +           }
>>
>> -             /* 3. Handle explicit realignment if necessary/supported.
>> -                Create in loop:
>> -                  vec_dest = realign_load (msq, lsq, realignment_token)  */
>> -             if (!costing_p
>> -                 && (alignment_support_scheme == dr_explicit_realign_optimized
>> -                     || alignment_support_scheme == dr_explicit_realign))
>> -               {
>> -                 lsq = gimple_assign_lhs (new_stmt);
>> -                 if (!realignment_token)
>> -                   realignment_token = dataref_ptr;
>> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> -                 new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
>> -                                                 msq, lsq, realignment_token);
>> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
>> -                 gimple_assign_set_lhs (new_stmt, new_temp);
>> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>> +         /* 3. Handle explicit realignment if necessary/supported.
>> +            Create in loop:
>> +              vec_dest = realign_load (msq, lsq, realignment_token)  */
>> +         if (!costing_p
>> +             && (alignment_support_scheme == dr_explicit_realign_optimized
>> +                 || alignment_support_scheme == dr_explicit_realign))
>> +           {
>> +             lsq = gimple_assign_lhs (new_stmt);
>> +             if (!realignment_token)
>> +               realignment_token = dataref_ptr;
>> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
>> +             new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
>> +                                             lsq, realignment_token);
>> +             new_temp = make_ssa_name (vec_dest, new_stmt);
>> +             gimple_assign_set_lhs (new_stmt, new_temp);
>> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>>
>> -                 if (alignment_support_scheme == dr_explicit_realign_optimized)
>> -                   {
>> -                     gcc_assert (phi);
>> -                     if (i == vec_num - 1 && j == ncopies - 1)
>> -                       add_phi_arg (phi, lsq,
>> -                                    loop_latch_edge (containing_loop),
>> -                                    UNKNOWN_LOCATION);
>> -                     msq = lsq;
>> -                   }
>> +             if (alignment_support_scheme == dr_explicit_realign_optimized)
>> +               {
>> +                 gcc_assert (phi);
>> +                 if (i == vec_num - 1 && j == ncopies - 1)
>> +                   add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
>> +                                UNKNOWN_LOCATION);
>> +                 msq = lsq;
>>                 }
>> +           }
>>
>> -             if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +         if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
>> +           {
>> +             if (costing_p)
>> +               inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
>> +                                               stmt_info, 0, vect_body);
>> +             else
>>                 {
>> -                 if (costing_p)
>> -                   inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
>> -                                                   stmt_info, 0, vect_body);
>> -                 else
>> -                   {
>> -                     tree perm_mask = perm_mask_for_reverse (vectype);
>> -                     new_temp
>> -                       = permute_vec_elements (vinfo, new_temp, new_temp,
>> -                                               perm_mask, stmt_info, gsi);
>> -                     new_stmt = SSA_NAME_DEF_STMT (new_temp);
>> -                   }
>> +                 tree perm_mask = perm_mask_for_reverse (vectype);
>> +                 new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
>> +                                                  perm_mask, stmt_info, gsi);
>> +                 new_stmt = SSA_NAME_DEF_STMT (new_temp);
>>                 }
>> +           }
>>
>> -             /* Collect vector loads and later create their permutation in
>> -                vect_transform_grouped_load ().  */
>> -             if (!costing_p && (grouped_load || slp_perm))
>> -               dr_chain.quick_push (new_temp);
>> +         /* Collect vector loads and later create their permutation in
>> +            vect_transform_grouped_load ().  */
>> +         if (!costing_p && (grouped_load || slp_perm))
>> +           dr_chain.quick_push (new_temp);
>>
>> -             /* Store vector loads in the corresponding SLP_NODE.  */
>> -             if (!costing_p && slp && !slp_perm)
>> -               slp_node->push_vec_def (new_stmt);
>> +         /* Store vector loads in the corresponding SLP_NODE.  */
>> +         if (!costing_p && slp && !slp_perm)
>> +           slp_node->push_vec_def (new_stmt);
>>
>> -             /* With SLP permutation we load the gaps as well, without
>> -                we need to skip the gaps after we manage to fully load
>> -                all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
>> -             group_elt += nunits;
>> -             if (!costing_p
>> -                 && maybe_ne (group_gap_adj, 0U)
>> -                 && !slp_perm
>> -                 && known_eq (group_elt, group_size - group_gap_adj))
>> -               {
>> -                 poly_wide_int bump_val
>> -                   = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
>> -                      * group_gap_adj);
>> -                 if (tree_int_cst_sgn
>> -                       (vect_dr_behavior (vinfo, dr_info)->step) == -1)
>> -                   bump_val = -bump_val;
>> -                 tree bump = wide_int_to_tree (sizetype, bump_val);
>> -                 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>> -                                                gsi, stmt_info, bump);
>> -                 group_elt = 0;
>> -               }
>> -           }
>> -         /* Bump the vector pointer to account for a gap or for excess
>> -            elements loaded for a permuted SLP load.  */
>> +         /* With SLP permutation we load the gaps as well, without
>> +            we need to skip the gaps after we manage to fully load
>> +            all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
>> +         group_elt += nunits;
>>           if (!costing_p
>>               && maybe_ne (group_gap_adj, 0U)
>> -             && slp_perm)
>> +             && !slp_perm
>> +             && known_eq (group_elt, group_size - group_gap_adj))
>>             {
>>               poly_wide_int bump_val
>> -               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
>> -                  * group_gap_adj);
>> -             if (tree_int_cst_sgn
>> -                   (vect_dr_behavior (vinfo, dr_info)->step) == -1)
>> +               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
>> +             if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
>> +                 == -1)
>>                 bump_val = -bump_val;
>>               tree bump = wide_int_to_tree (sizetype, bump_val);
>>               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>>                                              stmt_info, bump);
>> +             group_elt = 0;
>>             }
>>         }
>> +      /* Bump the vector pointer to account for a gap or for excess
>> +        elements loaded for a permuted SLP load.  */
>> +      if (!costing_p
>> +         && maybe_ne (group_gap_adj, 0U)
>> +         && slp_perm)
>> +       {
>> +         poly_wide_int bump_val
>> +           = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
>> +         if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
>> +           bump_val = -bump_val;
>> +         tree bump = wide_int_to_tree (sizetype, bump_val);
>> +         dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>> +                                        stmt_info, bump);
>> +       }
>>
>>        if (slp && !slp_perm)
>>         continue;
>> @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
>>             }
>>         }
>>        else
>> -        {
>> -          if (grouped_load)
>> -           {
>> -             if (memory_access_type != VMAT_LOAD_STORE_LANES)
>> +       {
>> +         if (grouped_load)
>> +           {
>> +             gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> +             /* We assume that the cost of a single load-lanes instruction
>> +                is equivalent to the cost of DR_GROUP_SIZE separate loads.
>> +                If a grouped access is instead being provided by a
>> +                load-and-permute operation, include the cost of the
>> +                permutes.  */
>> +             if (costing_p && first_stmt_info == stmt_info)
>>                 {
>> -                 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>> -                 /* We assume that the cost of a single load-lanes instruction
>> -                    is equivalent to the cost of DR_GROUP_SIZE separate loads.
>> -                    If a grouped access is instead being provided by a
>> -                    load-and-permute operation, include the cost of the
>> -                    permutes.  */
>> -                 if (costing_p && first_stmt_info == stmt_info)
>> -                   {
>> -                     /* Uses an even and odd extract operations or shuffle
>> -                        operations for each needed permute.  */
>> -                     int group_size = DR_GROUP_SIZE (first_stmt_info);
>> -                     int nstmts = ceil_log2 (group_size) * group_size;
>> -                     inside_cost
>> -                       += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> -                                            stmt_info, 0, vect_body);
>> +                 /* Uses an even and odd extract operations or shuffle
>> +                    operations for each needed permute.  */
>> +                 int group_size = DR_GROUP_SIZE (first_stmt_info);
>> +                 int nstmts = ceil_log2 (group_size) * group_size;
>> +                 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
>> +                                                  stmt_info, 0, vect_body);
>>
>> -                     if (dump_enabled_p ())
>> -                       dump_printf_loc (
>> -                         MSG_NOTE, vect_location,
>> -                         "vect_model_load_cost: strided group_size = %d .\n",
>> -                         group_size);
>> -                   }
>> -                 else if (!costing_p)
>> -                   vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> -                                                group_size, gsi);
>> +                 if (dump_enabled_p ())
>> +                   dump_printf_loc (MSG_NOTE, vect_location,
>> +                                    "vect_model_load_cost:"
>> +                                    "strided group_size = %d .\n",
>> +                                    group_size);
>> +               }
>> +             else if (!costing_p)
>> +               {
>> +                 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
>> +                                              group_size, gsi);
>> +                 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>                 }
>> -             if (!costing_p)
>> -               *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
>>             }
>>           else if (!costing_p)
>>             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
>>      {
>>        gcc_assert (memory_access_type != VMAT_INVARIANT
>>                   && memory_access_type != VMAT_ELEMENTWISE
>> -                 && memory_access_type != VMAT_STRIDED_SLP);
>> +                 && memory_access_type != VMAT_STRIDED_SLP
>> +                 && memory_access_type != VMAT_LOAD_STORE_LANES);
>>        if (dump_enabled_p ())
>>         dump_printf_loc (MSG_NOTE, vect_location,
>>                          "vect_model_load_cost: inside_cost = %u, "
>> --
>> 2.31.1
>
  
Richard Biener Aug. 14, 2023, 1:47 p.m. UTC | #3
On Mon, Aug 14, 2023 at 2:49 PM Kewen.Lin <linkw@linux.ibm.com> wrote:
>
> Hi Richi,
>
> on 2023/8/14 20:04, Richard Biener wrote:
> > On Mon, Aug 14, 2023 at 10:54 AM Kewen.Lin <linkw@linux.ibm.com> wrote:
> >>
> >> Hi,
> >>
> >> Following Richi's suggestion [1], this patch is to move the
> >> handlings on VMAT_LOAD_STORE_LANES in the final loop nest
> >> of function vectorizable_load to its own loop.  Basically
> >> it duplicates the final loop nest, clean up some useless
> >> set up code for the case of VMAT_LOAD_STORE_LANES, remove
> >> some unreachable code.  Also remove the corresponding
> >> handlings in the final loop nest.
> >>
> >> Bootstrapped and regtested on x86_64-redhat-linux,
> >> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
> >
> > OK (I guess the big diff is mostly because of re-indenting).
>
> Thanks!  Yes, there is some code in the original final loop nest like
>
> if (memory_access_type == VMAT_LOAD_STORE_LANES)
>   {
>     ...
>   }
> else
>   {
>     ...
>   }
>
> Then the else arm is fully re-indented.
>
> The other patch on VMAT_GATHER_SCATTER looks a bit better since
> it doesn't need re-indenting.

Yes, that's also because VMAT_LOAD_STORE_LANES isn't used for SLP, so it
makes even more sense to split that case out.
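
As a rough standalone illustration (a minimal sketch with hypothetical
names such as process_copies, not the actual tree-vect-stmts.cc code),
the shape of the change is to give the special case its own loop and
return early, so the remaining loop nest drops the dead branch and one
level of indentation:

#include <iostream>
#include <vector>

enum memory_access_kind { ACCESS_LOAD_STORE_LANES, ACCESS_OTHER };

static void
process_copies (memory_access_kind kind, const std::vector<int> &copies)
{
  if (kind == ACCESS_LOAD_STORE_LANES)
    {
      /* Dedicated loop: only the set-up this case needs, then return
         early, mirroring how the patch handles VMAT_LOAD_STORE_LANES
         in its own loop over the copies.  */
      for (int c : copies)
        std::cout << "lanes copy " << c << "\n";
      return;
    }

  /* The final loop nest no longer tests for the lanes case, so its
     body loses the corresponding branches and one indentation level.  */
  for (int c : copies)
    std::cout << "other copy " << c << "\n";
}

int
main ()
{
  process_copies (ACCESS_LOAD_STORE_LANES, {0, 1});
  process_copies (ACCESS_OTHER, {0, 1});
  return 0;
}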

Richard.

> BR,
> Kewen
>
> >
> > Thanks,
> > Richard.
> >
> >> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
> >>
> >> gcc/ChangeLog:
> >>
> >>         * tree-vect-stmts.cc (vectorizable_load): Move the handlings on
> >>         VMAT_LOAD_STORE_LANES in the final loop nest to its own loop,
> >>         and update the final nest accordingly.
> >> ---
> >>  gcc/tree-vect-stmts.cc | 1275 ++++++++++++++++++++--------------------
> >>  1 file changed, 634 insertions(+), 641 deletions(-)
> >>
> >> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> >> index 4f2d088484c..c361e16cb7b 100644
> >> --- a/gcc/tree-vect-stmts.cc
> >> +++ b/gcc/tree-vect-stmts.cc
> >> @@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
> >>         vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
> >>                                        &vec_masks, mask_vectype);
> >>      }
> >> +
> >>    tree vec_mask = NULL_TREE;
> >> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >> +    {
> >> +      gcc_assert (alignment_support_scheme == dr_aligned
> >> +                 || alignment_support_scheme == dr_unaligned_supported);
> >> +      gcc_assert (grouped_load && !slp);
> >> +
> >> +      unsigned int inside_cost = 0, prologue_cost = 0;
> >> +      for (j = 0; j < ncopies; j++)
> >> +       {
> >> +         if (costing_p)
> >> +           {
> >> +             /* An IFN_LOAD_LANES will load all its vector results,
> >> +                regardless of which ones we actually need.  Account
> >> +                for the cost of unused results.  */
> >> +             if (first_stmt_info == stmt_info)
> >> +               {
> >> +                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> >> +                 stmt_vec_info next_stmt_info = first_stmt_info;
> >> +                 do
> >> +                   {
> >> +                     gaps -= 1;
> >> +                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> >> +                   }
> >> +                 while (next_stmt_info);
> >> +                 if (gaps)
> >> +                   {
> >> +                     if (dump_enabled_p ())
> >> +                       dump_printf_loc (MSG_NOTE, vect_location,
> >> +                                        "vect_model_load_cost: %d "
> >> +                                        "unused vectors.\n",
> >> +                                        gaps);
> >> +                     vect_get_load_cost (vinfo, stmt_info, gaps,
> >> +                                         alignment_support_scheme,
> >> +                                         misalignment, false, &inside_cost,
> >> +                                         &prologue_cost, cost_vec, cost_vec,
> >> +                                         true);
> >> +                   }
> >> +               }
> >> +             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
> >> +                                 misalignment, false, &inside_cost,
> >> +                                 &prologue_cost, cost_vec, cost_vec, true);
> >> +             continue;
> >> +           }
> >> +
> >> +         /* 1. Create the vector or array pointer update chain.  */
> >> +         if (j == 0)
> >> +           dataref_ptr
> >> +             = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> >> +                                         at_loop, offset, &dummy, gsi,
> >> +                                         &ptr_incr, false, bump);
> >> +         else
> >> +           {
> >> +             gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> >> +             dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> >> +                                            stmt_info, bump);
> >> +           }
> >> +         if (mask)
> >> +           vec_mask = vec_masks[j];
> >> +
> >> +         tree vec_array = create_vector_array (vectype, vec_num);
> >> +
> >> +         tree final_mask = NULL_TREE;
> >> +         if (loop_masks)
> >> +           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> +                                            ncopies, vectype, j);
> >> +         if (vec_mask)
> >> +           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
> >> +                                          vec_mask, gsi);
> >> +
> >> +         gcall *call;
> >> +         if (final_mask)
> >> +           {
> >> +             /* Emit:
> >> +                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> >> +                                               VEC_MASK).  */
> >> +             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> >> +             tree alias_ptr = build_int_cst (ref_type, align);
> >> +             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> >> +                                                dataref_ptr, alias_ptr,
> >> +                                                final_mask);
> >> +           }
> >> +         else
> >> +           {
> >> +             /* Emit:
> >> +                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
> >> +             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
> >> +             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
> >> +           }
> >> +         gimple_call_set_lhs (call, vec_array);
> >> +         gimple_call_set_nothrow (call, true);
> >> +         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> >> +
> >> +         dr_chain.create (vec_num);
> >> +         /* Extract each vector into an SSA_NAME.  */
> >> +         for (i = 0; i < vec_num; i++)
> >> +           {
> >> +             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
> >> +                                           vec_array, i);
> >> +             dr_chain.quick_push (new_temp);
> >> +           }
> >> +
> >> +         /* Record the mapping between SSA_NAMEs and statements.  */
> >> +         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> >> +
> >> +         /* Record that VEC_ARRAY is now dead.  */
> >> +         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> >> +
> >> +         dr_chain.release ();
> >> +
> >> +         *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >> +       }
> >> +
> >> +      if (costing_p && dump_enabled_p ())
> >> +       dump_printf_loc (MSG_NOTE, vect_location,
> >> +                        "vect_model_load_cost: inside_cost = %u, "
> >> +                        "prologue_cost = %u .\n",
> >> +                        inside_cost, prologue_cost);
> >> +
> >> +      return true;
> >> +    }
> >> +
> >>    poly_uint64 group_elt = 0;
> >>    unsigned int inside_cost = 0, prologue_cost = 0;
> >>    for (j = 0; j < ncopies; j++)
> >> @@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
> >>         dr_chain.create (vec_num);
> >>
> >>        gimple *new_stmt = NULL;
> >> -      if (memory_access_type == VMAT_LOAD_STORE_LANES)
> >> +      for (i = 0; i < vec_num; i++)
> >>         {
> >> -         if (costing_p)
> >> -           {
> >> -             /* An IFN_LOAD_LANES will load all its vector results,
> >> -                regardless of which ones we actually need.  Account
> >> -                for the cost of unused results.  */
> >> -             if (grouped_load && first_stmt_info == stmt_info)
> >> -               {
> >> -                 unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
> >> -                 stmt_vec_info next_stmt_info = first_stmt_info;
> >> -                 do
> >> -                   {
> >> -                     gaps -= 1;
> >> -                     next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
> >> -                   }
> >> -                 while (next_stmt_info);
> >> -                 if (gaps)
> >> -                   {
> >> -                     if (dump_enabled_p ())
> >> -                       dump_printf_loc (MSG_NOTE, vect_location,
> >> -                                        "vect_model_load_cost: %d "
> >> -                                        "unused vectors.\n",
> >> -                                        gaps);
> >> -                     vect_get_load_cost (vinfo, stmt_info, gaps,
> >> -                                         alignment_support_scheme,
> >> -                                         misalignment, false, &inside_cost,
> >> -                                         &prologue_cost, cost_vec, cost_vec,
> >> -                                         true);
> >> -                   }
> >> -               }
> >> -             vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
> >> -                                 misalignment, false, &inside_cost,
> >> -                                 &prologue_cost, cost_vec, cost_vec, true);
> >> -             continue;
> >> -           }
> >> -         tree vec_array;
> >> -
> >> -         vec_array = create_vector_array (vectype, vec_num);
> >> -
> >>           tree final_mask = NULL_TREE;
> >> -         if (loop_masks)
> >> -           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> -                                            ncopies, vectype, j);
> >> -         if (vec_mask)
> >> -           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> -                                          final_mask, vec_mask, gsi);
> >> -
> >> -         gcall *call;
> >> -         if (final_mask)
> >> -           {
> >> -             /* Emit:
> >> -                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> >> -                                               VEC_MASK).  */
> >> -             unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> >> -             tree alias_ptr = build_int_cst (ref_type, align);
> >> -             call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
> >> -                                                dataref_ptr, alias_ptr,
> >> -                                                final_mask);
> >> -           }
> >> -         else
> >> +         tree final_len = NULL_TREE;
> >> +         tree bias = NULL_TREE;
> >> +         if (!costing_p)
> >>             {
> >> -             /* Emit:
> >> -                  VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
> >> -             data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
> >> -             call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
> >> -           }
> >> -         gimple_call_set_lhs (call, vec_array);
> >> -         gimple_call_set_nothrow (call, true);
> >> -         vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> >> -         new_stmt = call;
> >> +             if (loop_masks)
> >> +               final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> +                                                vec_num * ncopies, vectype,
> >> +                                                vec_num * j + i);
> >> +             if (vec_mask)
> >> +               final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> +                                              final_mask, vec_mask, gsi);
> >>
> >> -         /* Extract each vector into an SSA_NAME.  */
> >> -         for (i = 0; i < vec_num; i++)
> >> -           {
> >> -             new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
> >> -                                           vec_array, i);
> >> -             dr_chain.quick_push (new_temp);
> >> +             if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> +                                              gsi, stmt_info, bump);
> >>             }
> >>
> >> -         /* Record the mapping between SSA_NAMEs and statements.  */
> >> -         vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
> >> -
> >> -         /* Record that VEC_ARRAY is now dead.  */
> >> -         vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
> >> -       }
> >> -      else
> >> -       {
> >> -         for (i = 0; i < vec_num; i++)
> >> +         /* 2. Create the vector-load in the loop.  */
> >> +         switch (alignment_support_scheme)
> >>             {
> >> -             tree final_mask = NULL_TREE;
> >> -             tree final_len = NULL_TREE;
> >> -             tree bias = NULL_TREE;
> >> -             if (!costing_p)
> >> -               {
> >> -                 if (loop_masks)
> >> -                   final_mask
> >> -                     = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> >> -                                           vec_num * ncopies, vectype,
> >> -                                           vec_num * j + i);
> >> -                 if (vec_mask)
> >> -                   final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> >> -                                                  final_mask, vec_mask, gsi);
> >> -
> >> -                 if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> -                   dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> -                                                  gsi, stmt_info, bump);
> >> -               }
> >> +           case dr_aligned:
> >> +           case dr_unaligned_supported:
> >> +             {
> >> +               unsigned int misalign;
> >> +               unsigned HOST_WIDE_INT align;
> >>
> >> -             /* 2. Create the vector-load in the loop.  */
> >> -             switch (alignment_support_scheme)
> >> -               {
> >> -               case dr_aligned:
> >> -               case dr_unaligned_supported:
> >> +               if (memory_access_type == VMAT_GATHER_SCATTER
> >> +                   && gs_info.ifn != IFN_LAST)
> >>                   {
> >> -                   unsigned int misalign;
> >> -                   unsigned HOST_WIDE_INT align;
> >> -
> >> -                   if (memory_access_type == VMAT_GATHER_SCATTER
> >> -                       && gs_info.ifn != IFN_LAST)
> >> +                   if (costing_p)
> >>                       {
> >> -                       if (costing_p)
> >> -                         {
> >> -                           unsigned int cnunits
> >> -                             = vect_nunits_for_cost (vectype);
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, cnunits,
> >> -                                                 scalar_load, stmt_info, 0,
> >> -                                                 vect_body);
> >> -                           break;
> >> -                         }
> >> -                       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> -                         vec_offset = vec_offsets[vec_num * j + i];
> >> -                       tree zero = build_zero_cst (vectype);
> >> -                       tree scale = size_int (gs_info.scale);
> >> -
> >> -                       if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> >> -                         {
> >> -                           if (loop_lens)
> >> -                             final_len
> >> -                               = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> >> -                                                    vec_num * ncopies, vectype,
> >> -                                                    vec_num * j + i, 1);
> >> -                           else
> >> -                             final_len = build_int_cst (sizetype,
> >> -                                                        TYPE_VECTOR_SUBPARTS (
> >> -                                                          vectype));
> >> -                           signed char biasval
> >> -                             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> >> -                           bias = build_int_cst (intQI_type_node, biasval);
> >> -                           if (!final_mask)
> >> -                             {
> >> -                               mask_vectype = truth_type_for (vectype);
> >> -                               final_mask = build_minus_one_cst (mask_vectype);
> >> -                             }
> >> -                         }
> >> -
> >> -                       gcall *call;
> >> -                       if (final_len && final_mask)
> >> -                         call = gimple_build_call_internal (
> >> -                           IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
> >> -                           vec_offset, scale, zero, final_mask, final_len,
> >> -                           bias);
> >> -                       else if (final_mask)
> >> -                         call = gimple_build_call_internal
> >> -                           (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
> >> -                            vec_offset, scale, zero, final_mask);
> >> -                       else
> >> -                         call = gimple_build_call_internal
> >> -                           (IFN_GATHER_LOAD, 4, dataref_ptr,
> >> -                            vec_offset, scale, zero);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> +                       unsigned int cnunits = vect_nunits_for_cost (vectype);
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, cnunits, scalar_load,
> >> +                                             stmt_info, 0, vect_body);
> >>                         break;
> >>                       }
> >> -                   else if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +                   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> >> +                     vec_offset = vec_offsets[vec_num * j + i];
> >> +                   tree zero = build_zero_cst (vectype);
> >> +                   tree scale = size_int (gs_info.scale);
> >> +
> >> +                   if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> >>                       {
> >> -                       /* Emulated gather-scatter.  */
> >> -                       gcc_assert (!final_mask);
> >> -                       unsigned HOST_WIDE_INT const_nunits
> >> -                         = nunits.to_constant ();
> >> -                       if (costing_p)
> >> -                         {
> >> -                           /* For emulated gathers N offset vector element
> >> -                              offset add is consumed by the load).  */
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, const_nunits,
> >> -                                                 vec_to_scalar, stmt_info, 0,
> >> -                                                 vect_body);
> >> -                           /* N scalar loads plus gathering them into a
> >> -                              vector.  */
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, const_nunits,
> >> -                                                 scalar_load, stmt_info, 0,
> >> -                                                 vect_body);
> >> -                           inside_cost
> >> -                             = record_stmt_cost (cost_vec, 1, vec_construct,
> >> -                                                 stmt_info, 0, vect_body);
> >> -                           break;
> >> -                         }
> >> -                       unsigned HOST_WIDE_INT const_offset_nunits
> >> -                         = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> >> -                             .to_constant ();
> >> -                       vec<constructor_elt, va_gc> *ctor_elts;
> >> -                       vec_alloc (ctor_elts, const_nunits);
> >> -                       gimple_seq stmts = NULL;
> >> -                       /* We support offset vectors with more elements
> >> -                          than the data vector for now.  */
> >> -                       unsigned HOST_WIDE_INT factor
> >> -                         = const_offset_nunits / const_nunits;
> >> -                       vec_offset = vec_offsets[j / factor];
> >> -                       unsigned elt_offset = (j % factor) * const_nunits;
> >> -                       tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> >> -                       tree scale = size_int (gs_info.scale);
> >> -                       align
> >> -                         = get_object_alignment (DR_REF (first_dr_info->dr));
> >> -                       tree ltype = build_aligned_type (TREE_TYPE (vectype),
> >> -                                                        align);
> >> -                       for (unsigned k = 0; k < const_nunits; ++k)
> >> +                       if (loop_lens)
> >> +                         final_len
> >> +                           = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> >> +                                                vec_num * ncopies, vectype,
> >> +                                                vec_num * j + i, 1);
> >> +                       else
> >> +                         final_len
> >> +                           = build_int_cst (sizetype,
> >> +                                            TYPE_VECTOR_SUBPARTS (vectype));
> >> +                       signed char biasval
> >> +                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> >> +                       bias = build_int_cst (intQI_type_node, biasval);
> >> +                       if (!final_mask)
> >>                           {
> >> -                           tree boff = size_binop (MULT_EXPR,
> >> -                                                   TYPE_SIZE (idx_type),
> >> -                                                   bitsize_int
> >> -                                                     (k + elt_offset));
> >> -                           tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> >> -                                                    idx_type, vec_offset,
> >> -                                                    TYPE_SIZE (idx_type),
> >> -                                                    boff);
> >> -                           idx = gimple_convert (&stmts, sizetype, idx);
> >> -                           idx = gimple_build (&stmts, MULT_EXPR,
> >> -                                               sizetype, idx, scale);
> >> -                           tree ptr = gimple_build (&stmts, PLUS_EXPR,
> >> -                                                    TREE_TYPE (dataref_ptr),
> >> -                                                    dataref_ptr, idx);
> >> -                           ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> >> -                           tree elt = make_ssa_name (TREE_TYPE (vectype));
> >> -                           tree ref = build2 (MEM_REF, ltype, ptr,
> >> -                                              build_int_cst (ref_type, 0));
> >> -                           new_stmt = gimple_build_assign (elt, ref);
> >> -                           gimple_set_vuse (new_stmt,
> >> -                                            gimple_vuse (gsi_stmt (*gsi)));
> >> -                           gimple_seq_add_stmt (&stmts, new_stmt);
> >> -                           CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> >> +                           mask_vectype = truth_type_for (vectype);
> >> +                           final_mask = build_minus_one_cst (mask_vectype);
> >>                           }
> >> -                       gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >> -                       new_stmt = gimple_build_assign (NULL_TREE,
> >> -                                                       build_constructor
> >> -                                                         (vectype, ctor_elts));
> >> -                       data_ref = NULL_TREE;
> >> -                       break;
> >>                       }
> >>
> >> -                   if (costing_p)
> >> -                     break;
> >> -
> >> -                   align =
> >> -                     known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
> >> -                   if (alignment_support_scheme == dr_aligned)
> >> -                     misalign = 0;
> >> -                   else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> >> -                     {
> >> -                       align = dr_alignment
> >> -                         (vect_dr_behavior (vinfo, first_dr_info));
> >> -                       misalign = 0;
> >> -                     }
> >> +                   gcall *call;
> >> +                   if (final_len && final_mask)
> >> +                     call = gimple_build_call_internal (
> >> +                       IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
> >> +                       scale, zero, final_mask, final_len, bias);
> >> +                   else if (final_mask)
> >> +                     call
> >> +                       = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> >> +                                                     dataref_ptr, vec_offset,
> >> +                                                     scale, zero, final_mask);
> >>                     else
> >> -                     misalign = misalignment;
> >> -                   if (dataref_offset == NULL_TREE
> >> -                       && TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     set_ptr_info_alignment (get_ptr_info (dataref_ptr),
> >> -                                             align, misalign);
> >> -                   align = least_bit_hwi (misalign | align);
> >> -
> >> -                   /* Compute IFN when LOOP_LENS or final_mask valid.  */
> >> -                   machine_mode vmode = TYPE_MODE (vectype);
> >> -                   machine_mode new_vmode = vmode;
> >> -                   internal_fn partial_ifn = IFN_LAST;
> >> -                   if (loop_lens)
> >> +                     call
> >> +                       = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> >> +                                                     dataref_ptr, vec_offset,
> >> +                                                     scale, zero);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +                   break;
> >> +                 }
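
The branch quoted just above picks one of three gather internal functions
based on which partial-vector controls exist: a length (with a mask forced
to all-ones if absent) selects the 7-argument MASK_LEN_GATHER_LOAD form, a
mask alone selects MASK_GATHER_LOAD, and otherwise the plain GATHER_LOAD is
used.  A tiny standalone model of that selection, with made-up names rather
than the real IFN machinery:

  #include <cassert>

  // Stand-ins for the three gather internal functions chosen above.
  enum class gather_kind { mask_len_gather, mask_gather, plain_gather };

  // Model of the selection: a length implies the mask+len variant
  // (the hunk above synthesizes an all-ones mask when none exists),
  // a mask alone implies the masked variant, else the plain one.
  static gather_kind
  select_gather (bool have_len, bool have_mask)
  {
    if (have_len && have_mask)
      return gather_kind::mask_len_gather;
    if (have_mask)
      return gather_kind::mask_gather;
    return gather_kind::plain_gather;
  }

  int
  main ()
  {
    assert (select_gather (true, true) == gather_kind::mask_len_gather);
    assert (select_gather (false, true) == gather_kind::mask_gather);
    assert (select_gather (false, false) == gather_kind::plain_gather);
    return 0;
  }
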
> >> +               else if (memory_access_type == VMAT_GATHER_SCATTER)
> >> +                 {
> >> +                   /* Emulated gather-scatter.  */
> >> +                   gcc_assert (!final_mask);
> >> +                   unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> >> +                   if (costing_p)
> >>                       {
> >> -                       opt_machine_mode new_ovmode
> >> -                         = get_len_load_store_mode (vmode, true,
> >> -                                                    &partial_ifn);
> >> -                       new_vmode = new_ovmode.require ();
> >> -                       unsigned factor = (new_ovmode == vmode)
> >> -                                           ? 1
> >> -                                           : GET_MODE_UNIT_SIZE (vmode);
> >> -                       final_len
> >> -                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> >> -                                              vec_num * ncopies, vectype,
> >> -                                              vec_num * j + i, factor);
> >> +                       /* For emulated gathers N offset vector element
> >> +                          offset add is consumed by the load).  */
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, const_nunits,
> >> +                                             vec_to_scalar, stmt_info, 0,
> >> +                                             vect_body);
> >> +                       /* N scalar loads plus gathering them into a
> >> +                          vector.  */
> >> +                       inside_cost = record_stmt_cost (cost_vec, const_nunits,
> >> +                                                       scalar_load, stmt_info,
> >> +                                                       0, vect_body);
> >> +                       inside_cost
> >> +                         = record_stmt_cost (cost_vec, 1, vec_construct,
> >> +                                             stmt_info, 0, vect_body);
> >> +                       break;
> >>                       }
> >> -                   else if (final_mask)
> >> +                   unsigned HOST_WIDE_INT const_offset_nunits
> >> +                     = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> >> +                         .to_constant ();
> >> +                   vec<constructor_elt, va_gc> *ctor_elts;
> >> +                   vec_alloc (ctor_elts, const_nunits);
> >> +                   gimple_seq stmts = NULL;
> >> +                   /* We support offset vectors with more elements
> >> +                      than the data vector for now.  */
> >> +                   unsigned HOST_WIDE_INT factor
> >> +                     = const_offset_nunits / const_nunits;
> >> +                   vec_offset = vec_offsets[j / factor];
> >> +                   unsigned elt_offset = (j % factor) * const_nunits;
> >> +                   tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> >> +                   tree scale = size_int (gs_info.scale);
> >> +                   align = get_object_alignment (DR_REF (first_dr_info->dr));
> >> +                   tree ltype
> >> +                     = build_aligned_type (TREE_TYPE (vectype), align);
> >> +                   for (unsigned k = 0; k < const_nunits; ++k)
> >>                       {
> >> -                       if (!can_vec_mask_load_store_p (
> >> -                             vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
> >> -                             &partial_ifn))
> >> -                         gcc_unreachable ();
> >> +                       tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> >> +                                               bitsize_int (k + elt_offset));
> >> +                       tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> >> +                                                idx_type, vec_offset,
> >> +                                                TYPE_SIZE (idx_type), boff);
> >> +                       idx = gimple_convert (&stmts, sizetype, idx);
> >> +                       idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> >> +                                           scale);
> >> +                       tree ptr = gimple_build (&stmts, PLUS_EXPR,
> >> +                                                TREE_TYPE (dataref_ptr),
> >> +                                                dataref_ptr, idx);
> >> +                       ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> >> +                       tree elt = make_ssa_name (TREE_TYPE (vectype));
> >> +                       tree ref = build2 (MEM_REF, ltype, ptr,
> >> +                                          build_int_cst (ref_type, 0));
> >> +                       new_stmt = gimple_build_assign (elt, ref);
> >> +                       gimple_set_vuse (new_stmt,
> >> +                                        gimple_vuse (gsi_stmt (*gsi)));
> >> +                       gimple_seq_add_stmt (&stmts, new_stmt);
> >> +                       CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> >>                       }
> >> +                   gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> >> +                   new_stmt = gimple_build_assign (
> >> +                     NULL_TREE, build_constructor (vectype, ctor_elts));
> >> +                   data_ref = NULL_TREE;
> >> +                   break;
> >> +                 }
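
The emulated-gather branch above has no hardware gather to call, so it
builds the vector one lane at a time: BIT_FIELD_REF extracts lane k of the
offset vector, the offset is scaled and added to the data pointer, a scalar
MEM_REF loads the element, and the lanes are collected into a CONSTRUCTOR.
A self-contained sketch of the same per-lane loop over plain arrays (purely
illustrative, none of this is the vectorizer code):

  #include <cstddef>
  #include <cstdint>
  #include <iostream>
  #include <vector>

  // Emulated gather: each offset is scaled to bytes, added to the base
  // address, and a scalar element is loaded; the results are then
  // "gathered" into the output vector (the CONSTRUCTOR in the hunk).
  static std::vector<float>
  emulated_gather (const float *base, const std::vector<int32_t> &offsets,
                   std::ptrdiff_t scale)
  {
    std::vector<float> result;
    result.reserve (offsets.size ());
    const char *base_bytes = reinterpret_cast<const char *> (base);
    for (int32_t off : offsets)
      {
        const char *p
          = base_bytes + static_cast<std::ptrdiff_t> (off) * scale;
        result.push_back (*reinterpret_cast<const float *> (p));
      }
    return result;
  }

  int
  main ()
  {
    float data[8] = {0, 10, 20, 30, 40, 50, 60, 70};
    // Offsets are in elements; scale = sizeof (float) makes them bytes.
    std::vector<int32_t> idx = {6, 1, 3, 0};
    for (float v : emulated_gather (data, idx, sizeof (float)))
      std::cout << v << ' ';
    std::cout << '\n';   // prints: 60 10 30 0
  }
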
> >>
> >> -                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +               if (costing_p)
> >> +                 break;
> >> +
> >> +               align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
> >> +               if (alignment_support_scheme == dr_aligned)
> >> +                 misalign = 0;
> >> +               else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
> >> +                 {
> >> +                   align
> >> +                     = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
> >> +                   misalign = 0;
> >> +                 }
> >> +               else
> >> +                 misalign = misalignment;
> >> +               if (dataref_offset == NULL_TREE
> >> +                   && TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
> >> +                                         misalign);
> >> +               align = least_bit_hwi (misalign | align);
> >> +
> >> +               /* Compute IFN when LOOP_LENS or final_mask valid.  */
> >> +               machine_mode vmode = TYPE_MODE (vectype);
> >> +               machine_mode new_vmode = vmode;
> >> +               internal_fn partial_ifn = IFN_LAST;
> >> +               if (loop_lens)
> >> +                 {
> >> +                   opt_machine_mode new_ovmode
> >> +                     = get_len_load_store_mode (vmode, true, &partial_ifn);
> >> +                   new_vmode = new_ovmode.require ();
> >> +                   unsigned factor
> >> +                     = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
> >> +                   final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> >> +                                                  vec_num * ncopies, vectype,
> >> +                                                  vec_num * j + i, factor);
> >> +                 }
> >> +               else if (final_mask)
> >> +                 {
> >> +                   if (!can_vec_mask_load_store_p (
> >> +                         vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
> >> +                         &partial_ifn))
> >> +                     gcc_unreachable ();
> >> +                 }
> >> +
> >> +               if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +                 {
> >> +                   if (!final_len)
> >>                       {
> >> -                       if (!final_len)
> >> -                         {
> >> -                           /* Pass VF value to 'len' argument of
> >> -                              MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> >> -                           final_len
> >> -                             = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >> -                         }
> >> -                       if (!final_mask)
> >> -                         {
> >> -                           /* Pass all ones value to 'mask' argument of
> >> -                              MASK_LEN_LOAD if final_mask is invalid.  */
> >> -                           mask_vectype = truth_type_for (vectype);
> >> -                           final_mask = build_minus_one_cst (mask_vectype);
> >> -                         }
> >> +                       /* Pass VF value to 'len' argument of
> >> +                          MASK_LEN_LOAD if LOOP_LENS is invalid.  */
> >> +                       final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >>                       }
> >> -                   if (final_len)
> >> +                   if (!final_mask)
> >>                       {
> >> -                       signed char biasval
> >> -                         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> >> -
> >> -                       bias = build_int_cst (intQI_type_node, biasval);
> >> +                       /* Pass all ones value to 'mask' argument of
> >> +                          MASK_LEN_LOAD if final_mask is invalid.  */
> >> +                       mask_vectype = truth_type_for (vectype);
> >> +                       final_mask = build_minus_one_cst (mask_vectype);
> >>                       }
> >> +                 }
> >> +               if (final_len)
> >> +                 {
> >> +                   signed char biasval
> >> +                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> >>
> >> -                   if (final_len)
> >> +                   bias = build_int_cst (intQI_type_node, biasval);
> >> +                 }
> >> +
> >> +               if (final_len)
> >> +                 {
> >> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> >> +                   gcall *call;
> >> +                   if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> +                     call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
> >> +                                                        dataref_ptr, ptr,
> >> +                                                        final_mask, final_len,
> >> +                                                        bias);
> >> +                   else
> >> +                     call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> >> +                                                        dataref_ptr, ptr,
> >> +                                                        final_len, bias);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +
> >> +                   /* Need conversion if it's wrapped with VnQI.  */
> >> +                   if (vmode != new_vmode)
> >>                       {
> >> -                       tree ptr
> >> -                         = build_int_cst (ref_type, align * BITS_PER_UNIT);
> >> -                       gcall *call;
> >> -                       if (partial_ifn == IFN_MASK_LEN_LOAD)
> >> -                         call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
> >> -                                                            5, dataref_ptr,
> >> -                                                            ptr, final_mask,
> >> -                                                            final_len, bias);
> >> -                       else
> >> -                         call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
> >> -                                                            dataref_ptr, ptr,
> >> -                                                            final_len, bias);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> -
> >> -                       /* Need conversion if it's wrapped with VnQI.  */
> >> -                       if (vmode != new_vmode)
> >> -                         {
> >> -                           tree new_vtype = build_vector_type_for_mode (
> >> -                             unsigned_intQI_type_node, new_vmode);
> >> -                           tree var = vect_get_new_ssa_name (new_vtype,
> >> -                                                             vect_simple_var);
> >> -                           gimple_set_lhs (call, var);
> >> -                           vect_finish_stmt_generation (vinfo, stmt_info, call,
> >> -                                                        gsi);
> >> -                           tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
> >> -                           new_stmt
> >> -                             = gimple_build_assign (vec_dest,
> >> -                                                    VIEW_CONVERT_EXPR, op);
> >> -                         }
> >> +                       tree new_vtype = build_vector_type_for_mode (
> >> +                         unsigned_intQI_type_node, new_vmode);
> >> +                       tree var
> >> +                         = vect_get_new_ssa_name (new_vtype, vect_simple_var);
> >> +                       gimple_set_lhs (call, var);
> >> +                       vect_finish_stmt_generation (vinfo, stmt_info, call,
> >> +                                                    gsi);
> >> +                       tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
> >> +                       new_stmt = gimple_build_assign (vec_dest,
> >> +                                                       VIEW_CONVERT_EXPR, op);
> >>                       }
> >> -                   else if (final_mask)
> >> +                 }
> >> +               else if (final_mask)
> >> +                 {
> >> +                   tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
> >> +                   gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> >> +                                                             dataref_ptr, ptr,
> >> +                                                             final_mask);
> >> +                   gimple_call_set_nothrow (call, true);
> >> +                   new_stmt = call;
> >> +                   data_ref = NULL_TREE;
> >> +                 }
> >> +               else
> >> +                 {
> >> +                   tree ltype = vectype;
> >> +                   tree new_vtype = NULL_TREE;
> >> +                   unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
> >> +                   unsigned int vect_align
> >> +                     = vect_known_alignment_in_bytes (first_dr_info, vectype);
> >> +                   unsigned int scalar_dr_size
> >> +                     = vect_get_scalar_dr_size (first_dr_info);
> >> +                   /* If there's no peeling for gaps but we have a gap
> >> +                      with slp loads then load the lower half of the
> >> +                      vector only.  See get_group_load_store_type for
> >> +                      when we apply this optimization.  */
> >> +                   if (slp
> >> +                       && loop_vinfo
> >> +                       && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
> >> +                       && known_eq (nunits, (group_size - gap) * 2)
> >> +                       && known_eq (nunits, group_size)
> >> +                       && gap >= (vect_align / scalar_dr_size))
> >>                       {
> >> -                       tree ptr = build_int_cst (ref_type,
> >> -                                                 align * BITS_PER_UNIT);
> >> -                       gcall *call
> >> -                         = gimple_build_call_internal (IFN_MASK_LOAD, 3,
> >> -                                                       dataref_ptr, ptr,
> >> -                                                       final_mask);
> >> -                       gimple_call_set_nothrow (call, true);
> >> -                       new_stmt = call;
> >> -                       data_ref = NULL_TREE;
> >> +                       tree half_vtype;
> >> +                       new_vtype
> >> +                         = vector_vector_composition_type (vectype, 2,
> >> +                                                           &half_vtype);
> >> +                       if (new_vtype != NULL_TREE)
> >> +                         ltype = half_vtype;
> >>                       }
> >> +                   tree offset
> >> +                     = (dataref_offset ? dataref_offset
> >> +                                       : build_int_cst (ref_type, 0));
> >> +                   if (ltype != vectype
> >> +                       && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +                     {
> >> +                       unsigned HOST_WIDE_INT gap_offset
> >> +                         = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
> >> +                       tree gapcst = build_int_cst (ref_type, gap_offset);
> >> +                       offset = size_binop (PLUS_EXPR, offset, gapcst);
> >> +                     }
> >> +                   data_ref
> >> +                     = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
> >> +                   if (alignment_support_scheme == dr_aligned)
> >> +                     ;
> >>                     else
> >> +                     TREE_TYPE (data_ref)
> >> +                       = build_aligned_type (TREE_TYPE (data_ref),
> >> +                                             align * BITS_PER_UNIT);
> >> +                   if (ltype != vectype)
> >>                       {
> >> -                       tree ltype = vectype;
> >> -                       tree new_vtype = NULL_TREE;
> >> -                       unsigned HOST_WIDE_INT gap
> >> -                         = DR_GROUP_GAP (first_stmt_info);
> >> -                       unsigned int vect_align
> >> -                         = vect_known_alignment_in_bytes (first_dr_info,
> >> -                                                          vectype);
> >> -                       unsigned int scalar_dr_size
> >> -                         = vect_get_scalar_dr_size (first_dr_info);
> >> -                       /* If there's no peeling for gaps but we have a gap
> >> -                          with slp loads then load the lower half of the
> >> -                          vector only.  See get_group_load_store_type for
> >> -                          when we apply this optimization.  */
> >> -                       if (slp
> >> -                           && loop_vinfo
> >> -                           && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> >> -                           && gap != 0
> >> -                           && known_eq (nunits, (group_size - gap) * 2)
> >> -                           && known_eq (nunits, group_size)
> >> -                           && gap >= (vect_align / scalar_dr_size))
> >> +                       vect_copy_ref_info (data_ref,
> >> +                                           DR_REF (first_dr_info->dr));
> >> +                       tree tem = make_ssa_name (ltype);
> >> +                       new_stmt = gimple_build_assign (tem, data_ref);
> >> +                       vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
> >> +                                                    gsi);
> >> +                       data_ref = NULL;
> >> +                       vec<constructor_elt, va_gc> *v;
> >> +                       vec_alloc (v, 2);
> >> +                       if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >>                           {
> >> -                           tree half_vtype;
> >> -                           new_vtype
> >> -                             = vector_vector_composition_type (vectype, 2,
> >> -                                                               &half_vtype);
> >> -                           if (new_vtype != NULL_TREE)
> >> -                             ltype = half_vtype;
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> +                                                   build_zero_cst (ltype));
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >>                           }
> >> -                       tree offset
> >> -                         = (dataref_offset ? dataref_offset
> >> -                                           : build_int_cst (ref_type, 0));
> >> -                       if (ltype != vectype
> >> -                           && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +                       else
> >>                           {
> >> -                           unsigned HOST_WIDE_INT gap_offset
> >> -                             = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
> >> -                           tree gapcst = build_int_cst (ref_type, gap_offset);
> >> -                           offset = size_binop (PLUS_EXPR, offset, gapcst);
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> +                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> +                                                   build_zero_cst (ltype));
> >>                           }
> >> -                       data_ref
> >> -                         = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
> >> -                       if (alignment_support_scheme == dr_aligned)
> >> -                         ;
> >> +                       gcc_assert (new_vtype != NULL_TREE);
> >> +                       if (new_vtype == vectype)
> >> +                         new_stmt = gimple_build_assign (
> >> +                           vec_dest, build_constructor (vectype, v));
> >>                         else
> >> -                         TREE_TYPE (data_ref)
> >> -                           = build_aligned_type (TREE_TYPE (data_ref),
> >> -                                                 align * BITS_PER_UNIT);
> >> -                       if (ltype != vectype)
> >>                           {
> >> -                           vect_copy_ref_info (data_ref,
> >> -                                               DR_REF (first_dr_info->dr));
> >> -                           tree tem = make_ssa_name (ltype);
> >> -                           new_stmt = gimple_build_assign (tem, data_ref);
> >> +                           tree new_vname = make_ssa_name (new_vtype);
> >> +                           new_stmt = gimple_build_assign (
> >> +                             new_vname, build_constructor (new_vtype, v));
> >>                             vect_finish_stmt_generation (vinfo, stmt_info,
> >>                                                          new_stmt, gsi);
> >> -                           data_ref = NULL;
> >> -                           vec<constructor_elt, va_gc> *v;
> >> -                           vec_alloc (v, 2);
> >> -                           if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> -                             {
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> -                                                       build_zero_cst (ltype));
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> -                             }
> >> -                           else
> >> -                             {
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
> >> -                               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
> >> -                                                       build_zero_cst (ltype));
> >> -                             }
> >> -                           gcc_assert (new_vtype != NULL_TREE);
> >> -                           if (new_vtype == vectype)
> >> -                             new_stmt = gimple_build_assign (
> >> -                               vec_dest, build_constructor (vectype, v));
> >> -                           else
> >> -                             {
> >> -                               tree new_vname = make_ssa_name (new_vtype);
> >> -                               new_stmt = gimple_build_assign (
> >> -                                 new_vname, build_constructor (new_vtype, v));
> >> -                               vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                            new_stmt, gsi);
> >> -                               new_stmt = gimple_build_assign (
> >> -                                 vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
> >> -                                                   new_vname));
> >> -                             }
> >> +                           new_stmt = gimple_build_assign (
> >> +                             vec_dest,
> >> +                             build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
> >>                           }
> >>                       }
> >> -                   break;
> >>                   }
> >> -               case dr_explicit_realign:
> >> -                 {
> >> -                   if (costing_p)
> >> -                     break;
> >> -                   tree ptr, bump;
> >> -
> >> -                   tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >> +               break;
> >> +             }
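
To summarize the dr_aligned/dr_unaligned_supported case above: with a loop
length or a mask in play it emits MASK_LEN_LOAD/LEN_LOAD (length plus the
target's load/store bias, with a VIEW_CONVERT back from the VnQI wrapper
mode when needed) or MASK_LOAD, and only otherwise a plain vector MEM_REF,
with the lower-half trick for a trailing group gap.  A rough scalar model
of what a length-plus-bias controlled load does, under the assumption that
inactive lanes simply stay zero (the names below are illustrative only):

  #include <array>
  #include <cstddef>
  #include <iostream>

  // Scalar model of a length-controlled vector load: only the first
  // (len + bias) lanes are read from memory; the rest stay zero here.
  // The bias is a small per-target constant (0, or -1 on some targets).
  template <std::size_t N>
  static std::array<int, N>
  len_load (const int *mem, long len, int bias)
  {
    std::array<int, N> v{};
    long active = len + bias;
    if (active > static_cast<long> (N))
      active = N;
    for (long i = 0; i < active; ++i)
      v[static_cast<std::size_t> (i)] = mem[i];
    return v;
  }

  int
  main ()
  {
    int mem[4] = {7, 8, 9, 10};
    // A loop-remainder iteration where only 3 of the 4 lanes are live.
    std::array<int, 4> v = len_load<4> (mem, 3, /*bias=*/0);
    for (int x : v)
      std::cout << x << ' ';
    std::cout << '\n';   // prints: 7 8 9 0
  }
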
> >> +           case dr_explicit_realign:
> >> +             {
> >> +               if (costing_p)
> >> +                 break;
> >> +               tree ptr, bump;
> >>
> >> -                   if (compute_in_loop)
> >> -                     msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
> >> -                                                   &realignment_token,
> >> -                                                   dr_explicit_realign,
> >> -                                                   dataref_ptr, NULL);
> >> +               tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> >>
> >> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     ptr = copy_ssa_name (dataref_ptr);
> >> -                   else
> >> -                     ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> -                   // For explicit realign the target alignment should be
> >> -                   // known at compile time.
> >> -                   unsigned HOST_WIDE_INT align =
> >> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> -                   new_stmt = gimple_build_assign
> >> -                                (ptr, BIT_AND_EXPR, dataref_ptr,
> >> -                                 build_int_cst
> >> -                                 (TREE_TYPE (dataref_ptr),
> >> -                                  -(HOST_WIDE_INT) align));
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, ptr,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> -                   vec_dest = vect_create_destination_var (scalar_dest,
> >> -                                                           vectype);
> >> -                   new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> -                   new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                   gimple_assign_set_lhs (new_stmt, new_temp);
> >> -                   gimple_move_vops (new_stmt, stmt_info->stmt);
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   msq = new_temp;
> >> -
> >> -                   bump = size_binop (MULT_EXPR, vs,
> >> -                                      TYPE_SIZE_UNIT (elem_type));
> >> -                   bump = size_binop (MINUS_EXPR, bump, size_one_node);
> >> -                   ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
> >> -                                          stmt_info, bump);
> >> -                   new_stmt = gimple_build_assign
> >> -                                (NULL_TREE, BIT_AND_EXPR, ptr,
> >> -                                 build_int_cst
> >> -                                 (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
> >> -                   if (TREE_CODE (ptr) == SSA_NAME)
> >> -                     ptr = copy_ssa_name (ptr, new_stmt);
> >> -                   else
> >> -                     ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> >> -                   gimple_assign_set_lhs (new_stmt, ptr);
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, ptr,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   break;
> >> -                 }
> >> -               case dr_explicit_realign_optimized:
> >> -                 {
> >> -                   if (costing_p)
> >> -                     break;
> >> -                   if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> -                     new_temp = copy_ssa_name (dataref_ptr);
> >> -                   else
> >> -                     new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> -                   // We should only be doing this if we know the target
> >> -                   // alignment at compile time.
> >> -                   unsigned HOST_WIDE_INT align =
> >> -                     DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> -                   new_stmt = gimple_build_assign
> >> -                     (new_temp, BIT_AND_EXPR, dataref_ptr,
> >> -                      build_int_cst (TREE_TYPE (dataref_ptr),
> >> -                                    -(HOST_WIDE_INT) align));
> >> -                   vect_finish_stmt_generation (vinfo, stmt_info,
> >> -                                                new_stmt, gsi);
> >> -                   data_ref
> >> -                     = build2 (MEM_REF, vectype, new_temp,
> >> -                               build_int_cst (ref_type, 0));
> >> -                   break;
> >> -                 }
> >> -               default:
> >> -                 gcc_unreachable ();
> >> -               }
> >> +               if (compute_in_loop)
> >> +                 msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
> >> +                                               &realignment_token,
> >> +                                               dr_explicit_realign,
> >> +                                               dataref_ptr, NULL);
> >> +
> >> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 ptr = copy_ssa_name (dataref_ptr);
> >> +               else
> >> +                 ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> +               // For explicit realign the target alignment should be
> >> +               // known at compile time.
> >> +               unsigned HOST_WIDE_INT align
> >> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> +               new_stmt = gimple_build_assign (
> >> +                 ptr, BIT_AND_EXPR, dataref_ptr,
> >> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> >> +                                -(HOST_WIDE_INT) align));
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +               data_ref
> >> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
> >> +               vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> +               vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> +               new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> +               new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +               gimple_assign_set_lhs (new_stmt, new_temp);
> >> +               gimple_move_vops (new_stmt, stmt_info->stmt);
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +               msq = new_temp;
> >> +
> >> +               bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
> >> +               bump = size_binop (MINUS_EXPR, bump, size_one_node);
> >> +               ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
> >> +                                      bump);
> >> +               new_stmt = gimple_build_assign (
> >> +                 NULL_TREE, BIT_AND_EXPR, ptr,
> >> +                 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
> >> +               if (TREE_CODE (ptr) == SSA_NAME)
> >> +                 ptr = copy_ssa_name (ptr, new_stmt);
> >> +               else
> >> +                 ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
> >> +               gimple_assign_set_lhs (new_stmt, ptr);
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +               data_ref
> >> +                 = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
> >> +               break;
> >> +             }
> >> +           case dr_explicit_realign_optimized:
> >> +             {
> >> +               if (costing_p)
> >> +                 break;
> >> +               if (TREE_CODE (dataref_ptr) == SSA_NAME)
> >> +                 new_temp = copy_ssa_name (dataref_ptr);
> >> +               else
> >> +                 new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
> >> +               // We should only be doing this if we know the target
> >> +               // alignment at compile time.
> >> +               unsigned HOST_WIDE_INT align
> >> +                 = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
> >> +               new_stmt = gimple_build_assign (
> >> +                 new_temp, BIT_AND_EXPR, dataref_ptr,
> >> +                 build_int_cst (TREE_TYPE (dataref_ptr),
> >> +                                -(HOST_WIDE_INT) align));
> >> +               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +               data_ref = build2 (MEM_REF, vectype, new_temp,
> >> +                                  build_int_cst (ref_type, 0));
> >> +               break;
> >> +             }
> >> +           default:
> >> +             gcc_unreachable ();
> >> +           }
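
Both explicit-realign cases in the switch rely on the same address trick:
AND-ing the pointer with -align (the BIT_AND_EXPR against
-(HOST_WIDE_INT) align) rounds it down to the previous alignment boundary,
so the subsequent MEM_REF is a fully aligned vector load.  A tiny
standalone illustration of that rounding:

  #include <cassert>
  #include <cstdint>
  #include <iostream>

  // Round an address down to a power-of-two alignment by masking with
  // the negated alignment, mirroring the BIT_AND_EXPR above.
  static std::uintptr_t
  align_down (std::uintptr_t addr, std::uintptr_t align)
  {
    assert (align != 0 && (align & (align - 1)) == 0);   // power of two
    return addr & -align;
  }

  int
  main ()
  {
    std::cout << std::hex
              << align_down (0x1003, 16) << '\n'    // 1000
              << align_down (0x1010, 16) << '\n';   // 1010 (already aligned)
  }
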
> >>
> >> -             /* One common place to cost the above vect load for different
> >> -                alignment support schemes.  */
> >> -             if (costing_p)
> >> -               {
> >> -                 /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> >> -                    only need to take care of the first stmt, whose
> >> -                    stmt_info is first_stmt_info, vec_num iterating on it
> >> -                    will cover the cost for the remaining, it's consistent
> >> -                    with transforming.  For the prologue cost for realign,
> >> -                    we only need to count it once for the whole group.  */
> >> -                 bool first_stmt_info_p = first_stmt_info == stmt_info;
> >> -                 bool add_realign_cost = first_stmt_info_p && i == 0;
> >> -                 if (memory_access_type == VMAT_CONTIGUOUS
> >> -                     || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> >> -                     || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> >> -                         && (!grouped_load || first_stmt_info_p)))
> >> -                   vect_get_load_cost (vinfo, stmt_info, 1,
> >> -                                       alignment_support_scheme, misalignment,
> >> -                                       add_realign_cost, &inside_cost,
> >> -                                       &prologue_cost, cost_vec, cost_vec,
> >> -                                       true);
> >> -               }
> >> -             else
> >> +         /* One common place to cost the above vect load for different
> >> +            alignment support schemes.  */
> >> +         if (costing_p)
> >> +           {
> >> +             /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
> >> +                only need to take care of the first stmt, whose
> >> +                stmt_info is first_stmt_info, vec_num iterating on it
> >> +                will cover the cost for the remaining, it's consistent
> >> +                with transforming.  For the prologue cost for realign,
> >> +                we only need to count it once for the whole group.  */
> >> +             bool first_stmt_info_p = first_stmt_info == stmt_info;
> >> +             bool add_realign_cost = first_stmt_info_p && i == 0;
> >> +             if (memory_access_type == VMAT_CONTIGUOUS
> >> +                 || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> >> +                 || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> >> +                     && (!grouped_load || first_stmt_info_p)))
> >> +               vect_get_load_cost (vinfo, stmt_info, 1,
> >> +                                   alignment_support_scheme, misalignment,
> >> +                                   add_realign_cost, &inside_cost,
> >> +                                   &prologue_cost, cost_vec, cost_vec, true);
> >> +           }
> >> +         else
> >> +           {
> >> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> +             /* DATA_REF is null if we've already built the statement.  */
> >> +             if (data_ref)
> >>                 {
> >> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> -                 /* DATA_REF is null if we've already built the statement.  */
> >> -                 if (data_ref)
> >> -                   {
> >> -                     vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> -                     new_stmt = gimple_build_assign (vec_dest, data_ref);
> >> -                   }
> >> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                 gimple_set_lhs (new_stmt, new_temp);
> >> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> >> +                 new_stmt = gimple_build_assign (vec_dest, data_ref);
> >>                 }
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +           }
> >>
> >> -             /* 3. Handle explicit realignment if necessary/supported.
> >> -                Create in loop:
> >> -                  vec_dest = realign_load (msq, lsq, realignment_token)  */
> >> -             if (!costing_p
> >> -                 && (alignment_support_scheme == dr_explicit_realign_optimized
> >> -                     || alignment_support_scheme == dr_explicit_realign))
> >> -               {
> >> -                 lsq = gimple_assign_lhs (new_stmt);
> >> -                 if (!realignment_token)
> >> -                   realignment_token = dataref_ptr;
> >> -                 vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> -                 new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
> >> -                                                 msq, lsq, realignment_token);
> >> -                 new_temp = make_ssa_name (vec_dest, new_stmt);
> >> -                 gimple_assign_set_lhs (new_stmt, new_temp);
> >> -                 vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >> +         /* 3. Handle explicit realignment if necessary/supported.
> >> +            Create in loop:
> >> +              vec_dest = realign_load (msq, lsq, realignment_token)  */
> >> +         if (!costing_p
> >> +             && (alignment_support_scheme == dr_explicit_realign_optimized
> >> +                 || alignment_support_scheme == dr_explicit_realign))
> >> +           {
> >> +             lsq = gimple_assign_lhs (new_stmt);
> >> +             if (!realignment_token)
> >> +               realignment_token = dataref_ptr;
> >> +             vec_dest = vect_create_destination_var (scalar_dest, vectype);
> >> +             new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
> >> +                                             lsq, realignment_token);
> >> +             new_temp = make_ssa_name (vec_dest, new_stmt);
> >> +             gimple_assign_set_lhs (new_stmt, new_temp);
> >> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >>
> >> -                 if (alignment_support_scheme == dr_explicit_realign_optimized)
> >> -                   {
> >> -                     gcc_assert (phi);
> >> -                     if (i == vec_num - 1 && j == ncopies - 1)
> >> -                       add_phi_arg (phi, lsq,
> >> -                                    loop_latch_edge (containing_loop),
> >> -                                    UNKNOWN_LOCATION);
> >> -                     msq = lsq;
> >> -                   }
> >> +             if (alignment_support_scheme == dr_explicit_realign_optimized)
> >> +               {
> >> +                 gcc_assert (phi);
> >> +                 if (i == vec_num - 1 && j == ncopies - 1)
> >> +                   add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
> >> +                                UNKNOWN_LOCATION);
> >> +                 msq = lsq;
> >>                 }
> >> +           }
> >>
> >> -             if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +         if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> >> +           {
> >> +             if (costing_p)
> >> +               inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> >> +                                               stmt_info, 0, vect_body);
> >> +             else
> >>                 {
> >> -                 if (costing_p)
> >> -                   inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
> >> -                                                   stmt_info, 0, vect_body);
> >> -                 else
> >> -                   {
> >> -                     tree perm_mask = perm_mask_for_reverse (vectype);
> >> -                     new_temp
> >> -                       = permute_vec_elements (vinfo, new_temp, new_temp,
> >> -                                               perm_mask, stmt_info, gsi);
> >> -                     new_stmt = SSA_NAME_DEF_STMT (new_temp);
> >> -                   }
> >> +                 tree perm_mask = perm_mask_for_reverse (vectype);
> >> +                 new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
> >> +                                                  perm_mask, stmt_info, gsi);
> >> +                 new_stmt = SSA_NAME_DEF_STMT (new_temp);
> >>                 }
> >> +           }
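
For VMAT_CONTIGUOUS_REVERSE the vector is loaded in memory order and then
reversed in the register; perm_mask_for_reverse supplies the
{ N-1, ..., 1, 0 } selector used by the permute.  Equivalent scalar sketch:

  #include <array>
  #include <cstddef>
  #include <iostream>

  int
  main ()
  {
    // The loaded vector, still in memory order.
    std::array<int, 4> v = {10, 11, 12, 13};
    // Apply the { N-1, ..., 1, 0 } reversal permutation.
    std::array<int, 4> r;
    for (std::size_t i = 0; i < v.size (); ++i)
      r[i] = v[v.size () - 1 - i];
    std::cout << r[0] << ' ' << r[1] << ' ' << r[2] << ' ' << r[3] << '\n';
    // prints: 13 12 11 10
  }
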
> >>
> >> -             /* Collect vector loads and later create their permutation in
> >> -                vect_transform_grouped_load ().  */
> >> -             if (!costing_p && (grouped_load || slp_perm))
> >> -               dr_chain.quick_push (new_temp);
> >> +         /* Collect vector loads and later create their permutation in
> >> +            vect_transform_grouped_load ().  */
> >> +         if (!costing_p && (grouped_load || slp_perm))
> >> +           dr_chain.quick_push (new_temp);
> >>
> >> -             /* Store vector loads in the corresponding SLP_NODE.  */
> >> -             if (!costing_p && slp && !slp_perm)
> >> -               slp_node->push_vec_def (new_stmt);
> >> +         /* Store vector loads in the corresponding SLP_NODE.  */
> >> +         if (!costing_p && slp && !slp_perm)
> >> +           slp_node->push_vec_def (new_stmt);
> >>
> >> -             /* With SLP permutation we load the gaps as well, without
> >> -                we need to skip the gaps after we manage to fully load
> >> -                all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> >> -             group_elt += nunits;
> >> -             if (!costing_p
> >> -                 && maybe_ne (group_gap_adj, 0U)
> >> -                 && !slp_perm
> >> -                 && known_eq (group_elt, group_size - group_gap_adj))
> >> -               {
> >> -                 poly_wide_int bump_val
> >> -                   = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> >> -                      * group_gap_adj);
> >> -                 if (tree_int_cst_sgn
> >> -                       (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> >> -                   bump_val = -bump_val;
> >> -                 tree bump = wide_int_to_tree (sizetype, bump_val);
> >> -                 dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> >> -                                                gsi, stmt_info, bump);
> >> -                 group_elt = 0;
> >> -               }
> >> -           }
> >> -         /* Bump the vector pointer to account for a gap or for excess
> >> -            elements loaded for a permuted SLP load.  */
> >> +         /* With SLP permutation we load the gaps as well, without
> >> +            we need to skip the gaps after we manage to fully load
> >> +            all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
> >> +         group_elt += nunits;
> >>           if (!costing_p
> >>               && maybe_ne (group_gap_adj, 0U)
> >> -             && slp_perm)
> >> +             && !slp_perm
> >> +             && known_eq (group_elt, group_size - group_gap_adj))
> >>             {
> >>               poly_wide_int bump_val
> >> -               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
> >> -                  * group_gap_adj);
> >> -             if (tree_int_cst_sgn
> >> -                   (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> >> +               = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> >> +             if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
> >> +                 == -1)
> >>                 bump_val = -bump_val;
> >>               tree bump = wide_int_to_tree (sizetype, bump_val);
> >>               dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> >>                                              stmt_info, bump);
> >> +             group_elt = 0;
> >>             }
> >>         }
> >> +      /* Bump the vector pointer to account for a gap or for excess
> >> +        elements loaded for a permuted SLP load.  */
> >> +      if (!costing_p
> >> +         && maybe_ne (group_gap_adj, 0U)
> >> +         && slp_perm)
> >> +       {
> >> +         poly_wide_int bump_val
> >> +           = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> >> +         if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
> >> +           bump_val = -bump_val;
> >> +         tree bump = wide_int_to_tree (sizetype, bump_val);
> >> +         dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> >> +                                        stmt_info, bump);
> >> +       }
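
The two bump blocks above skip the gap elements (or the excess elements
loaded for a permuted SLP group) by advancing the data pointer by
group_gap_adj elements, with the byte amount negated when the access runs
with a negative step.  Numerically, for illustration:

  #include <cstdint>
  #include <iostream>

  // Byte adjustment for skipping gap_elems elements of elem_size bytes,
  // in the direction of the access step.
  static std::int64_t
  gap_bump_bytes (std::int64_t elem_size, std::int64_t gap_elems,
                  int step_sign)
  {
    std::int64_t bump = elem_size * gap_elems;
    return step_sign < 0 ? -bump : bump;
  }

  int
  main ()
  {
    std::cout << gap_bump_bytes (4, 2, +1) << '\n';   // 8: forward step
    std::cout << gap_bump_bytes (4, 2, -1) << '\n';   // -8: negative step
  }
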
> >>
> >>        if (slp && !slp_perm)
> >>         continue;
> >> @@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
> >>             }
> >>         }
> >>        else
> >> -        {
> >> -          if (grouped_load)
> >> -           {
> >> -             if (memory_access_type != VMAT_LOAD_STORE_LANES)
> >> +       {
> >> +         if (grouped_load)
> >> +           {
> >> +             gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
> >> +             /* We assume that the cost of a single load-lanes instruction
> >> +                is equivalent to the cost of DR_GROUP_SIZE separate loads.
> >> +                If a grouped access is instead being provided by a
> >> +                load-and-permute operation, include the cost of the
> >> +                permutes.  */
> >> +             if (costing_p && first_stmt_info == stmt_info)
> >>                 {
> >> -                 gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
> >> -                 /* We assume that the cost of a single load-lanes instruction
> >> -                    is equivalent to the cost of DR_GROUP_SIZE separate loads.
> >> -                    If a grouped access is instead being provided by a
> >> -                    load-and-permute operation, include the cost of the
> >> -                    permutes.  */
> >> -                 if (costing_p && first_stmt_info == stmt_info)
> >> -                   {
> >> -                     /* Uses an even and odd extract operations or shuffle
> >> -                        operations for each needed permute.  */
> >> -                     int group_size = DR_GROUP_SIZE (first_stmt_info);
> >> -                     int nstmts = ceil_log2 (group_size) * group_size;
> >> -                     inside_cost
> >> -                       += record_stmt_cost (cost_vec, nstmts, vec_perm,
> >> -                                            stmt_info, 0, vect_body);
> >> +                 /* Uses an even and odd extract operations or shuffle
> >> +                    operations for each needed permute.  */
> >> +                 int group_size = DR_GROUP_SIZE (first_stmt_info);
> >> +                 int nstmts = ceil_log2 (group_size) * group_size;
> >> +                 inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
> >> +                                                  stmt_info, 0, vect_body);
> >>
> >> -                     if (dump_enabled_p ())
> >> -                       dump_printf_loc (
> >> -                         MSG_NOTE, vect_location,
> >> -                         "vect_model_load_cost: strided group_size = %d .\n",
> >> -                         group_size);
> >> -                   }
> >> -                 else if (!costing_p)
> >> -                   vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
> >> -                                                group_size, gsi);
> >> +                 if (dump_enabled_p ())
> >> +                   dump_printf_loc (MSG_NOTE, vect_location,
> >> +                                    "vect_model_load_cost:"
> >> +                                    "strided group_size = %d .\n",
> >> +                                    group_size);
> >> +               }
> >> +             else if (!costing_p)
> >> +               {
> >> +                 vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
> >> +                                              group_size, gsi);
> >> +                 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>                 }
> >> -             if (!costing_p)
> >> -               *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> >>             }
> >>           else if (!costing_p)
> >>             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> @@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
> >>      {
> >>        gcc_assert (memory_access_type != VMAT_INVARIANT
> >>                   && memory_access_type != VMAT_ELEMENTWISE
> >> -                 && memory_access_type != VMAT_STRIDED_SLP);
> >> +                 && memory_access_type != VMAT_STRIDED_SLP
> >> +                 && memory_access_type != VMAT_LOAD_STORE_LANES);
> >>        if (dump_enabled_p ())
> >>         dump_printf_loc (MSG_NOTE, vect_location,
> >>                          "vect_model_load_cost: inside_cost = %u, "
> >> --
> >> 2.31.1
> >
>
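
For background on the hoisted VMAT_LOAD_STORE_LANES path in the patch below: conceptually, a single IFN_LOAD_LANES (or IFN_MASK_LOAD_LANES when a mask is present) call loads the whole group's vectors in one go, de-interleaved into a vector array, and the new dedicated loop then pulls each vector back out into an SSA_NAME via read_vector_array.  A minimal scalar sketch of that de-interleaving, assuming a fixed group size of 2 and illustrative names (this is not code from the patch):

/* Illustrative only, not from the patch: scalar model of what a
   group-size-2 LOAD_LANES conceptually does.  One operation reads
   2 * nunits consecutive elements and splits them into two "lane"
   vectors; the vectorizer then extracts each lane as its own
   SSA_NAME.  */
void
load_lanes_2 (const int *src, int *lane0, int *lane1, int nunits)
{
  for (int i = 0; i < nunits; i++)
    {
      lane0[i] = src[2 * i];	 /* elements 0, 2, 4, ...  */
      lane1[i] = src[2 * i + 1]; /* elements 1, 3, 5, ...  */
    }
}

The masked variant additionally takes the per-element mask built from final_mask, as emitted in the patch when loop masking is active.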

Patch

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4f2d088484c..c361e16cb7b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10332,7 +10332,129 @@ vectorizable_load (vec_info *vinfo,
 	vect_get_vec_defs_for_operand (vinfo, stmt_info, ncopies, mask,
 				       &vec_masks, mask_vectype);
     }
+
   tree vec_mask = NULL_TREE;
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
+    {
+      gcc_assert (alignment_support_scheme == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported);
+      gcc_assert (grouped_load && !slp);
+
+      unsigned int inside_cost = 0, prologue_cost = 0;
+      for (j = 0; j < ncopies; j++)
+	{
+	  if (costing_p)
+	    {
+	      /* An IFN_LOAD_LANES will load all its vector results,
+		 regardless of which ones we actually need.  Account
+		 for the cost of unused results.  */
+	      if (first_stmt_info == stmt_info)
+		{
+		  unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
+		  stmt_vec_info next_stmt_info = first_stmt_info;
+		  do
+		    {
+		      gaps -= 1;
+		      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
+		    }
+		  while (next_stmt_info);
+		  if (gaps)
+		    {
+		      if (dump_enabled_p ())
+			dump_printf_loc (MSG_NOTE, vect_location,
+					 "vect_model_load_cost: %d "
+					 "unused vectors.\n",
+					 gaps);
+		      vect_get_load_cost (vinfo, stmt_info, gaps,
+					  alignment_support_scheme,
+					  misalignment, false, &inside_cost,
+					  &prologue_cost, cost_vec, cost_vec,
+					  true);
+		    }
+		}
+	      vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
+				  misalignment, false, &inside_cost,
+				  &prologue_cost, cost_vec, cost_vec, true);
+	      continue;
+	    }
+
+	  /* 1. Create the vector or array pointer update chain.  */
+	  if (j == 0)
+	    dataref_ptr
+	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+					  at_loop, offset, &dummy, gsi,
+					  &ptr_incr, false, bump);
+	  else
+	    {
+	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+	      dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+					     stmt_info, bump);
+	    }
+	  if (mask)
+	    vec_mask = vec_masks[j];
+
+	  tree vec_array = create_vector_array (vectype, vec_num);
+
+	  tree final_mask = NULL_TREE;
+	  if (loop_masks)
+	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					     ncopies, vectype, j);
+	  if (vec_mask)
+	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
+					   vec_mask, gsi);
+
+	  gcall *call;
+	  if (final_mask)
+	    {
+	      /* Emit:
+		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
+						VEC_MASK).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
+						 dataref_ptr, alias_ptr,
+						 final_mask);
+	    }
+	  else
+	    {
+	      /* Emit:
+		   VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
+	      data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
+	      call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
+	    }
+	  gimple_call_set_lhs (call, vec_array);
+	  gimple_call_set_nothrow (call, true);
+	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+
+	  dr_chain.create (vec_num);
+	  /* Extract each vector into an SSA_NAME.  */
+	  for (i = 0; i < vec_num; i++)
+	    {
+	      new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
+					    vec_array, i);
+	      dr_chain.quick_push (new_temp);
+	    }
+
+	  /* Record the mapping between SSA_NAMEs and statements.  */
+	  vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
+
+	  /* Record that VEC_ARRAY is now dead.  */
+	  vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
+
+	  dr_chain.release ();
+
+	  *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+	}
+
+      if (costing_p && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_model_load_cost: inside_cost = %u, "
+			 "prologue_cost = %u .\n",
+			 inside_cost, prologue_cost);
+
+      return true;
+    }
+
   poly_uint64 group_elt = 0;
   unsigned int inside_cost = 0, prologue_cost = 0;
   for (j = 0; j < ncopies; j++)
@@ -10414,685 +10538,558 @@ vectorizable_load (vec_info *vinfo,
 	dr_chain.create (vec_num);

       gimple *new_stmt = NULL;
-      if (memory_access_type == VMAT_LOAD_STORE_LANES)
+      for (i = 0; i < vec_num; i++)
 	{
-	  if (costing_p)
-	    {
-	      /* An IFN_LOAD_LANES will load all its vector results,
-		 regardless of which ones we actually need.  Account
-		 for the cost of unused results.  */
-	      if (grouped_load && first_stmt_info == stmt_info)
-		{
-		  unsigned int gaps = DR_GROUP_SIZE (first_stmt_info);
-		  stmt_vec_info next_stmt_info = first_stmt_info;
-		  do
-		    {
-		      gaps -= 1;
-		      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
-		    }
-		  while (next_stmt_info);
-		  if (gaps)
-		    {
-		      if (dump_enabled_p ())
-			dump_printf_loc (MSG_NOTE, vect_location,
-					 "vect_model_load_cost: %d "
-					 "unused vectors.\n",
-					 gaps);
-		      vect_get_load_cost (vinfo, stmt_info, gaps,
-					  alignment_support_scheme,
-					  misalignment, false, &inside_cost,
-					  &prologue_cost, cost_vec, cost_vec,
-					  true);
-		    }
-		}
-	      vect_get_load_cost (vinfo, stmt_info, 1, alignment_support_scheme,
-				  misalignment, false, &inside_cost,
-				  &prologue_cost, cost_vec, cost_vec, true);
-	      continue;
-	    }
-	  tree vec_array;
-
-	  vec_array = create_vector_array (vectype, vec_num);
-
 	  tree final_mask = NULL_TREE;
-	  if (loop_masks)
-	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					     ncopies, vectype, j);
-	  if (vec_mask)
-	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-					   final_mask, vec_mask, gsi);
-
-	  gcall *call;
-	  if (final_mask)
-	    {
-	      /* Emit:
-		   VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
-		                                VEC_MASK).  */
-	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
-	      tree alias_ptr = build_int_cst (ref_type, align);
-	      call = gimple_build_call_internal (IFN_MASK_LOAD_LANES, 3,
-						 dataref_ptr, alias_ptr,
-						 final_mask);
-	    }
-	  else
+	  tree final_len = NULL_TREE;
+	  tree bias = NULL_TREE;
+	  if (!costing_p)
 	    {
-	      /* Emit:
-		   VEC_ARRAY = LOAD_LANES (MEM_REF[...all elements...]).  */
-	      data_ref = create_array_ref (aggr_type, dataref_ptr, ref_type);
-	      call = gimple_build_call_internal (IFN_LOAD_LANES, 1, data_ref);
-	    }
-	  gimple_call_set_lhs (call, vec_array);
-	  gimple_call_set_nothrow (call, true);
-	  vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
-	  new_stmt = call;
+	      if (loop_masks)
+		final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i);
+	      if (vec_mask)
+		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					       final_mask, vec_mask, gsi);

-	  /* Extract each vector into an SSA_NAME.  */
-	  for (i = 0; i < vec_num; i++)
-	    {
-	      new_temp = read_vector_array (vinfo, stmt_info, gsi, scalar_dest,
-					    vec_array, i);
-	      dr_chain.quick_push (new_temp);
+	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+					       gsi, stmt_info, bump);
 	    }

-	  /* Record the mapping between SSA_NAMEs and statements.  */
-	  vect_record_grouped_load_vectors (vinfo, stmt_info, dr_chain);
-
-	  /* Record that VEC_ARRAY is now dead.  */
-	  vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
-	}
-      else
-	{
-	  for (i = 0; i < vec_num; i++)
+	  /* 2. Create the vector-load in the loop.  */
+	  switch (alignment_support_scheme)
 	    {
-	      tree final_mask = NULL_TREE;
-	      tree final_len = NULL_TREE;
-	      tree bias = NULL_TREE;
-	      if (!costing_p)
-		{
-		  if (loop_masks)
-		    final_mask
-		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					    vec_num * ncopies, vectype,
-					    vec_num * j + i);
-		  if (vec_mask)
-		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-						   final_mask, vec_mask, gsi);
-
-		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-						   gsi, stmt_info, bump);
-		}
+	    case dr_aligned:
+	    case dr_unaligned_supported:
+	      {
+		unsigned int misalign;
+		unsigned HOST_WIDE_INT align;

-	      /* 2. Create the vector-load in the loop.  */
-	      switch (alignment_support_scheme)
-		{
-		case dr_aligned:
-		case dr_unaligned_supported:
+		if (memory_access_type == VMAT_GATHER_SCATTER
+		    && gs_info.ifn != IFN_LAST)
 		  {
-		    unsigned int misalign;
-		    unsigned HOST_WIDE_INT align;
-
-		    if (memory_access_type == VMAT_GATHER_SCATTER
-			&& gs_info.ifn != IFN_LAST)
+		    if (costing_p)
 		      {
-			if (costing_p)
-			  {
-			    unsigned int cnunits
-			      = vect_nunits_for_cost (vectype);
-			    inside_cost
-			      = record_stmt_cost (cost_vec, cnunits,
-						  scalar_load, stmt_info, 0,
-						  vect_body);
-			    break;
-			  }
-			if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-			  vec_offset = vec_offsets[vec_num * j + i];
-			tree zero = build_zero_cst (vectype);
-			tree scale = size_int (gs_info.scale);
-
-			if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
-			  {
-			    if (loop_lens)
-			      final_len
-				= vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-						     vec_num * ncopies, vectype,
-						     vec_num * j + i, 1);
-			    else
-			      final_len = build_int_cst (sizetype,
-							 TYPE_VECTOR_SUBPARTS (
-							   vectype));
-			    signed char biasval
-			      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-			    bias = build_int_cst (intQI_type_node, biasval);
-			    if (!final_mask)
-			      {
-				mask_vectype = truth_type_for (vectype);
-				final_mask = build_minus_one_cst (mask_vectype);
-			      }
-			  }
-
-			gcall *call;
-			if (final_len && final_mask)
-			  call = gimple_build_call_internal (
-			    IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr,
-			    vec_offset, scale, zero, final_mask, final_len,
-			    bias);
-			else if (final_mask)
-			  call = gimple_build_call_internal
-			    (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
-			     vec_offset, scale, zero, final_mask);
-			else
-			  call = gimple_build_call_internal
-			    (IFN_GATHER_LOAD, 4, dataref_ptr,
-			     vec_offset, scale, zero);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
+			unsigned int cnunits = vect_nunits_for_cost (vectype);
+			inside_cost
+			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
+					      stmt_info, 0, vect_body);
 			break;
 		      }
-		    else if (memory_access_type == VMAT_GATHER_SCATTER)
+		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		      vec_offset = vec_offsets[vec_num * j + i];
+		    tree zero = build_zero_cst (vectype);
+		    tree scale = size_int (gs_info.scale);
+
+		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
 		      {
-			/* Emulated gather-scatter.  */
-			gcc_assert (!final_mask);
-			unsigned HOST_WIDE_INT const_nunits
-			  = nunits.to_constant ();
-			if (costing_p)
-			  {
-			    /* For emulated gathers N offset vector element
-			       offset add is consumed by the load).  */
-			    inside_cost
-			      = record_stmt_cost (cost_vec, const_nunits,
-						  vec_to_scalar, stmt_info, 0,
-						  vect_body);
-			    /* N scalar loads plus gathering them into a
-			       vector.  */
-			    inside_cost
-			      = record_stmt_cost (cost_vec, const_nunits,
-						  scalar_load, stmt_info, 0,
-						  vect_body);
-			    inside_cost
-			      = record_stmt_cost (cost_vec, 1, vec_construct,
-						  stmt_info, 0, vect_body);
-			    break;
-			  }
-			unsigned HOST_WIDE_INT const_offset_nunits
-			  = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
-			      .to_constant ();
-			vec<constructor_elt, va_gc> *ctor_elts;
-			vec_alloc (ctor_elts, const_nunits);
-			gimple_seq stmts = NULL;
-			/* We support offset vectors with more elements
-			   than the data vector for now.  */
-			unsigned HOST_WIDE_INT factor
-			  = const_offset_nunits / const_nunits;
-			vec_offset = vec_offsets[j / factor];
-			unsigned elt_offset = (j % factor) * const_nunits;
-			tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
-			tree scale = size_int (gs_info.scale);
-			align
-			  = get_object_alignment (DR_REF (first_dr_info->dr));
-			tree ltype = build_aligned_type (TREE_TYPE (vectype),
-							 align);
-			for (unsigned k = 0; k < const_nunits; ++k)
+			if (loop_lens)
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			else
+			  final_len
+			    = build_int_cst (sizetype,
+					     TYPE_VECTOR_SUBPARTS (vectype));
+			signed char biasval
+			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+			bias = build_int_cst (intQI_type_node, biasval);
+			if (!final_mask)
 			  {
-			    tree boff = size_binop (MULT_EXPR,
-						    TYPE_SIZE (idx_type),
-						    bitsize_int
-						      (k + elt_offset));
-			    tree idx = gimple_build (&stmts, BIT_FIELD_REF,
-						     idx_type, vec_offset,
-						     TYPE_SIZE (idx_type),
-						     boff);
-			    idx = gimple_convert (&stmts, sizetype, idx);
-			    idx = gimple_build (&stmts, MULT_EXPR,
-						sizetype, idx, scale);
-			    tree ptr = gimple_build (&stmts, PLUS_EXPR,
-						     TREE_TYPE (dataref_ptr),
-						     dataref_ptr, idx);
-			    ptr = gimple_convert (&stmts, ptr_type_node, ptr);
-			    tree elt = make_ssa_name (TREE_TYPE (vectype));
-			    tree ref = build2 (MEM_REF, ltype, ptr,
-					       build_int_cst (ref_type, 0));
-			    new_stmt = gimple_build_assign (elt, ref);
-			    gimple_set_vuse (new_stmt,
-					     gimple_vuse (gsi_stmt (*gsi)));
-			    gimple_seq_add_stmt (&stmts, new_stmt);
-			    CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+			    mask_vectype = truth_type_for (vectype);
+			    final_mask = build_minus_one_cst (mask_vectype);
 			  }
-			gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
-			new_stmt = gimple_build_assign (NULL_TREE,
-							build_constructor
-							  (vectype, ctor_elts));
-			data_ref = NULL_TREE;
-			break;
 		      }

-		    if (costing_p)
-		      break;
-
-		    align =
-		      known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
-		    if (alignment_support_scheme == dr_aligned)
-		      misalign = 0;
-		    else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
-		      {
-			align = dr_alignment
-			  (vect_dr_behavior (vinfo, first_dr_info));
-			misalign = 0;
-		      }
+		    gcall *call;
+		    if (final_len && final_mask)
+		      call = gimple_build_call_internal (
+			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+			scale, zero, final_mask, final_len, bias);
+		    else if (final_mask)
+		      call
+			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
+						      dataref_ptr, vec_offset,
+						      scale, zero, final_mask);
 		    else
-		      misalign = misalignment;
-		    if (dataref_offset == NULL_TREE
-			&& TREE_CODE (dataref_ptr) == SSA_NAME)
-		      set_ptr_info_alignment (get_ptr_info (dataref_ptr),
-					      align, misalign);
-		    align = least_bit_hwi (misalign | align);
-
-		    /* Compute IFN when LOOP_LENS or final_mask valid.  */
-		    machine_mode vmode = TYPE_MODE (vectype);
-		    machine_mode new_vmode = vmode;
-		    internal_fn partial_ifn = IFN_LAST;
-		    if (loop_lens)
+		      call
+			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
+						      dataref_ptr, vec_offset,
+						      scale, zero);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+		    break;
+		  }
+		else if (memory_access_type == VMAT_GATHER_SCATTER)
+		  {
+		    /* Emulated gather-scatter.  */
+		    gcc_assert (!final_mask);
+		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+		    if (costing_p)
 		      {
-			opt_machine_mode new_ovmode
-			  = get_len_load_store_mode (vmode, true,
-						     &partial_ifn);
-			new_vmode = new_ovmode.require ();
-			unsigned factor = (new_ovmode == vmode)
-					    ? 1
-					    : GET_MODE_UNIT_SIZE (vmode);
-			final_len
-			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-					       vec_num * ncopies, vectype,
-					       vec_num * j + i, factor);
+			/* For emulated gathers N offset vector element
+			   offset add is consumed by the load).  */
+			inside_cost
+			  = record_stmt_cost (cost_vec, const_nunits,
+					      vec_to_scalar, stmt_info, 0,
+					      vect_body);
+			/* N scalar loads plus gathering them into a
+			   vector.  */
+			inside_cost = record_stmt_cost (cost_vec, const_nunits,
+							scalar_load, stmt_info,
+							0, vect_body);
+			inside_cost
+			  = record_stmt_cost (cost_vec, 1, vec_construct,
+					      stmt_info, 0, vect_body);
+			break;
 		      }
-		    else if (final_mask)
+		    unsigned HOST_WIDE_INT const_offset_nunits
+		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+			  .to_constant ();
+		    vec<constructor_elt, va_gc> *ctor_elts;
+		    vec_alloc (ctor_elts, const_nunits);
+		    gimple_seq stmts = NULL;
+		    /* We support offset vectors with more elements
+		       than the data vector for now.  */
+		    unsigned HOST_WIDE_INT factor
+		      = const_offset_nunits / const_nunits;
+		    vec_offset = vec_offsets[j / factor];
+		    unsigned elt_offset = (j % factor) * const_nunits;
+		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+		    tree scale = size_int (gs_info.scale);
+		    align = get_object_alignment (DR_REF (first_dr_info->dr));
+		    tree ltype
+		      = build_aligned_type (TREE_TYPE (vectype), align);
+		    for (unsigned k = 0; k < const_nunits; ++k)
 		      {
-			if (!can_vec_mask_load_store_p (
-			      vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
-			      &partial_ifn))
-			  gcc_unreachable ();
+			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+						bitsize_int (k + elt_offset));
+			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
+						 idx_type, vec_offset,
+						 TYPE_SIZE (idx_type), boff);
+			idx = gimple_convert (&stmts, sizetype, idx);
+			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
+					    scale);
+			tree ptr = gimple_build (&stmts, PLUS_EXPR,
+						 TREE_TYPE (dataref_ptr),
+						 dataref_ptr, idx);
+			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+			tree elt = make_ssa_name (TREE_TYPE (vectype));
+			tree ref = build2 (MEM_REF, ltype, ptr,
+					   build_int_cst (ref_type, 0));
+			new_stmt = gimple_build_assign (elt, ref);
+			gimple_set_vuse (new_stmt,
+					 gimple_vuse (gsi_stmt (*gsi)));
+			gimple_seq_add_stmt (&stmts, new_stmt);
+			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
 		      }
+		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+		    new_stmt = gimple_build_assign (
+		      NULL_TREE, build_constructor (vectype, ctor_elts));
+		    data_ref = NULL_TREE;
+		    break;
+		  }

-		    if (partial_ifn == IFN_MASK_LEN_LOAD)
+		if (costing_p)
+		  break;
+
+		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
+		if (alignment_support_scheme == dr_aligned)
+		  misalign = 0;
+		else if (misalignment == DR_MISALIGNMENT_UNKNOWN)
+		  {
+		    align
+		      = dr_alignment (vect_dr_behavior (vinfo, first_dr_info));
+		    misalign = 0;
+		  }
+		else
+		  misalign = misalignment;
+		if (dataref_offset == NULL_TREE
+		    && TREE_CODE (dataref_ptr) == SSA_NAME)
+		  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
+					  misalign);
+		align = least_bit_hwi (misalign | align);
+
+		/* Compute IFN when LOOP_LENS or final_mask valid.  */
+		machine_mode vmode = TYPE_MODE (vectype);
+		machine_mode new_vmode = vmode;
+		internal_fn partial_ifn = IFN_LAST;
+		if (loop_lens)
+		  {
+		    opt_machine_mode new_ovmode
+		      = get_len_load_store_mode (vmode, true, &partial_ifn);
+		    new_vmode = new_ovmode.require ();
+		    unsigned factor
+		      = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
+		    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						   vec_num * ncopies, vectype,
+						   vec_num * j + i, factor);
+		  }
+		else if (final_mask)
+		  {
+		    if (!can_vec_mask_load_store_p (
+			  vmode, TYPE_MODE (TREE_TYPE (final_mask)), true,
+			  &partial_ifn))
+		      gcc_unreachable ();
+		  }
+
+		if (partial_ifn == IFN_MASK_LEN_LOAD)
+		  {
+		    if (!final_len)
 		      {
-			if (!final_len)
-			  {
-			    /* Pass VF value to 'len' argument of
-			       MASK_LEN_LOAD if LOOP_LENS is invalid.  */
-			    final_len
-			      = size_int (TYPE_VECTOR_SUBPARTS (vectype));
-			  }
-			if (!final_mask)
-			  {
-			    /* Pass all ones value to 'mask' argument of
-			       MASK_LEN_LOAD if final_mask is invalid.  */
-			    mask_vectype = truth_type_for (vectype);
-			    final_mask = build_minus_one_cst (mask_vectype);
-			  }
+			/* Pass VF value to 'len' argument of
+			   MASK_LEN_LOAD if LOOP_LENS is invalid.  */
+			final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
 		      }
-		    if (final_len)
+		    if (!final_mask)
 		      {
-			signed char biasval
-			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-
-			bias = build_int_cst (intQI_type_node, biasval);
+			/* Pass all ones value to 'mask' argument of
+			   MASK_LEN_LOAD if final_mask is invalid.  */
+			mask_vectype = truth_type_for (vectype);
+			final_mask = build_minus_one_cst (mask_vectype);
 		      }
+		  }
+		if (final_len)
+		  {
+		    signed char biasval
+		      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);

-		    if (final_len)
+		    bias = build_int_cst (intQI_type_node, biasval);
+		  }
+
+		if (final_len)
+		  {
+		    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+		    gcall *call;
+		    if (partial_ifn == IFN_MASK_LEN_LOAD)
+		      call = gimple_build_call_internal (IFN_MASK_LEN_LOAD, 5,
+							 dataref_ptr, ptr,
+							 final_mask, final_len,
+							 bias);
+		    else
+		      call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
+							 dataref_ptr, ptr,
+							 final_len, bias);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+
+		    /* Need conversion if it's wrapped with VnQI.  */
+		    if (vmode != new_vmode)
 		      {
-			tree ptr
-			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
-			gcall *call;
-			if (partial_ifn == IFN_MASK_LEN_LOAD)
-			  call = gimple_build_call_internal (IFN_MASK_LEN_LOAD,
-							     5, dataref_ptr,
-							     ptr, final_mask,
-							     final_len, bias);
-			else
-			  call = gimple_build_call_internal (IFN_LEN_LOAD, 4,
-							     dataref_ptr, ptr,
-							     final_len, bias);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
-
-			/* Need conversion if it's wrapped with VnQI.  */
-			if (vmode != new_vmode)
-			  {
-			    tree new_vtype = build_vector_type_for_mode (
-			      unsigned_intQI_type_node, new_vmode);
-			    tree var = vect_get_new_ssa_name (new_vtype,
-							      vect_simple_var);
-			    gimple_set_lhs (call, var);
-			    vect_finish_stmt_generation (vinfo, stmt_info, call,
-							 gsi);
-			    tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
-			    new_stmt
-			      = gimple_build_assign (vec_dest,
-						     VIEW_CONVERT_EXPR, op);
-			  }
+			tree new_vtype = build_vector_type_for_mode (
+			  unsigned_intQI_type_node, new_vmode);
+			tree var
+			  = vect_get_new_ssa_name (new_vtype, vect_simple_var);
+			gimple_set_lhs (call, var);
+			vect_finish_stmt_generation (vinfo, stmt_info, call,
+						     gsi);
+			tree op = build1 (VIEW_CONVERT_EXPR, vectype, var);
+			new_stmt = gimple_build_assign (vec_dest,
+							VIEW_CONVERT_EXPR, op);
 		      }
-		    else if (final_mask)
+		  }
+		else if (final_mask)
+		  {
+		    tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+		    gcall *call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+							      dataref_ptr, ptr,
+							      final_mask);
+		    gimple_call_set_nothrow (call, true);
+		    new_stmt = call;
+		    data_ref = NULL_TREE;
+		  }
+		else
+		  {
+		    tree ltype = vectype;
+		    tree new_vtype = NULL_TREE;
+		    unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
+		    unsigned int vect_align
+		      = vect_known_alignment_in_bytes (first_dr_info, vectype);
+		    unsigned int scalar_dr_size
+		      = vect_get_scalar_dr_size (first_dr_info);
+		    /* If there's no peeling for gaps but we have a gap
+		       with slp loads then load the lower half of the
+		       vector only.  See get_group_load_store_type for
+		       when we apply this optimization.  */
+		    if (slp
+			&& loop_vinfo
+			&& !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && gap != 0
+			&& known_eq (nunits, (group_size - gap) * 2)
+			&& known_eq (nunits, group_size)
+			&& gap >= (vect_align / scalar_dr_size))
 		      {
-			tree ptr = build_int_cst (ref_type,
-						  align * BITS_PER_UNIT);
-			gcall *call
-			  = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-							dataref_ptr, ptr,
-							final_mask);
-			gimple_call_set_nothrow (call, true);
-			new_stmt = call;
-			data_ref = NULL_TREE;
+			tree half_vtype;
+			new_vtype
+			  = vector_vector_composition_type (vectype, 2,
+							    &half_vtype);
+			if (new_vtype != NULL_TREE)
+			  ltype = half_vtype;
 		      }
+		    tree offset
+		      = (dataref_offset ? dataref_offset
+					: build_int_cst (ref_type, 0));
+		    if (ltype != vectype
+			&& memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+		      {
+			unsigned HOST_WIDE_INT gap_offset
+			  = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
+			tree gapcst = build_int_cst (ref_type, gap_offset);
+			offset = size_binop (PLUS_EXPR, offset, gapcst);
+		      }
+		    data_ref
+		      = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
+		    if (alignment_support_scheme == dr_aligned)
+		      ;
 		    else
+		      TREE_TYPE (data_ref)
+			= build_aligned_type (TREE_TYPE (data_ref),
+					      align * BITS_PER_UNIT);
+		    if (ltype != vectype)
 		      {
-			tree ltype = vectype;
-			tree new_vtype = NULL_TREE;
-			unsigned HOST_WIDE_INT gap
-			  = DR_GROUP_GAP (first_stmt_info);
-			unsigned int vect_align
-			  = vect_known_alignment_in_bytes (first_dr_info,
-							   vectype);
-			unsigned int scalar_dr_size
-			  = vect_get_scalar_dr_size (first_dr_info);
-			/* If there's no peeling for gaps but we have a gap
-			   with slp loads then load the lower half of the
-			   vector only.  See get_group_load_store_type for
-			   when we apply this optimization.  */
-			if (slp
-			    && loop_vinfo
-			    && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-			    && gap != 0
-			    && known_eq (nunits, (group_size - gap) * 2)
-			    && known_eq (nunits, group_size)
-			    && gap >= (vect_align / scalar_dr_size))
+			vect_copy_ref_info (data_ref,
+					    DR_REF (first_dr_info->dr));
+			tree tem = make_ssa_name (ltype);
+			new_stmt = gimple_build_assign (tem, data_ref);
+			vect_finish_stmt_generation (vinfo, stmt_info, new_stmt,
+						     gsi);
+			data_ref = NULL;
+			vec<constructor_elt, va_gc> *v;
+			vec_alloc (v, 2);
+			if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
 			  {
-			    tree half_vtype;
-			    new_vtype
-			      = vector_vector_composition_type (vectype, 2,
-								&half_vtype);
-			    if (new_vtype != NULL_TREE)
-			      ltype = half_vtype;
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
 			  }
-			tree offset
-			  = (dataref_offset ? dataref_offset
-					    : build_int_cst (ref_type, 0));
-			if (ltype != vectype
-			    && memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+			else
 			  {
-			    unsigned HOST_WIDE_INT gap_offset
-			      = gap * tree_to_uhwi (TYPE_SIZE_UNIT (elem_type));
-			    tree gapcst = build_int_cst (ref_type, gap_offset);
-			    offset = size_binop (PLUS_EXPR, offset, gapcst);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+			    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+						    build_zero_cst (ltype));
 			  }
-			data_ref
-			  = fold_build2 (MEM_REF, ltype, dataref_ptr, offset);
-			if (alignment_support_scheme == dr_aligned)
-			  ;
+			gcc_assert (new_vtype != NULL_TREE);
+			if (new_vtype == vectype)
+			  new_stmt = gimple_build_assign (
+			    vec_dest, build_constructor (vectype, v));
 			else
-			  TREE_TYPE (data_ref)
-			    = build_aligned_type (TREE_TYPE (data_ref),
-						  align * BITS_PER_UNIT);
-			if (ltype != vectype)
 			  {
-			    vect_copy_ref_info (data_ref,
-						DR_REF (first_dr_info->dr));
-			    tree tem = make_ssa_name (ltype);
-			    new_stmt = gimple_build_assign (tem, data_ref);
+			    tree new_vname = make_ssa_name (new_vtype);
+			    new_stmt = gimple_build_assign (
+			      new_vname, build_constructor (new_vtype, v));
 			    vect_finish_stmt_generation (vinfo, stmt_info,
 							 new_stmt, gsi);
-			    data_ref = NULL;
-			    vec<constructor_elt, va_gc> *v;
-			    vec_alloc (v, 2);
-			    if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
-			      {
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-							build_zero_cst (ltype));
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-			      }
-			    else
-			      {
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
-				CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
-							build_zero_cst (ltype));
-			      }
-			    gcc_assert (new_vtype != NULL_TREE);
-			    if (new_vtype == vectype)
-			      new_stmt = gimple_build_assign (
-				vec_dest, build_constructor (vectype, v));
-			    else
-			      {
-				tree new_vname = make_ssa_name (new_vtype);
-				new_stmt = gimple_build_assign (
-				  new_vname, build_constructor (new_vtype, v));
-				vect_finish_stmt_generation (vinfo, stmt_info,
-							     new_stmt, gsi);
-				new_stmt = gimple_build_assign (
-				  vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
-						    new_vname));
-			      }
+			    new_stmt = gimple_build_assign (
+			      vec_dest,
+			      build1 (VIEW_CONVERT_EXPR, vectype, new_vname));
 			  }
 		      }
-		    break;
 		  }
-		case dr_explicit_realign:
-		  {
-		    if (costing_p)
-		      break;
-		    tree ptr, bump;
-
-		    tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+		break;
+	      }
+	    case dr_explicit_realign:
+	      {
+		if (costing_p)
+		  break;
+		tree ptr, bump;

-		    if (compute_in_loop)
-		      msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
-						    &realignment_token,
-						    dr_explicit_realign,
-						    dataref_ptr, NULL);
+		tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));

-		    if (TREE_CODE (dataref_ptr) == SSA_NAME)
-		      ptr = copy_ssa_name (dataref_ptr);
-		    else
-		      ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
-		    // For explicit realign the target alignment should be
-		    // known at compile time.
-		    unsigned HOST_WIDE_INT align =
-		      DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
-		    new_stmt = gimple_build_assign
-				 (ptr, BIT_AND_EXPR, dataref_ptr,
-				  build_int_cst
-				  (TREE_TYPE (dataref_ptr),
-				   -(HOST_WIDE_INT) align));
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, ptr,
-				build_int_cst (ref_type, 0));
-		    vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
-		    vec_dest = vect_create_destination_var (scalar_dest,
-							    vectype);
-		    new_stmt = gimple_build_assign (vec_dest, data_ref);
-		    new_temp = make_ssa_name (vec_dest, new_stmt);
-		    gimple_assign_set_lhs (new_stmt, new_temp);
-		    gimple_move_vops (new_stmt, stmt_info->stmt);
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    msq = new_temp;
-
-		    bump = size_binop (MULT_EXPR, vs,
-				       TYPE_SIZE_UNIT (elem_type));
-		    bump = size_binop (MINUS_EXPR, bump, size_one_node);
-		    ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi,
-					   stmt_info, bump);
-		    new_stmt = gimple_build_assign
-				 (NULL_TREE, BIT_AND_EXPR, ptr,
-				  build_int_cst
-				  (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
-		    if (TREE_CODE (ptr) == SSA_NAME)
-		      ptr = copy_ssa_name (ptr, new_stmt);
-		    else
-		      ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
-		    gimple_assign_set_lhs (new_stmt, ptr);
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, ptr,
-				build_int_cst (ref_type, 0));
-		    break;
-		  }
-		case dr_explicit_realign_optimized:
-		  {
-		    if (costing_p)
-		      break;
-		    if (TREE_CODE (dataref_ptr) == SSA_NAME)
-		      new_temp = copy_ssa_name (dataref_ptr);
-		    else
-		      new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
-		    // We should only be doing this if we know the target
-		    // alignment at compile time.
-		    unsigned HOST_WIDE_INT align =
-		      DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
-		    new_stmt = gimple_build_assign
-		      (new_temp, BIT_AND_EXPR, dataref_ptr,
-		       build_int_cst (TREE_TYPE (dataref_ptr),
-				     -(HOST_WIDE_INT) align));
-		    vect_finish_stmt_generation (vinfo, stmt_info,
-						 new_stmt, gsi);
-		    data_ref
-		      = build2 (MEM_REF, vectype, new_temp,
-				build_int_cst (ref_type, 0));
-		    break;
-		  }
-		default:
-		  gcc_unreachable ();
-		}
+		if (compute_in_loop)
+		  msq = vect_setup_realignment (vinfo, first_stmt_info, gsi,
+						&realignment_token,
+						dr_explicit_realign,
+						dataref_ptr, NULL);
+
+		if (TREE_CODE (dataref_ptr) == SSA_NAME)
+		  ptr = copy_ssa_name (dataref_ptr);
+		else
+		  ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
+		// For explicit realign the target alignment should be
+		// known at compile time.
+		unsigned HOST_WIDE_INT align
+		  = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+		new_stmt = gimple_build_assign (
+		  ptr, BIT_AND_EXPR, dataref_ptr,
+		  build_int_cst (TREE_TYPE (dataref_ptr),
+				 -(HOST_WIDE_INT) align));
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref
+		  = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+		vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		vec_dest = vect_create_destination_var (scalar_dest, vectype);
+		new_stmt = gimple_build_assign (vec_dest, data_ref);
+		new_temp = make_ssa_name (vec_dest, new_stmt);
+		gimple_assign_set_lhs (new_stmt, new_temp);
+		gimple_move_vops (new_stmt, stmt_info->stmt);
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		msq = new_temp;
+
+		bump = size_binop (MULT_EXPR, vs, TYPE_SIZE_UNIT (elem_type));
+		bump = size_binop (MINUS_EXPR, bump, size_one_node);
+		ptr = bump_vector_ptr (vinfo, dataref_ptr, NULL, gsi, stmt_info,
+				       bump);
+		new_stmt = gimple_build_assign (
+		  NULL_TREE, BIT_AND_EXPR, ptr,
+		  build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
+		if (TREE_CODE (ptr) == SSA_NAME)
+		  ptr = copy_ssa_name (ptr, new_stmt);
+		else
+		  ptr = make_ssa_name (TREE_TYPE (ptr), new_stmt);
+		gimple_assign_set_lhs (new_stmt, ptr);
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref
+		  = build2 (MEM_REF, vectype, ptr, build_int_cst (ref_type, 0));
+		break;
+	      }
+	    case dr_explicit_realign_optimized:
+	      {
+		if (costing_p)
+		  break;
+		if (TREE_CODE (dataref_ptr) == SSA_NAME)
+		  new_temp = copy_ssa_name (dataref_ptr);
+		else
+		  new_temp = make_ssa_name (TREE_TYPE (dataref_ptr));
+		// We should only be doing this if we know the target
+		// alignment at compile time.
+		unsigned HOST_WIDE_INT align
+		  = DR_TARGET_ALIGNMENT (first_dr_info).to_constant ();
+		new_stmt = gimple_build_assign (
+		  new_temp, BIT_AND_EXPR, dataref_ptr,
+		  build_int_cst (TREE_TYPE (dataref_ptr),
+				 -(HOST_WIDE_INT) align));
+		vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		data_ref = build2 (MEM_REF, vectype, new_temp,
+				   build_int_cst (ref_type, 0));
+		break;
+	      }
+	    default:
+	      gcc_unreachable ();
+	    }

-	      /* One common place to cost the above vect load for different
-		 alignment support schemes.  */
-	      if (costing_p)
-		{
-		  /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
-		     only need to take care of the first stmt, whose
-		     stmt_info is first_stmt_info, vec_num iterating on it
-		     will cover the cost for the remaining, it's consistent
-		     with transforming.  For the prologue cost for realign,
-		     we only need to count it once for the whole group.  */
-		  bool first_stmt_info_p = first_stmt_info == stmt_info;
-		  bool add_realign_cost = first_stmt_info_p && i == 0;
-		  if (memory_access_type == VMAT_CONTIGUOUS
-		      || memory_access_type == VMAT_CONTIGUOUS_REVERSE
-		      || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
-			  && (!grouped_load || first_stmt_info_p)))
-		    vect_get_load_cost (vinfo, stmt_info, 1,
-					alignment_support_scheme, misalignment,
-					add_realign_cost, &inside_cost,
-					&prologue_cost, cost_vec, cost_vec,
-					true);
-		}
-	      else
+	  /* One common place to cost the above vect load for different
+	     alignment support schemes.  */
+	  if (costing_p)
+	    {
+	      /* For VMAT_CONTIGUOUS_PERMUTE if it's grouped load, we
+		 only need to take care of the first stmt, whose
+		 stmt_info is first_stmt_info, vec_num iterating on it
+		 will cover the cost for the remaining, it's consistent
+		 with transforming.  For the prologue cost for realign,
+		 we only need to count it once for the whole group.  */
+	      bool first_stmt_info_p = first_stmt_info == stmt_info;
+	      bool add_realign_cost = first_stmt_info_p && i == 0;
+	      if (memory_access_type == VMAT_CONTIGUOUS
+		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
+		  || (memory_access_type == VMAT_CONTIGUOUS_PERMUTE
+		      && (!grouped_load || first_stmt_info_p)))
+		vect_get_load_cost (vinfo, stmt_info, 1,
+				    alignment_support_scheme, misalignment,
+				    add_realign_cost, &inside_cost,
+				    &prologue_cost, cost_vec, cost_vec, true);
+	    }
+	  else
+	    {
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      /* DATA_REF is null if we've already built the statement.  */
+	      if (data_ref)
 		{
-		  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-		  /* DATA_REF is null if we've already built the statement.  */
-		  if (data_ref)
-		    {
-		      vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
-		      new_stmt = gimple_build_assign (vec_dest, data_ref);
-		    }
-		  new_temp = make_ssa_name (vec_dest, new_stmt);
-		  gimple_set_lhs (new_stmt, new_temp);
-		  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		  new_stmt = gimple_build_assign (vec_dest, data_ref);
 		}
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	    }

-	      /* 3. Handle explicit realignment if necessary/supported.
-		 Create in loop:
-		   vec_dest = realign_load (msq, lsq, realignment_token)  */
-	      if (!costing_p
-		  && (alignment_support_scheme == dr_explicit_realign_optimized
-		      || alignment_support_scheme == dr_explicit_realign))
-		{
-		  lsq = gimple_assign_lhs (new_stmt);
-		  if (!realignment_token)
-		    realignment_token = dataref_ptr;
-		  vec_dest = vect_create_destination_var (scalar_dest, vectype);
-		  new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR,
-						  msq, lsq, realignment_token);
-		  new_temp = make_ssa_name (vec_dest, new_stmt);
-		  gimple_assign_set_lhs (new_stmt, new_temp);
-		  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	  /* 3. Handle explicit realignment if necessary/supported.
+	     Create in loop:
+	       vec_dest = realign_load (msq, lsq, realignment_token)  */
+	  if (!costing_p
+	      && (alignment_support_scheme == dr_explicit_realign_optimized
+		  || alignment_support_scheme == dr_explicit_realign))
+	    {
+	      lsq = gimple_assign_lhs (new_stmt);
+	      if (!realignment_token)
+		realignment_token = dataref_ptr;
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      new_stmt = gimple_build_assign (vec_dest, REALIGN_LOAD_EXPR, msq,
+					      lsq, realignment_token);
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_assign_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);

-		  if (alignment_support_scheme == dr_explicit_realign_optimized)
-		    {
-		      gcc_assert (phi);
-		      if (i == vec_num - 1 && j == ncopies - 1)
-			add_phi_arg (phi, lsq,
-				     loop_latch_edge (containing_loop),
-				     UNKNOWN_LOCATION);
-		      msq = lsq;
-		    }
+	      if (alignment_support_scheme == dr_explicit_realign_optimized)
+		{
+		  gcc_assert (phi);
+		  if (i == vec_num - 1 && j == ncopies - 1)
+		    add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
+				 UNKNOWN_LOCATION);
+		  msq = lsq;
 		}
+	    }

-	      if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+	  if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+	    {
+	      if (costing_p)
+		inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
+						stmt_info, 0, vect_body);
+	      else
 		{
-		  if (costing_p)
-		    inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-						    stmt_info, 0, vect_body);
-		  else
-		    {
-		      tree perm_mask = perm_mask_for_reverse (vectype);
-		      new_temp
-			= permute_vec_elements (vinfo, new_temp, new_temp,
-						perm_mask, stmt_info, gsi);
-		      new_stmt = SSA_NAME_DEF_STMT (new_temp);
-		    }
+		  tree perm_mask = perm_mask_for_reverse (vectype);
+		  new_temp = permute_vec_elements (vinfo, new_temp, new_temp,
+						   perm_mask, stmt_info, gsi);
+		  new_stmt = SSA_NAME_DEF_STMT (new_temp);
 		}
+	    }

-	      /* Collect vector loads and later create their permutation in
-		 vect_transform_grouped_load ().  */
-	      if (!costing_p && (grouped_load || slp_perm))
-		dr_chain.quick_push (new_temp);
+	  /* Collect vector loads and later create their permutation in
+	     vect_transform_grouped_load ().  */
+	  if (!costing_p && (grouped_load || slp_perm))
+	    dr_chain.quick_push (new_temp);

-	      /* Store vector loads in the corresponding SLP_NODE.  */
-	      if (!costing_p && slp && !slp_perm)
-		slp_node->push_vec_def (new_stmt);
+	  /* Store vector loads in the corresponding SLP_NODE.  */
+	  if (!costing_p && slp && !slp_perm)
+	    slp_node->push_vec_def (new_stmt);

-	      /* With SLP permutation we load the gaps as well, without
-	         we need to skip the gaps after we manage to fully load
-		 all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
-	      group_elt += nunits;
-	      if (!costing_p
-		  && maybe_ne (group_gap_adj, 0U)
-		  && !slp_perm
-		  && known_eq (group_elt, group_size - group_gap_adj))
-		{
-		  poly_wide_int bump_val
-		    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
-		       * group_gap_adj);
-		  if (tree_int_cst_sgn
-			(vect_dr_behavior (vinfo, dr_info)->step) == -1)
-		    bump_val = -bump_val;
-		  tree bump = wide_int_to_tree (sizetype, bump_val);
-		  dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-						 gsi, stmt_info, bump);
-		  group_elt = 0;
-		}
-	    }
-	  /* Bump the vector pointer to account for a gap or for excess
-	     elements loaded for a permuted SLP load.  */
+	  /* With SLP permutation we load the gaps as well, without
+	     we need to skip the gaps after we manage to fully load
+	     all elements.  group_gap_adj is DR_GROUP_SIZE here.  */
+	  group_elt += nunits;
 	  if (!costing_p
 	      && maybe_ne (group_gap_adj, 0U)
-	      && slp_perm)
+	      && !slp_perm
+	      && known_eq (group_elt, group_size - group_gap_adj))
 	    {
 	      poly_wide_int bump_val
-		= (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
-		   * group_gap_adj);
-	      if (tree_int_cst_sgn
-		    (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+		= (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+	      if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step)
+		  == -1)
 		bump_val = -bump_val;
 	      tree bump = wide_int_to_tree (sizetype, bump_val);
 	      dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
 					     stmt_info, bump);
+	      group_elt = 0;
 	    }
 	}
+      /* Bump the vector pointer to account for a gap or for excess
+	 elements loaded for a permuted SLP load.  */
+      if (!costing_p
+	  && maybe_ne (group_gap_adj, 0U)
+	  && slp_perm)
+	{
+	  poly_wide_int bump_val
+	    = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
+	  if (tree_int_cst_sgn (vect_dr_behavior (vinfo, dr_info)->step) == -1)
+	    bump_val = -bump_val;
+	  tree bump = wide_int_to_tree (sizetype, bump_val);
+	  dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+					 stmt_info, bump);
+	}

       if (slp && !slp_perm)
 	continue;
@@ -11120,39 +11117,36 @@ vectorizable_load (vec_info *vinfo,
 	    }
 	}
       else
-        {
-          if (grouped_load)
-  	    {
-	      if (memory_access_type != VMAT_LOAD_STORE_LANES)
+	{
+	  if (grouped_load)
+	    {
+	      gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
+	      /* We assume that the cost of a single load-lanes instruction
+		 is equivalent to the cost of DR_GROUP_SIZE separate loads.
+		 If a grouped access is instead being provided by a
+		 load-and-permute operation, include the cost of the
+		 permutes.  */
+	      if (costing_p && first_stmt_info == stmt_info)
 		{
-		  gcc_assert (memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
-		  /* We assume that the cost of a single load-lanes instruction
-		     is equivalent to the cost of DR_GROUP_SIZE separate loads.
-		     If a grouped access is instead being provided by a
-		     load-and-permute operation, include the cost of the
-		     permutes.  */
-		  if (costing_p && first_stmt_info == stmt_info)
-		    {
-		      /* Uses an even and odd extract operations or shuffle
-			 operations for each needed permute.  */
-		      int group_size = DR_GROUP_SIZE (first_stmt_info);
-		      int nstmts = ceil_log2 (group_size) * group_size;
-		      inside_cost
-			+= record_stmt_cost (cost_vec, nstmts, vec_perm,
-					     stmt_info, 0, vect_body);
+		  /* Uses an even and odd extract operations or shuffle
+		     operations for each needed permute.  */
+		  int group_size = DR_GROUP_SIZE (first_stmt_info);
+		  int nstmts = ceil_log2 (group_size) * group_size;
+		  inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
+						   stmt_info, 0, vect_body);

-		      if (dump_enabled_p ())
-			dump_printf_loc (
-			  MSG_NOTE, vect_location,
-			  "vect_model_load_cost: strided group_size = %d .\n",
-			  group_size);
-		    }
-		  else if (!costing_p)
-		    vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
-						 group_size, gsi);
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "vect_model_load_cost:"
+				     "strided group_size = %d .\n",
+				     group_size);
+		}
+	      else if (!costing_p)
+		{
+		  vect_transform_grouped_load (vinfo, stmt_info, dr_chain,
+					       group_size, gsi);
+		  *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 		}
-	      if (!costing_p)
-		*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 	    }
 	  else if (!costing_p)
 	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
@@ -11166,7 +11160,8 @@ vectorizable_load (vec_info *vinfo,
     {
       gcc_assert (memory_access_type != VMAT_INVARIANT
 		  && memory_access_type != VMAT_ELEMENTWISE
-		  && memory_access_type != VMAT_STRIDED_SLP);
+		  && memory_access_type != VMAT_STRIDED_SLP
+		  && memory_access_type != VMAT_LOAD_STORE_LANES);
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "vect_model_load_cost: inside_cost = %u, "